Transformers Source Code Analysis (Part 82)
.\models\nllb_moe\modeling_nllb_moe.py
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
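# Usage sketch (illustrative, not part of modeling_nllb_moe.py): how `shift_tokens_right`
# builds decoder inputs from labels, replacing -100 with the pad token. Token ids are made up.
import torch
from transformers.models.nllb_moe.modeling_nllb_moe import shift_tokens_right

labels = torch.tensor([[5, 6, -100, -100]])          # -100 marks ignored label positions
decoder_input_ids = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
print(decoder_input_ids)                             # tensor([[2, 5, 6, 1]])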
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
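# Worked example (illustrative, not part of the original file): with padding_idx = 1, real
# tokens get positions 2, 3, 4, ... while padding positions stay at the padding index.
import torch
from transformers.models.nllb_moe.modeling_nllb_moe import create_position_ids_from_input_ids

input_ids = torch.tensor([[7, 8, 9, 1, 1]])          # 1 is the pad token
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 1, 1]])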
def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> float:
if router_probs is None:
return 0
num_experts = router_probs.shape[-1]
if expert_indices.dtype != torch.int64:
expert_indices = expert_indices.to(torch.int64)
if len(expert_indices.shape) == 2:
expert_indices = expert_indices.unsqueeze(2)
expert_mask = torch.nn.functional.one_hot(expert_indices, num_experts)
expert_mask = torch.max(expert_mask, axis=-2).values
expert_mask = expert_mask.to(torch.float32)
tokens_per_group_and_expert = torch.mean(expert_mask, axis=-2)
router_prob_per_group_and_expert = torch.mean(router_probs, axis=-2)
return torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert) * (num_experts**2)
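# Illustrative sketch (not part of the original file): computing the auxiliary
# load-balancing loss on random router probabilities. Shapes are example values; inside
# the model this loss is scaled by `router_aux_loss_coef`.
import torch
from transformers.models.nllb_moe.modeling_nllb_moe import load_balancing_loss_func

batch_size, seq_len, num_experts = 2, 10, 4
router_probs = torch.softmax(torch.randn(batch_size, seq_len, num_experts), dim=-1)
expert_indices = router_probs.argmax(dim=-1)         # (batch_size, seq_len)
aux_loss = load_balancing_loss_func(router_probs, expert_indices)
print(aux_loss)                                      # scalar tensor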
class NllbMoeSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.register_buffer("weights", emb_weights, persistent=False)
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings.
This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
"Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(
self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
):
if input_ids is not None:
bsz, seq_len = input_ids.size()
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
else:
bsz, seq_len = inputs_embeds.size()[:-1]
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
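# Quick check (illustrative, not part of the original file): the module returns one
# sinusoidal vector per position, with positions derived from `input_ids` so that padding
# keeps a constant embedding.
import torch
from transformers.models.nllb_moe.modeling_nllb_moe import NllbMoeSinusoidalPositionalEmbedding

emb = NllbMoeSinusoidalPositionalEmbedding(num_positions=64, embedding_dim=16, padding_idx=1)
input_ids = torch.tensor([[5, 6, 7, 1]])             # 1 is the pad token
print(emb(input_ids).shape)                          # torch.Size([1, 4, 16])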
class NllbMoeTop2Router(nn.Module):
"""
Router using tokens choose top-2 experts assignment.
This router uses the same mechanism as in NLLB-MoE from the fairseq repository. Items are sorted by router_probs
and then routed to their choice of expert until the expert's expert_capacity is reached. **There is no guarantee
that each token is processed by an expert**, or that each expert receives at least one token.
The router combining weights are also returned to make sure that the states that are not updated will be masked.
"""
def __init__(self, config: NllbMoeConfig):
super().__init__()
self.num_experts = config.num_experts
self.expert_capacity = config.expert_capacity
self.classifier = nn.Linear(config.hidden_size, self.num_experts, bias=config.router_bias)
self.router_ignore_padding_tokens = config.router_ignore_padding_tokens
self.dtype = getattr(torch, config.router_dtype)
self.second_expert_policy = config.second_expert_policy
self.normalize_router_prob_before_dropping = config.normalize_router_prob_before_dropping
self.batch_prioritized_routing = config.batch_prioritized_routing
self.moe_eval_capacity_token_fraction = config.moe_eval_capacity_token_fraction
def _cast_classifier(self):
r"""
`bitsandbytes` `Linear8bitLt` layers do not support manual casting. Therefore we need to check if they are an
instance of the `Linear8bitLt` class by checking special attributes.
"""
if not (hasattr(self.classifier, "SCB") or hasattr(self.classifier, "CB")):
self.classifier = self.classifier.to(self.dtype)
def normalize_router_probabilities(self, router_probs, top_1_mask, top_2_mask):
top_1_max_probs = (router_probs * top_1_mask).sum(dim=1)
top_2_max_probs = (router_probs * top_2_mask).sum(dim=1)
denom_s = torch.clamp(top_1_max_probs + top_2_max_probs, min=torch.finfo(router_probs.dtype).eps)
top_1_max_probs = top_1_max_probs / denom_s
top_2_max_probs = top_2_max_probs / denom_s
return top_1_max_probs, top_2_max_probs
def route_tokens(
self,
router_logits: torch.Tensor,
input_dtype: torch.dtype = torch.float32,
padding_mask: Optional[torch.LongTensor] = None,
) -> Tuple:
# ... (body omitted in this excerpt: applies the capacity-constrained top-2 assignment
# and returns `(top_1_mask, router_probs)`)
...
def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.LongTensor] = None) -> Tuple:
r"""
The hidden states are reshaped to simplify the computation of the router probabilities (combining weights for
each experts.)
隐藏状态被重新整形,以简化路由概率的计算(结合每个专家的权重)。
Args:
hidden_states (`torch.Tensor`):
(batch_size, sequence_length, hidden_dim) from which router probabilities are computed.
用于计算路由概率的隐藏状态张量,形状为(batch_size, sequence_length, hidden_dim)。
Returns:
top_1_mask (`torch.Tensor` of shape (batch_size, sequence_length)):
Index tensor of shape [batch_size, sequence_length] corresponding to the expert selected for each token
using the top1 probabilities of the router.
形状为(batch_size, sequence_length)的索引张量,每个令牌对应的专家选择索引,使用路由器的top1概率。
router_probabilities (`torch.Tensor` of shape (batch_size, sequence_length, nump_experts)):
Tensor of shape (batch_size, sequence_length, num_experts) corresponding to the probabilities for each
token and expert. Used for routing tokens to experts.
形状为(batch_size, sequence_length, num_experts)的张量,每个令牌和专家的概率值,用于将令牌路由到专家。
router_logits (`torch.Tensor` of shape (batch_size, sequence_length))):
Logits tensor of shape (batch_size, sequence_length, num_experts) corresponding to raw router logits.
This is used later for computing router z-loss.
形状为(batch_size, sequence_length, num_experts)的原始路由器logits张量,用于后续计算路由器的z-loss。
"""
self.input_dtype = hidden_states.dtype
batch_size, sequence_length, hidden_dim = hidden_states.shape
hidden_states = hidden_states.reshape((batch_size * sequence_length), hidden_dim)
hidden_states = hidden_states.to(self.dtype)
self._cast_classifier()
router_logits = self.classifier(hidden_states)
top_1_mask, router_probs = self.route_tokens(router_logits, self.input_dtype, padding_mask)
return top_1_mask, router_probs
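# Usage sketch (illustrative, not part of the original file): running the router on random
# hidden states. The small hyper-parameters are illustrative, not the released checkpoint's.
import torch
from transformers import NllbMoeConfig
from transformers.models.nllb_moe.modeling_nllb_moe import NllbMoeTop2Router

config = NllbMoeConfig(d_model=32, num_experts=4, expert_capacity=8)
router = NllbMoeTop2Router(config)
hidden_states = torch.randn(2, 10, config.d_model)   # (batch, seq_len, hidden_dim)
top_1_mask, router_probs = router(hidden_states)
print(top_1_mask.shape, router_probs.shape)          # both (batch * seq_len, num_experts)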
class NllbMoeDenseActDense(nn.Module):
def __init__(self, config: NllbMoeConfig, ffn_dim: int):
super().__init__()
self.fc1 = nn.Linear(config.d_model, ffn_dim)
self.fc2 = nn.Linear(ffn_dim, config.d_model)
self.dropout = nn.Dropout(config.activation_dropout)
self.act = ACT2FN[config.activation_function]
def forward(self, hidden_states):
hidden_states = self.fc1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.fc2.weight, torch.Tensor)
and hidden_states.dtype != self.fc2.weight.dtype
and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8)
):
hidden_states = hidden_states.to(self.fc2.weight.dtype)
hidden_states = self.fc2(hidden_states)
return hidden_states
class NllbMoeSparseMLP(nn.Module):
def __init__(self, config: NllbMoeConfig, ffn_dim: int, expert_class: nn.Module = NllbMoeDenseActDense):
super().__init__()
self.router = NllbMoeTop2Router(config)
self.moe_token_dropout = config.moe_token_dropout
self.token_dropout = nn.Dropout(self.moe_token_dropout)
self.num_experts = config.num_experts
self.experts = nn.ModuleDict()
for idx in range(self.num_experts):
self.experts[f"expert_{idx}"] = expert_class(config, ffn_dim)
def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = False):
r"""
The goal of this forward pass is to have the same number of operations as the equivalent `NllbMoeDenseActDense`
(mlp) layer. This means that all of the hidden states should be processed at most twice (since we are using a
top-2 gating mechanism). This means that we keep the complexity to O(batch_size x sequence_length x hidden_dim)
instead of O(num_experts x batch_size x sequence_length x hidden_dim).
1- Get the `router_probs` from the `router`. The shape of the `router_mask` is `(batch_size X sequence_length,
num_expert)` and corresponds to the boolean version of the `router_probs`. The inputs are masked using the
`router_mask`.
"""
batch_size, sequence_length, hidden_dim = hidden_states.shape
top_1_mask, router_probs = self.router(hidden_states, padding_mask)
router_mask = router_probs.bool()
hidden_states = hidden_states.reshape((batch_size * sequence_length), hidden_dim)
masked_hidden_states = torch.einsum("bm,be->ebm", hidden_states, router_mask)
for idx, expert in enumerate(self.experts.values()):
token_indices = router_mask[:, idx]
combining_weights = router_probs[token_indices, idx]
expert_output = expert(masked_hidden_states[idx, token_indices])
if self.moe_token_dropout > 0:
if self.training:
expert_output = self.token_dropout(expert_output)
else:
expert_output *= 1 - self.moe_token_dropout
masked_hidden_states[idx, token_indices] = torch.einsum("b,be->be", combining_weights, expert_output)
hidden_states = masked_hidden_states.sum(dim=0).reshape(batch_size, sequence_length, hidden_dim)
top_1_expert_index = torch.argmax(top_1_mask, dim=-1)
return hidden_states, (router_probs, top_1_expert_index)
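# Usage sketch (illustrative, not part of the original file): the sparse MoE feed-forward
# block keeps the input shape while routing each token to at most two experts. Config
# values below are illustrative.
import torch
from transformers import NllbMoeConfig
from transformers.models.nllb_moe.modeling_nllb_moe import NllbMoeSparseMLP

config = NllbMoeConfig(d_model=32, num_experts=4, expert_capacity=8, encoder_ffn_dim=64)
sparse_mlp = NllbMoeSparseMLP(config, ffn_dim=config.encoder_ffn_dim)
hidden_states = torch.randn(2, 10, config.d_model)
output, (router_probs, top_1_expert_index) = sparse_mlp(hidden_states)
print(output.shape)                                  # torch.Size([2, 10, 32]), same as the input
print(top_1_expert_index.shape)                      # torch.Size([20]), one expert id per token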
class NllbMoeAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[NllbMoeConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# ... (multi-head attention computation omitted in this excerpt)
...
class NllbMoeEncoderLayer(nn.Module):
def __init__(self, config: NllbMoeConfig, is_sparse: bool = False):
super().__init__()
self.embed_dim = config.d_model
self.is_sparse = is_sparse
self.self_attn = NllbMoeAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.attn_dropout = nn.Dropout(config.dropout)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
if not self.is_sparse:
self.ffn = NllbMoeDenseActDense(config, ffn_dim=config.encoder_ffn_dim)
else:
self.ffn = NllbMoeSparseMLP(config, ffn_dim=config.encoder_ffn_dim)
self.ff_layer_norm = nn.LayerNorm(config.d_model)
self.ff_dropout = nn.Dropout(config.activation_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
layer_head_mask: torch.Tensor,
output_attentions: bool = False,
output_router_logits: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`):
input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`):
attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = self.attn_dropout(hidden_states)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.ff_layer_norm(hidden_states)
if self.is_sparse:
hidden_states, router_states = self.ffn(hidden_states, attention_mask)
else:
hidden_states, router_states = self.ffn(hidden_states), None
hidden_states = self.ff_dropout(hidden_states)
hidden_states = residual + hidden_states
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
if output_router_logits:
outputs += (router_states,)
return outputs
class NllbMoeDecoderLayer(nn.Module):
def __init__(self, config: NllbMoeConfig, is_sparse: bool = False):
super().__init__()
self.embed_dim = config.d_model
self.is_sparse = is_sparse
self.self_attn = NllbMoeAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.attn_dropout = nn.Dropout(config.dropout)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.cross_attention = NllbMoeAttention(
self.embed_dim, config.decoder_attention_heads, config.attention_dropout, is_decoder=True
)
self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim)
if not self.is_sparse:
self.ffn = NllbMoeDenseActDense(config, ffn_dim=config.decoder_ffn_dim)
else:
self.ffn = NllbMoeSparseMLP(config, ffn_dim=config.decoder_ffn_dim)
self.ff_layer_norm = nn.LayerNorm(config.d_model)
self.ff_dropout = nn.Dropout(config.activation_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
output_router_logits: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class NllbMoePreTrainedModel(PreTrainedModel):
config_class = NllbMoeConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["NllbMoeEncoderLayer", "NllbMoeDecoderLayer"]
def _init_weights(self, module):
"""Initialize the weights"""
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
NLLB_MOE_START_DOCSTRING = r"""
Parameters:
config ([`NllbMoeConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
NLLB_MOE_GENERATION_EXAMPLE = r"""
Translation example:
```
>>> from transformers import AutoTokenizer, NllbMoeForConditionalGeneration
>>> model = NllbMoeForConditionalGeneration.from_pretrained("facebook/nllb-moe-54b")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-moe-54b")
>>> text_to_translate = "Life is like a box of chocolates"
>>> model_inputs = tokenizer(text_to_translate, return_tensors="pt")
>>>
>>> gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("eng_Latn"))
>>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
```
"""
NLLB_MOE_INPUTS_DOCSTRING = r"""
"""
class NllbMoeEncoder(NllbMoePreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`NllbMoeEncoderLayer`].
Args:
config:
NllbMoeConfig
embed_tokens (nn.Embedding):
output embedding
"""
def __init__(self, config: NllbMoeConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = NllbMoeSinusoidalPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
self.padding_idx,
)
sparse_step = config.encoder_sparse_step
self.layers = nn.ModuleList()
for i in range(config.encoder_layers):
is_sparse = (i + 1) % sparse_step == 0 if sparse_step > 0 else False
self.layers.append(NllbMoeEncoderLayer(config, is_sparse))
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform forward pass of the NllbMoeEncoder.
Args:
input_ids (Optional[torch.Tensor]): Input token IDs.
attention_mask (Optional[torch.Tensor]): Attention mask.
head_mask (Optional[torch.Tensor]): Head mask for attention computation.
inputs_embeds (Optional[torch.Tensor]): Embedded input tokens.
output_attentions (Optional[bool]): Whether to output attentions.
output_hidden_states (Optional[bool]): Whether to output hidden states.
output_router_logits (Optional[bool]): Whether to output router logits.
return_dict (Optional[bool]): Whether to return a dictionary.
Returns:
Depending on `return_dict`, either a tuple or a dictionary with model outputs.
"""
# ... (full forward implementation omitted in this excerpt)
pass
class NllbMoeDecoder(NllbMoePreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`NllbMoeDecoderLayer`].
"""
Args:
config:
NllbMoeConfig
embed_tokens (nn.Embedding):
output embedding
"""
def __init__(self, config: NllbMoeConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = NllbMoeSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
)
sparse_step = config.decoder_sparse_step
self.layers = nn.ModuleList()
for i in range(config.decoder_layers):
is_sparse = (i + 1) % sparse_step == 0 if sparse_step > 0 else False
self.layers.append(NllbMoeDecoderLayer(config, is_sparse))
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# ... (body omitted in this excerpt)
...
@add_start_docstrings(
"The bare NllbMoe Model outputting raw hidden-states without any specific head on top.",
NLLB_MOE_START_DOCSTRING,
)
class NllbMoeModel(NllbMoePreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: NllbMoeConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = NllbMoeEncoder(config, self.shared)
self.decoder = NllbMoeDecoder(config, self.shared)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(NLLB_MOE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqMoEModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# ... (body omitted in this excerpt: runs the encoder if needed, then the decoder, and
# returns a Seq2SeqMoEModelOutput or tuple)
...
@add_start_docstrings(
"The NllbMoe Model with a language modeling head. Can be used for summarization.", NLLB_MOE_START_DOCSTRING
)
class NllbMoeForConditionalGeneration(NllbMoePreTrainedModel):
base_model_prefix = "model"
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: NllbMoeConfig):
super().__init__(config)
self.model = NllbMoeModel(config)
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
self.router_z_loss_coef = config.router_z_loss_coef
self.router_aux_loss_coef = config.router_aux_loss_coef
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(NLLB_MOE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqMoEOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(NLLB_MOE_GENERATION_EXAMPLE)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# ... (body omitted in this excerpt: runs the seq2seq model, computes the LM loss on
# `labels`, and adds the router z-loss and load-balancing loss scaled by the
# coefficients set in `__init__`)
...
def _unpack_router_logits(self, router_outputs):
total_router_logits = []
total_expert_indexes = []
for router_output in router_outputs:
if router_output is not None:
router_logits, expert_indexes = router_output
total_router_logits.append(router_logits)
total_expert_indexes.append(expert_indexes)
total_router_logits = torch.cat(total_router_logits, dim=1) if len(total_router_logits) > 0 else None
total_expert_indexes = torch.stack(total_expert_indexes, dim=1) if len(total_expert_indexes) > 0 else None
return total_router_logits, total_expert_indexes
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
.\models\nllb_moe\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_nllb_moe": [
"NLLB_MOE_PRETRAINED_CONFIG_ARCHIVE_MAP",
"NllbMoeConfig",
]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nllb_moe"] = [
"NLLB_MOE_PRETRAINED_MODEL_ARCHIVE_LIST",
"NllbMoeForConditionalGeneration",
"NllbMoeModel",
"NllbMoePreTrainedModel",
"NllbMoeTop2Router",
"NllbMoeSparseMLP",
]
if TYPE_CHECKING:
from .configuration_nllb_moe import (
NLLB_MOE_PRETRAINED_CONFIG_ARCHIVE_MAP,
NllbMoeConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nllb_moe import (
NLLB_MOE_PRETRAINED_MODEL_ARCHIVE_LIST,
NllbMoeForConditionalGeneration,
NllbMoeModel,
NllbMoePreTrainedModel,
NllbMoeSparseMLP,
NllbMoeTop2Router,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
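# Illustrative (not part of the original file): thanks to `_LazyModule`, the heavy PyTorch
# modeling code is only imported when one of these names is first accessed.
from transformers.models.nllb_moe import NllbMoeConfig                     # lightweight
from transformers.models.nllb_moe import NllbMoeForConditionalGeneration   # triggers the torch import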
.\models\nougat\convert_nougat_to_hf.py
"""Convert Nougat checkpoints using the original `nougat` library. URL:
https://github.com/facebookresearch/nougat/tree/main"""
import argparse
import torch
from huggingface_hub import hf_hub_download
from nougat import NougatModel
from nougat.dataset.rasterize import rasterize_paper
from nougat.utils.checkpoint import get_checkpoint
from PIL import Image
from transformers import (
DonutSwinConfig,
DonutSwinModel,
MBartConfig,
MBartForCausalLM,
NougatImageProcessor,
NougatProcessor,
NougatTokenizerFast,
VisionEncoderDecoderModel,
)
def get_configs(model):
original_config = model.config
encoder_config = DonutSwinConfig(
image_size=original_config.input_size,
patch_size=4,
depths=original_config.encoder_layer,
num_heads=[4, 8, 16, 32],
window_size=original_config.window_size,
embed_dim=128,
)
decoder_config = MBartConfig(
is_decoder=True,
is_encoder_decoder=False,
add_cross_attention=True,
decoder_layers=original_config.decoder_layer,
max_position_embeddings=original_config.max_position_embeddings,
vocab_size=len(
model.decoder.tokenizer
),
scale_embedding=True,
add_final_layer_norm=True,
tie_word_embeddings=False,
)
return encoder_config, decoder_config
def rename_key(name):
if "encoder.model" in name:
name = name.replace("encoder.model", "encoder")
if "decoder.model" in name:
name = name.replace("decoder.model", "decoder")
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
if "patch_embed.norm" in name:
name = name.replace("patch_embed.norm", "embeddings.norm")
if name.startswith("encoder"):
if "layers" in name:
name = "encoder." + name
if "attn.proj" in name:
name = name.replace("attn.proj", "attention.output.dense")
if "attn" in name and "mask" not in name:
name = name.replace("attn", "attention.self")
if "norm1" in name:
name = name.replace("norm1", "layernorm_before")
if "norm2" in name:
name = name.replace("norm2", "layernorm_after")
if "mlp.fc1" in name:
name = name.replace("mlp.fc1", "intermediate.dense")
if "mlp.fc2" in name:
name = name.replace("mlp.fc2", "output.dense")
if name == "encoder.norm.weight":
name = "encoder.layernorm.weight"
if name == "encoder.norm.bias":
name = "encoder.layernorm.bias"
return name
def convert_state_dict(orig_state_dict, model):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if "qkv" in key:
key_split = key.split(".")
layer_num = int(key_split[3])
block_num = int(key_split[5])
dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size
if "weight" in key:
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"
] = val[:dim, :]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"
] = val[dim : dim * 2, :]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"
] = val[-dim:, :]
else:
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"
] = val[:dim]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"
] = val[dim : dim * 2]
orig_state_dict[
f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"
] = val[-dim:]
elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]:
pass
else:
orig_state_dict[rename_key(key)] = val
return orig_state_dict
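# Tiny illustration (not part of the original file) of the fused qkv split performed above,
# with made-up dimensions.
import torch

dim = 8                                    # stands in for one block's `all_head_size`
qkv_weight = torch.randn(3 * dim, dim)     # fused query/key/value projection weight
query, key, value = qkv_weight[:dim], qkv_weight[dim : dim * 2], qkv_weight[-dim:]
assert query.shape == key.shape == value.shape == (dim, dim)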
def convert_nougat_checkpoint(model_tag, pytorch_dump_folder_path=None, push_to_hub=False):
checkpoint_path = get_checkpoint(None, model_tag)
original_model = NougatModel.from_pretrained(checkpoint_path)
original_model.eval()
encoder_config, decoder_config = get_configs(original_model)
encoder = DonutSwinModel(encoder_config)
decoder = MBartForCausalLM(decoder_config)
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
model.eval()
state_dict = original_model.state_dict()
new_state_dict = convert_state_dict(state_dict, model)
model.load_state_dict(new_state_dict)
filepath = hf_hub_download(repo_id="ysharma/nougat", filename="input/nougat.pdf", repo_type="space")
images = rasterize_paper(pdf=filepath, return_pil=True)
image = Image.open(images[0])
tokenizer_file = checkpoint_path / "tokenizer.json"
tokenizer = NougatTokenizerFast(tokenizer_file=str(tokenizer_file))
tokenizer.pad_token = "<pad>"
tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"
tokenizer.unk_token = "<unk>"
tokenizer.model_max_length = original_model.config.max_length
size = {"height": original_model.config.input_size[0], "width": original_model.config.input_size[1]}
image_processor = NougatImageProcessor(
do_align_long_axis=original_model.config.align_long_axis,
size=size,
)
processor = NougatProcessor(image_processor=image_processor, tokenizer=tokenizer)
pixel_values = processor(image, return_tensors="pt").pixel_values
original_pixel_values = original_model.encoder.prepare_input(image).unsqueeze(0)
assert torch.allclose(original_pixel_values, pixel_values)
original_patch_embed = original_model.encoder.model.patch_embed(pixel_values)
patch_embeddings, _ = model.encoder.embeddings(pixel_values)
assert torch.allclose(original_patch_embed, patch_embeddings)
original_last_hidden_state = original_model.encoder(pixel_values)
last_hidden_state = model.encoder(pixel_values).last_hidden_state
assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2)
original_embeddings = original_model.decoder.model.model.decoder.embed_tokens
embeddings = model.decoder.model.decoder.embed_tokens
assert torch.allclose(original_embeddings.weight, embeddings.weight, atol=1e-3)
prompt = "hello world"
decoder_input_ids = original_model.decoder.tokenizer(
prompt, add_special_tokens=False, return_tensors="pt"
).input_ids
decoder_attention_mask = torch.ones_like(decoder_input_ids)
original_logits = original_model(
image_tensors=pixel_values, decoder_input_ids=decoder_input_ids, attention_mask=decoder_attention_mask
).logits
logits = model(
pixel_values,
decoder_input_ids=decoder_input_ids[:, :-1],
decoder_attention_mask=decoder_attention_mask[:, :-1],
).logits
assert torch.allclose(original_logits, logits, atol=1e-3)
outputs = model.generate(
pixel_values,
min_length=1,
max_length=30,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
use_cache=True,
bad_words_ids=[
[tokenizer.unk_token_id],
],
return_dict_in_generate=True,
do_sample=False,
)
generated = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)[0]
if model_tag == "0.1.0-base":
expected_generation = "# Nougat: Neural Optical Understanding for Academic Documents\n\nLukas Blecher\n\nCorrespondence to: lblec"
elif model_tag == "0.1.0-small":
expected_generation = (
"# Nougat: Neural Optical Understanding for Academic Documents\n\nLukas Blecher\n\nCorrespondence to: lble"
)
else:
raise ValueError(f"Unexpected model tag: {model_tag}")
assert generated == expected_generation
print("Looks ok!")
if pytorch_dump_folder_path is not None:
print(f"Saving model and processor to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
tag_to_name = {"0.1.0-base": "nougat-base", "0.1.0-small": "nougat-small"}
model_name = tag_to_name[model_tag]
model.push_to_hub(f"facebook/{model_name}")
processor.push_to_hub(f"facebook/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_tag",
default="0.1.0-base",
required=False,
type=str,
choices=["0.1.0-base", "0.1.0-small"],
help="Tag of the original model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
required=False,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model and processor to the 🤗 hub.",
)
args = parser.parse_args()
convert_nougat_checkpoint(args.model_tag, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\nougat\image_processing_nougat.py
"""Image processor class for Nougat."""
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
pad,
resize,
to_channel_dimension_format,
to_pil_image,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
from ...utils.import_utils import is_cv2_available, is_vision_available
logger = logging.get_logger(__name__)
if is_cv2_available():
pass
if is_vision_available():
import PIL
"""
Args:
do_crop_margin (`bool`, *optional*, defaults to `True`):
Whether to crop the image margins.
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"height": 896, "width": 672}`):
Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_thumbnail (`bool`, *optional*, defaults to `True`):
Whether to resize the image using thumbnail method.
do_align_long_axis (`bool`, *optional*, defaults to `False`):
Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the images to the largest image size in the batch.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Image standard deviation.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_crop_margin: bool = True,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_thumbnail: bool = True,
do_align_long_axis: bool = False,
do_pad: bool = True,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
):
"""
Initialize the ImagePreprocessor object with optional parameters for image preprocessing.
Args:
do_crop_margin (bool, optional, default=True):
Whether to crop the image margins.
do_resize (bool, optional, default=True):
Whether to resize the image to the specified `size`.
size (Dict[str, int], optional, default={"height": 896, "width": 672}):
Size of the image after resizing.
resample (PILImageResampling, optional, default=PILImageResampling.BILINEAR):
Resampling filter to use if resizing the image.
do_thumbnail (bool, optional, default=True):
Whether to resize the image using thumbnail method.
do_align_long_axis (bool, optional, default=False):
Whether to align the long axis of the image with the long axis of `size`.
do_pad (bool, optional, default=True):
Whether to pad the images to the largest image size in the batch.
do_rescale (bool, optional, default=True):
Whether to rescale the image by the specified scale `rescale_factor`.
rescale_factor (int or float, optional, default=1/255):
Scale factor to use if rescaling the image.
do_normalize (bool, optional, default=True):
Whether to normalize the image.
image_mean (float or List[float], optional, default=IMAGENET_DEFAULT_MEAN):
Mean to use if normalizing the image.
image_std (float or List[float], optional, default=IMAGENET_DEFAULT_STD):
Standard deviation of the image.
"""
super().__init__(**kwargs)
size = size if size is not None else {"height": 896, "width": 672}
size = get_size_dict(size)
self.do_crop_margin = do_crop_margin
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_thumbnail = do_thumbnail
self.do_align_long_axis = do_align_long_axis
self.do_pad = do_pad
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self._valid_processor_keys = [
"images",
"do_crop_margin",
"do_resize",
"size",
"resample",
"do_thumbnail",
"do_align_long_axis",
"do_pad",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def crop_margin(
self,
image: np.ndarray,
gray_threshold: int = 200,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.array:
"""
Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the
threshold).
Args:
image (`np.array`):
The image to be cropped.
gray_threshold (`int`, *optional*, defaults to `200`):
Value below which pixels are considered to be gray.
data_format (`ChannelDimension`, *optional*):
The channel dimension format of the output image. If unset, will use the inferred format from the
input.
input_data_format (`ChannelDimension`, *optional*):
The channel dimension format of the input image. If unset, will use the inferred format from the input.
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image)
image = to_pil_image(image, input_data_format=input_data_format)
data = np.array(image.convert("L")).astype(np.uint8)
max_val = data.max()
min_val = data.min()
if max_val == min_val:
image = np.array(image)
image = (
to_channel_dimension_format(image, data_format, input_data_format)
if data_format is not None
else image
)
return image
data = (data - min_val) / (max_val - min_val) * 255
gray = data < gray_threshold
coords = self.python_find_non_zero(gray)
x_min, y_min, width, height = self.python_bounding_rect(coords)
image = image.crop((x_min, y_min, x_min + width, y_min + height))
image = np.array(image).astype(np.uint8)
image = to_channel_dimension_format(image, input_data_format, ChannelDimension.LAST)
image = (
to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
)
return image
def align_long_axis(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Align the long axis of the image to the longest axis of the specified size.
Args:
image (`np.ndarray`):
The image to be aligned.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to align the long axis to.
data_format (`str` or `ChannelDimension`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
Returns:
`np.ndarray`: The aligned image.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = size["height"], size["width"]
if (output_width < output_height and input_width > input_height) or (
output_width > output_height and input_width < input_height
):
image = np.rot90(image, 3)
if data_format is not None:
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
return image
def pad_image(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Pad the image to the specified size at the top, bottom, left and right.
Args:
image (`np.ndarray`):
The image to be padded.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to pad the image to.
data_format (`str` or `ChannelDimension`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
output_height, output_width = size["height"], size["width"]
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
delta_width = output_width - input_width
delta_height = output_height - input_height
pad_top = delta_height // 2
pad_left = delta_width // 2
pad_bottom = delta_height - pad_top
pad_right = delta_width - pad_left
padding = ((pad_top, pad_bottom), (pad_left, pad_right))
return pad(image, padding, data_format=data_format, input_data_format=input_data_format)
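# Usage sketch (illustrative, not part of the original file): padding a random
# channels-last image to the processor's default canvas of 896 x 672.
import numpy as np
from transformers import NougatImageProcessor

image_processor = NougatImageProcessor()
image = np.random.randint(0, 255, (600, 400, 3), dtype=np.uint8)
padded = image_processor.pad_image(image, size={"height": 896, "width": 672})
print(padded.shape)                        # (896, 672, 3)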
def thumbnail(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any
corresponding dimension of the specified size.
Args:
image (`np.ndarray`):
The image to be resized.
size (`Dict[str, int]`):
The size `{"height": h, "width": w}` to resize the image to.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
The resampling filter to use.
data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = size["height"], size["width"]
height = min(input_height, output_height)
width = min(input_width, output_width)
if height == input_height and width == input_width:
return image
if input_height > input_width:
width = int(input_width * height / input_height)
elif input_width > input_height:
height = int(input_height * width / input_width)
return resize(
image,
size=(height, width),
resample=resample,
reducing_gap=2.0,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resizes `image` to `(height, width)` specified by `size` using the PIL library.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size)
shortest_edge = min(size["height"], size["width"])
output_size = get_resize_output_image_size(
image, size=shortest_edge, default_to_square=False, input_data_format=input_data_format
)
resized_image = resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return resized_image
def preprocess(
self,
images: ImageInput,
do_crop_margin: bool = None,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_thumbnail: bool = None,
do_align_long_axis: bool = None,
do_pad: bool = None,
do_rescale: bool = None,
rescale_factor: Union[int, float] = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
# ... (body omitted in this excerpt: crop margin, align long axis, resize, thumbnail,
# pad, rescale and normalize, then return a `BatchFeature` of pixel values)
...
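# Usage sketch (illustrative, not part of the original file): the full preprocessing
# pipeline on a random image; with the default settings the output is always 896 x 672.
import numpy as np
from PIL import Image
from transformers import NougatImageProcessor

image_processor = NougatImageProcessor()
page = Image.fromarray(np.random.randint(0, 255, (600, 400, 3), dtype=np.uint8))
pixel_values = image_processor(page, return_tensors="pt").pixel_values
print(pixel_values.shape)                  # torch.Size([1, 3, 896, 672])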
.\models\nougat\processing_nougat.py
"""
Processor class for Nougat.
"""
from typing import Dict, List, Optional, Union
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput, TruncationStrategy
from ...processing_utils import ProcessorMixin
from ...utils import PaddingStrategy, TensorType
class NougatProcessor(ProcessorMixin):
r"""
Constructs a Nougat processor which wraps a Nougat image processor and a Nougat tokenizer into a single processor.
[`NougatProcessor`] offers all the functionalities of [`NougatImageProcessor`] and [`NougatTokenizerFast`]. See the
[`~NougatProcessor.__call__`] and [`~NougatProcessor.decode`] for more information.
Args:
image_processor ([`NougatImageProcessor`]):
An instance of [`NougatImageProcessor`]. The image processor is a required input.
tokenizer ([`NougatTokenizerFast`]):
An instance of [`NougatTokenizerFast`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
def __call__(
self,
images=None,
text=None,
do_crop_margin: bool = None,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: "PILImageResampling" = None,
do_thumbnail: bool = None,
do_align_long_axis: bool = None,
do_pad: bool = None,
do_rescale: bool = None,
rescale_factor: Union[int, float] = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional["ChannelDimension"] = "channels_first",
input_data_format: Optional[Union[str, "ChannelDimension"]] = None,
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
):
if images is None and text is None:
raise ValueError("You need to specify either an `images` or `text` input to process.")
if images is not None:
inputs = self.image_processor(
images,
do_crop_margin=do_crop_margin,
do_resize=do_resize,
size=size,
resample=resample,
do_thumbnail=do_thumbnail,
do_align_long_axis=do_align_long_axis,
do_pad=do_pad,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
return_tensors=return_tensors,
data_format=data_format,
input_data_format=input_data_format,
)
if text is not None:
encodings = self.tokenizer(
text,
text_pair=text_pair,
text_target=text_target,
text_pair_target=text_pair_target,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
)
if text is None:
return inputs
elif images is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to NougatTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to NougatTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def post_process_generation(self, *args, **kwargs):
"""
This method forwards all its arguments to NougatTokenizer's [`~PreTrainedTokenizer.post_process_generation`].
Please refer to the docstring of this method for more information.
"""
return self.tokenizer.post_process_generation(*args, **kwargs)
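# End-to-end sketch (illustrative, not part of the original file), assuming the converted
# "facebook/nougat-base" checkpoint; post-processing also needs the optional `nltk` and
# `python-Levenshtein` packages. The random image stands in for a real document page.
import numpy as np
from PIL import Image
from transformers import NougatProcessor, VisionEncoderDecoderModel

processor = NougatProcessor.from_pretrained("facebook/nougat-base")
model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base")
page = Image.fromarray(np.random.randint(0, 255, (600, 400, 3), dtype=np.uint8))
pixel_values = processor(images=page, return_tensors="pt").pixel_values
outputs = model.generate(pixel_values, max_new_tokens=20)
sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
sequence = processor.post_process_generation(sequence, fix_markdown=False)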
.\models\nougat\tokenization_nougat_fast.py
"""
Fast tokenizer class for Nougat.
"""
import re
from functools import partial
from multiprocessing import Pool
from typing import List, Union
import numpy as np
from transformers.tokenization_utils_base import INIT_TOKENIZER_DOCSTRING
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.utils import add_end_docstrings
from ...utils import is_levenshtein_available, is_nltk_available, logging, requires_backends
if is_levenshtein_available():
from Levenshtein import ratio
if is_nltk_available():
import nltk
logger = logging.get_logger(__name__)
INIT_TOKENIZER_DOCSTRING += """
tokenizer_object ([`tokenizers.Tokenizer`]):
A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
tokenizers](../fast_tokenizers) for more information.
tokenizer_file ([`str`]):
A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
tokenizers.
"""
PRETRAINED_VOCAB_FILES_MAP = {
"tokenizer_file": {
"facebook/nougat-base": "https://huggingface.co/facebook/nougat-base/tokenizer/blob/main/tokenizer.json",
},
}
VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/nougat-base": 3584}
def markdown_compatible(text: str) -> str:
"""
Make text compatible with Markdown formatting.
This function applies various formatting adjustments to make the text compatible with Markdown.
Args:
text (`str`):
The input text to make Markdown-compatible.
Returns:
`str`: The Markdown-compatible text.
"""
text = re.sub(r"^\(([\d.]+[a-zA-Z]?)\) \\\[(.+?)\\\]$", r"\[\2 \\tag{\1}\]", text, flags=re.M)
text = re.sub(r"^\\\[(.+?)\\\] \(([\d.]+[a-zA-Z]?)\)$", r"\[\1 \\tag{\2}\]", text, flags=re.M)
text = re.sub(
r"^\\\[(.+?)\\\] \(([\d.]+[a-zA-Z]?)\) (\\\[.+?\\\])$",
r"\[\1 \\tag{\2}\] \3",
text,
flags=re.M,
)
text = text.replace(r"\. ", ". ")
text = text.replace(r"\bm{", r"\mathbf{").replace(r"{\\bm ", r"\mathbf{")
text = re.sub(r"\\mbox{ ?\\boldmath\$(.*?)\$}", r"\\mathbf{\1}", text)
text = re.sub(
r"((?:http|ftp|https):\/\/(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))",
r"[\1](\1)",
text,
)
text = re.sub(r"```\s*(.+?)\s*```", r"```\n\1\n```", text, flags=re.S)
return text
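As a quick illustration of the first substitution above (an equation number written before a display equation is moved into a `\tag{}`), a sketch with a made-up input line:

```python
line = r"(1.2) \[E = mc^{2}\]"  # hypothetical model output line
print(markdown_compatible(line))
# expected: \[E = mc^{2} \tag{1.2}\]
```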
def normalize_list_like_lines(generation):
pattern = r"(?:^)(-|\*)?(?!-|\*) ?((?:\d|[ixv])+ )?.+? (-|\*) (((?:\d|[ixv])+)\.(\d|[ixv]) )?.*(?:$)"
for match in reversed(list(re.finditer(pattern, generation, flags=re.I | re.M))):
start, stop = match.span()
delim = match.group(3) + " "
splits = match.group(0).split(delim)
replacement = ""
if match.group(1) is not None:
splits = splits[1:]
delim1 = match.group(1) + " "
else:
delim1 = ""
continue
pre, post = generation[:start], generation[stop:]
for i, item in enumerate(splits):
level = 0
potential_numeral, _, rest = item.strip().partition(" ")
if not rest:
continue
if re.match(r"^[\dixv]+((?:\.[\dixv])?)+$", potential_numeral, flags=re.I | re.M):
level = potential_numeral.count(".")
replacement += (
("\n" if i > 0 else "") + ("\t" * level) + (delim if i > 0 or start == 0 else delim1) + item.strip()
)
if post == "":
post = "\n"
generation = pre + replacement + post
return generation
def find_next_punctuation(text: str, start_idx=0):
"""
Find the index of the next punctuation mark.
Args:
text (`str`):
String to examine
start_idx (`int`, *optional*)
Index where to start
"""
for i in range(start_idx, len(text)):
if text[i] in [".", "?", "!", "\n"]:
return i
return None
def truncate_repetitions(text: str, min_len: int = 30) -> str:
"""
Attempt to truncate repeating segments in the input string.
Args:
text (str): The input text to process.
min_len (int, optional): The minimum length of repeating segments to truncate.
Returns:
str: The processed text with repeated segments truncated.
"""
text_lower = text.lower()
text_length = len(text_lower)
if text_length < 2 * min_len:
return text
max_repetition_length = None
for repetition_length in range(min_len, int(text_length / 2)):
same = True
for i in range(0, repetition_length):
if text_lower[text_length - repetition_length - i - 1] != text_lower[text_length - i - 1]:
same = False
break
if same:
max_repetition_length = repetition_length
if max_repetition_length is None:
return text
lcs = text_lower[-max_repetition_length:]
substituted_text = text
substituted_text_lower = text_lower
while substituted_text_lower.endswith(lcs):
substituted_text = substituted_text[:-max_repetition_length]
substituted_text_lower = substituted_text_lower[:-max_repetition_length]
repeating_tail = text_lower[len(substituted_text_lower):]
substituted_text_lower_out = substituted_text_lower
while True:
sentence_end = find_next_punctuation(text_lower, len(substituted_text_lower_out))
sentence_start = find_next_punctuation(text_lower[::-1], len(substituted_text_lower_out))
if sentence_end and sentence_start:
sentence = text_lower[sentence_start:sentence_end]
substituted_text_lower_out = text_lower[:sentence_end + 1]
if sentence in repeating_tail:
break
else:
break
text_out = text[:len(substituted_text_lower_out)]
return text_out
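A rough illustration with a made-up degenerate generation that repeats one sentence verbatim; the repeated tail is cut off at the preceding sentence boundary:

```python
unit = "The model keeps repeating this sentence. "  # made-up degenerate output
degenerate = "Abstract: results follow. " + unit * 4
print(truncate_repetitions(degenerate))
# roughly: "Abstract: results follow. The model keeps repeating this sentence."
```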
def remove_numbers(lines):
def _clean(s):
return re.sub(r"(?:[\d_]|\*\*)", "", s).strip()
if isinstance(lines, str):
return _clean(lines)
out = []
for l in lines:
out.append(_clean(l))
return out
def get_slices(lines, clean_lines):
"""
Get slices of text based on specific criteria within the lines.
This function identifies and returns slices of text from the input lines based on certain conditions.
These conditions were chosen by the Nougat authors:
- The slice is less than 200 characters long.
- The slice is more than 3 characters long.
- The slice does not start with "[MISSING_PAGE".
- The slice is either the same as the next slice or the ratio of the two in terms of Levenshtein distance is
greater than 0.9.
Args:
lines (`List[str]`):
The list of lines containing the text.
clean_lines (`List[str]`):
A cleaned version of the text (without numbers).
Returns:
`List[tuple]`: A list of tuples representing the start and end indices of text slices.
"""
indices = np.zeros(len(lines))
for i in range(len(lines) - 1):
j = i + 1
while not clean_lines[j] and j < len(lines) - 1:
j += 1
if (
len(clean_lines[i]) < 200
and len(clean_lines[i]) > 3
and len(clean_lines[j]) < 200
and len(clean_lines[j]) > 3
and not clean_lines[i].startswith("[MISSING_PAGE")
and (clean_lines[i] == clean_lines[j] or ratio(clean_lines[i], clean_lines[j]) > 0.9)
):
indices[i:j] = 1
ids = np.where(indices)[0]
slices = []
if len(ids) == 0:
return slices
j0 = 0
for j, x in enumerate(np.diff(ids) > 3):
if x:
slices.append((ids[j0], ids[j] + 2))
j0 = j + 1
slices.append((ids[j0], ids[-1] + 2))
return [sli for sli in slices if sli[1] - sli[0] > 15]
def remove_slice_from_lines(lines, clean_text, slice) -> str:
"""
Remove a slice of text from the lines based on specific criteria.
This function identifies a slice of text within the lines and removes it based on certain conditions.
Args:
lines (list of str): The list of lines containing the text.
clean_text (list of str): A cleaned version of the text (without numbers).
slice (tuple): A tuple representing the start and end indices of the slice to be removed.
Returns:
str: The removed slice of text as a single string.
"""
base = clean_text[slice[0]]
section = list(slice)
check_start_flag = False
for line_idx in range(max(0, slice[0] - 1), max(0, slice[0] - 5), -1):
if not lines[line_idx]:
continue
if lines[line_idx] == "## References":
section[0] = line_idx
break
elif ratio(base, remove_numbers(lines[line_idx])) < 0.9:
section[0] = line_idx + 1
potential_ref = remove_numbers(lines[max(0, line_idx - 1)].partition("* [")[-1])
if len(potential_ref) >= 0.75 * len(base) and ratio(base, potential_ref) < 0.9:
section[0] = line_idx
check_start_flag = True
break
for line_idx in range(min(len(lines), slice[1]), min(len(lines), slice[1] + 5)):
if ratio(base, remove_numbers(lines[line_idx])) < 0.9:
section[1] = line_idx
break
if len(lines) <= section[1]:
section[1] = len(lines) - 1
to_delete = "\n".join(lines[section[0] : section[1] + 1])
itera, iterb = enumerate(lines[section[1] - 1]), enumerate(lines[section[1]])
while True:
try:
(ia, a) = next(itera)
while a.isnumeric():
(ia, a) = next(itera)
(ib, b) = next(iterb)
while b.isnumeric():
(ib, b) = next(iterb)
if a != b:
break
except StopIteration:
break
if check_start_flag and "* [" in to_delete:
to_delete = "* [" + to_delete.partition("* [")[-1]
try:
delta = len(lines[section[1]]) - ib - 1
if delta > 0:
to_delete = to_delete[:-delta]
except UnboundLocalError:
pass
return to_delete.strip()
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class NougatTokenizerFast(PreTrainedTokenizerFast):
"""
Fast tokenizer for Nougat (backed by HuggingFace tokenizers library).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods. This class mainly adds Nougat-specific
methods for postprocessing the generated text.
Args:
vocab_file (`str`, *optional*):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
contains the vocabulary necessary to instantiate a tokenizer.
tokenizer_file (`str`, *optional*):
[tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether to clean up spaces after decoding; cleanup consists of removing potential artifacts like extra
spaces.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = None
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
clean_up_tokenization_spaces=False,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
**kwargs,
):
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
**kwargs,
)
self.vocab_file = vocab_file
def remove_hallucinated_references(self, text: str) -> str:
"""
Remove hallucinated or missing references from the text.
This function identifies and removes references that are marked as missing or hallucinated from the input text.
Args:
text (`str`):
The input text containing references.
Returns:
`str`: The text with hallucinated references removed.
"""
lines = text.split("\n")
if len(lines) == 0:
return ""
clean_lines = remove_numbers(lines)
slices = get_slices(lines, clean_lines)
to_delete = []
for slice in slices:
to_delete.append(remove_slice_from_lines(lines, clean_lines, slice))
for to_delete in reversed(to_delete):
text = text.replace(to_delete, "\n\n[MISSING_PAGE_POST]\n\n")
text = re.sub(
r"## References\n+\[MISSING_PAGE_POST(:\d+)?\]",
"\n\n[MISSING_PAGE_POST\\1]",
text,
)
return text
def correct_tables(self, generation: str) -> str:
"""
Takes a generated string and fixes tables/tabulars to make them match the markdown format needed.
Args:
generation (str): The generated text to be postprocessed.
Returns:
str: The postprocessed text.
Example:
```
correct_tables("\\begin{table} \\begin{tabular}{l l} & \\ \\end{tabular} \\end{table}")
"\\begin{table}\n\\begin{tabular}{l l} & \\ \\end{tabular}\n\\end{table}"
```
"""
for l in generation.split("\n"):
if l.count("\\begin{tabular}") > 15 or l.count("\\multicolumn") > 60 or l.count("&") > 400:
generation = generation.replace(l, "")
generation = generation.replace("\\begin{table} \\begin{tabular}", "\\begin{table}\n\\begin{tabular}")
generation = generation.replace("\\end{tabular} \\end{table}", "\\end{tabular}\n\\end{table}")
generation = generation.replace("\\end{table} Tab", "\\end{table}\nTab")
generation = re.sub(r"(^.+)\\begin{tab", r"\1\n\\begin{tab", generation, flags=re.M)
generation = generation.replace(r"\begin{tabular}{l l} & \\ \end{tabular}", "")
generation = generation.replace("\\begin{tabular}{}\n\n\\end{tabular}", "")
return generation
def post_process_generation(self, generation: Union[str, List[str]], fix_markdown: bool = True, num_workers: int = None) -> Union[str, List[str]]:
"""
Postprocess a generated text or a list of generated texts.
This function can be used to perform postprocessing on generated text, such as fixing Markdown formatting.
Postprocessing is quite slow so it is recommended to use multiprocessing to speed up the process.
Args:
generation (Union[str, List[str]]):
The generated text or a list of generated texts.
fix_markdown (`bool`, *optional*, defaults to `True`):
Whether to perform Markdown formatting fixes.
num_workers (`int`, *optional*):
Optional number of workers to pass to leverage multiprocessing (postprocessing several texts in
parallel).
Returns:
Union[str, List[str]]: The postprocessed text or list of postprocessed texts.
"""
requires_backends(self, ["nltk", "levenshtein"])
if isinstance(generation, list):
if num_workers is not None and isinstance(num_workers, int):
with Pool(num_workers) as p:
return p.map(partial(self.post_process_single, fix_markdown=fix_markdown), generation)
else:
return [self.post_process_single(s, fix_markdown=fix_markdown) for s in generation]
else:
return self.post_process_single(generation, fix_markdown=fix_markdown)
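A typical decoding pipeline, sketched under the assumption that `generated_ids` comes from the corresponding vision encoder-decoder model's `generate` call and that the optional `nltk` and `python-Levenshtein` backends are installed:

```python
from transformers import NougatTokenizerFast

tokenizer = NougatTokenizerFast.from_pretrained("facebook/nougat-base")
# generated_ids: LongTensor of shape (batch, seq_len) produced by model.generate (assumed)
raw_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
clean_texts = tokenizer.post_process_generation(raw_texts, fix_markdown=True, num_workers=2)
```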
.\models\nougat\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_vision_available
_import_structure = {
"processing_nougat": ["NougatProcessor"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_nougat_fast"] = ["NougatTokenizerFast"]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_nougat"] = ["NougatImageProcessor"]
if TYPE_CHECKING:
from .processing_nougat import NougatProcessor
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_nougat_fast import NougatTokenizerFast
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_nougat import NougatImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\nystromformer\configuration_nystromformer.py
""" Nystromformer模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"uw-madison/nystromformer-512": "https://huggingface.co/uw-madison/nystromformer-512/resolve/main/config.json",
}
class NystromformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`NystromformerModel`]. It is used to instantiate
a Nystromformer model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a configuration similar to that of the Nystromformer
[uw-madison/nystromformer-512](https://huggingface.co/uw-madison/nystromformer-512) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 30000):
Vocabulary size of the Nystromformer model.
hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e. feed-forward) layer in the Transformer encoder.
hidden_act (`str`, *optional*, defaults to `"gelu_new"`):
The non-linear activation function in the encoder and pooler.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 510):
The maximum sequence length that this model might ever be used with.
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`NystromformerModel`].
segment_means_seq_len (`int`, *optional*, defaults to 64):
Sequence length used in segment-means.
num_landmarks (`int`, *optional*, defaults to 64):
Number of landmark (or Nystrom) points used for the Nystrom approximation of the softmax self-attention matrix.
conv_kernel_size (`int`, *optional*, defaults to 65):
Kernel size of the depthwise convolution used in the Nystrom approximation.
inv_coeff_init_option (`bool`, *optional*, defaults to `False`):
Whether to use an exact coefficient computation for the initial values of the iterative method for the
Moore-Penrose inverse of a matrix.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
"""
# Set the model type to "nystromformer"
model_type = "nystromformer"
# Initialization method for a NystromformerConfig instance
def __init__(
self,
vocab_size=30000,  # vocabulary size, defaults to 30000
hidden_size=768,  # hidden size, defaults to 768
num_hidden_layers=12,  # number of hidden layers, defaults to 12
num_attention_heads=12,  # number of attention heads, defaults to 12
intermediate_size=3072,  # intermediate (feed-forward) size, defaults to 3072
hidden_act="gelu_new",  # hidden activation function, defaults to "gelu_new"
hidden_dropout_prob=0.1,  # hidden-layer dropout probability, defaults to 0.1
attention_probs_dropout_prob=0.1,  # attention-probability dropout, defaults to 0.1
max_position_embeddings=510,  # maximum position embedding length, defaults to 510
type_vocab_size=2,  # token type vocabulary size, defaults to 2
segment_means_seq_len=64,  # segment-means sequence length, defaults to 64
num_landmarks=64,  # number of landmarks, defaults to 64
conv_kernel_size=65,  # convolution kernel size, defaults to 65
inv_coeff_init_option=False,  # inverse-coefficient init option, defaults to False
initializer_range=0.02,  # initializer range, defaults to 0.02
layer_norm_eps=1e-5,  # layer norm epsilon, defaults to 1e-5
pad_token_id=1,  # padding token id, defaults to 1
bos_token_id=0,  # beginning-of-sequence token id, defaults to 0
eos_token_id=2,  # end-of-sequence token id, defaults to 2
**kwargs,  # additional keyword arguments
):
# Call the parent initializer with pad_token_id, bos_token_id, eos_token_id and the remaining kwargs
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# Store the passed arguments as attributes of the object
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.segment_means_seq_len = segment_means_seq_len
self.num_landmarks = num_landmarks
self.conv_kernel_size = conv_kernel_size
self.inv_coeff_init_option = inv_coeff_init_option
self.layer_norm_eps = layer_norm_eps
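A minimal instantiation sketch for the configuration and the bare model (random weights), following the usual configuration/model pattern in the library:

```python
from transformers import NystromformerConfig, NystromformerModel

# Initializing a configuration with the default (nystromformer-512 style) values
configuration = NystromformerConfig()
# Initializing a model from that configuration
model = NystromformerModel(configuration)
# Accessing the model configuration
configuration = model.config
```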
.\models\nystromformer\convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py
"""从原始存储库转换 Nystromformer 检查点"""
import argparse
import torch
from transformers import NystromformerConfig, NystromformerForMaskedLM
def rename_key(orig_key):
if "model" in orig_key:
orig_key = orig_key.replace("model.", "")
if "norm1" in orig_key:
orig_key = orig_key.replace("norm1", "attention.output.LayerNorm")
if "norm2" in orig_key:
orig_key = orig_key.replace("norm2", "output.LayerNorm")
if "norm" in orig_key:
orig_key = orig_key.replace("norm", "LayerNorm")
if "transformer" in orig_key:
layer_num = orig_key.split(".")[0].split("_")[-1]
orig_key = orig_key.replace(f"transformer_{layer_num}", f"encoder.layer.{layer_num}")
if "mha.attn" in orig_key:
orig_key = orig_key.replace("mha.attn", "attention.self")
if "mha" in orig_key:
orig_key = orig_key.replace("mha", "attention")
if "W_q" in orig_key:
orig_key = orig_key.replace("W_q", "self.query")
if "W_k" in orig_key:
orig_key = orig_key.replace("W_k", "self.key")
if "W_v" in orig_key:
orig_key = orig_key.replace("W_v", "self.value")
if "ff1" in orig_key:
orig_key = orig_key.replace("ff1", "intermediate.dense")
if "ff2" in orig_key:
orig_key = orig_key.replace("ff2", "output.dense")
if "ff" in orig_key:
orig_key = orig_key.replace("ff", "output.dense")
if "mlm_class" in orig_key:
orig_key = orig_key.replace("mlm.mlm_class", "cls.predictions.decoder")
if "mlm" in orig_key:
orig_key = orig_key.replace("mlm", "cls.predictions.transform")
if "cls" not in orig_key:
orig_key = "nystromformer." + orig_key
return orig_key
def convert_checkpoint_helper(config, orig_state_dict):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if ("pooler" in key) or ("sen_class" in key) or ("conv.bias" in key):
continue
else:
orig_state_dict[rename_key(key)] = val
orig_state_dict["cls.predictions.bias"] = orig_state_dict["cls.predictions.decoder.bias"]
orig_state_dict["nystromformer.embeddings.position_ids"] = (
torch.arange(config.max_position_embeddings).expand((1, -1)) + 2
)
return orig_state_dict
def convert_nystromformer_checkpoint(checkpoint_path, nystromformer_config_file, pytorch_dump_path):
orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]
config = NystromformerConfig.from_json_file(nystromformer_config_file)
model = NystromformerForMaskedLM(config)
new_state_dict = convert_checkpoint_helper(config, orig_state_dict)
model.load_state_dict(new_state_dict)
model.eval()
model.save_pretrained(pytorch_dump_path)
print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_model_path", default=None, type=str, required=True, help="Path to Nystromformer pytorch checkpoint."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The json file for Nystromformer model config.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_nystromformer_checkpoint(args.pytorch_model_path, args.config_file, args.pytorch_dump_path)
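The script is meant to be run from the command line with the three flags defined above; equivalently, the helper can be called directly. A sketch with placeholder paths:

```python
# Direct call with placeholder paths (equivalent to the CLI flags above)
convert_nystromformer_checkpoint(
    checkpoint_path="/path/to/original_checkpoint.p",   # placeholder
    nystromformer_config_file="/path/to/config.json",   # placeholder
    pytorch_dump_path="/path/to/converted_model",       # placeholder
)
```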
.\models\nystromformer\modeling_nystromformer.py
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_nystromformer import NystromformerConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "uw-madison/nystromformer-512"
_CONFIG_FOR_DOC = "NystromformerConfig"
NYSTROMFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"uw-madison/nystromformer-512",
]
class NystromformerEmbeddings(nn.Module):
"""构建来自单词、位置和标记类型嵌入的嵌入层。"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings + 2, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
persistent=False,
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
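Note the `+ 2` offset on the `position_ids` buffer above; it matches the `config.max_position_embeddings + 2` rows allocated for `position_embeddings` and the `+ 2` applied in the conversion script earlier. A quick sketch of the buffer contents, assuming the default `max_position_embeddings=510`:

```python
import torch

max_position_embeddings = 510  # assumed default
position_ids = torch.arange(max_position_embeddings).expand((1, -1)) + 2
print(position_ids[0, :5])  # tensor([2, 3, 4, 5, 6]); rows 0 and 1 of the table are effectively reserved
```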
class NystromformerSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.num_landmarks = config.num_landmarks
self.seq_len = config.segment_means_seq_len
self.conv_kernel_size = config.conv_kernel_size
if config.inv_coeff_init_option:
self.init_option = config["inv_init_coeff_option"]
else:
self.init_option = "original"
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.conv_kernel_size is not None:
self.conv = nn.Conv2d(
in_channels=self.num_attention_heads,
out_channels=self.num_attention_heads,
kernel_size=(self.conv_kernel_size, 1),
padding=(self.conv_kernel_size // 2, 0),
bias=False,
groups=self.num_attention_heads,
)
def iterative_inv(self, mat, n_iter=6):
identity = torch.eye(mat.size(-1), device=mat.device)
key = mat
if self.init_option == "original":
value = 1 / torch.max(torch.sum(key, dim=-2)) * key.transpose(-1, -2)
else:
value = 1 / torch.max(torch.sum(key, dim=-2), dim=-1).values[:, :, None, None] * key.transpose(-1, -2)
for _ in range(n_iter):
key_value = torch.matmul(key, value)
value = torch.matmul(
0.25 * value,
13 * identity
- torch.matmul(key_value, 15 * identity - torch.matmul(key_value, 7 * identity - key_value)),
)
return value
def transpose_for_scores(self, layer):
new_layer_shape = layer.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
layer = layer.view(*new_layer_shape)
return layer.permute(0, 2, 1, 3)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
query_layer = query_layer / math.sqrt(math.sqrt(self.attention_head_size))
key_layer = key_layer / math.sqrt(math.sqrt(self.attention_head_size))
if self.num_landmarks == self.seq_len:
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
context_layer = torch.matmul(attention_probs, value_layer)
else:
q_landmarks = query_layer.reshape(
-1,
self.num_attention_heads,
self.num_landmarks,
self.seq_len // self.num_landmarks,
self.attention_head_size,
).mean(dim=-2)
k_landmarks = key_layer.reshape(
-1,
self.num_attention_heads,
self.num_landmarks,
self.seq_len // self.num_landmarks,
self.attention_head_size,
).mean(dim=-2)
kernel_1 = torch.nn.functional.softmax(torch.matmul(query_layer, k_landmarks.transpose(-1, -2)), dim=-1)
kernel_2 = torch.nn.functional.softmax(torch.matmul(q_landmarks, k_landmarks.transpose(-1, -2)), dim=-1)
attention_scores = torch.matmul(q_landmarks, key_layer.transpose(-1, -2))
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
kernel_3 = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = torch.matmul(kernel_1, self.iterative_inv(kernel_2))
new_value_layer = torch.matmul(kernel_3, value_layer)
context_layer = torch.matmul(attention_probs, new_value_layer)
if self.conv_kernel_size is not None:
context_layer += self.conv(value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
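The landmark branch above is the Nyström approximation: softmax(Q K^T) is approximated by kernel_1 @ pinv(kernel_2) @ kernel_3, where the landmarks are segment means of the queries and keys and `iterative_inv` plays the role of the pseudo-inverse. A standalone toy sketch of the same idea (single head, no batch, `torch.linalg.pinv` instead of the iterative inverse; all sizes are made up):

```python
import torch

seq_len, num_landmarks, head_size = 64, 8, 16
q = torch.randn(seq_len, head_size) / head_size ** 0.25
k = torch.randn(seq_len, head_size) / head_size ** 0.25

# Landmarks are segment means, as in the reshape(...).mean(dim=-2) calls above
q_land = q.reshape(num_landmarks, seq_len // num_landmarks, head_size).mean(dim=-2)
k_land = k.reshape(num_landmarks, seq_len // num_landmarks, head_size).mean(dim=-2)

kernel_1 = torch.softmax(q @ k_land.T, dim=-1)       # (seq_len, num_landmarks)
kernel_2 = torch.softmax(q_land @ k_land.T, dim=-1)  # (num_landmarks, num_landmarks)
kernel_3 = torch.softmax(q_land @ k.T, dim=-1)       # (num_landmarks, seq_len)

approx = kernel_1 @ torch.linalg.pinv(kernel_2) @ kernel_3
exact = torch.softmax(q @ k.T, dim=-1)
print((approx - exact).abs().mean())  # compare the approximate and exact attention maps
```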
class NystromformerSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class NystromformerAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = NystromformerSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = NystromformerSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
self_outputs = self.self(hidden_states, attention_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class NystromformerIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class NystromformerOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class NystromformerLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = NystromformerAttention(config)
self.add_cross_attention = config.add_cross_attention
self.intermediate = NystromformerIntermediate(config)
self.output = NystromformerOutput(config)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
self_attention_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class NystromformerEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([NystromformerLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, attention_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class NystromformerPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class NystromformerLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = NystromformerPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class NystromformerOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = NystromformerLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class NystromformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = NystromformerConfig
base_model_prefix = "nystromformer"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
NYSTROMFORMER_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`NystromformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
NYSTROMFORMER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using `AutoTokenizer`. See `PreTrainedTokenizer.encode` and
`PreTrainedTokenizer.__call__` for details.
[What are input IDs?](../glossary
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a `ModelOutput` instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Nyströmformer Model transformer outputting raw hidden-states without any specific head on top.",
NYSTROMFORMER_START_DOCSTRING,
)
class NystromformerModel(NystromformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
# Initialize the Nystromformer embeddings and encoder based on the given configuration
self.embeddings = NystromformerEmbeddings(config)
self.encoder = NystromformerEncoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Retrieve the word embeddings from the Nystromformer embeddings
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
# Set new word embeddings for the Nystromformer embeddings
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
# Prune heads in each layer of the encoder
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings(
"Nyströmformer Model with a `language modeling` head on top.",
NYSTROMFORMER_START_DOCSTRING
)
class NystromformerForMaskedLM(NystromformerPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder"]
def __init__(self, config):
super().__init__(config)
# Initialize the Nystromformer model and MLM head based on the provided configuration
self.nystromformer = NystromformerModel(config)
self.cls = NystromformerOnlyMLMHead(config)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
# Retrieve the output embeddings from the MLM head
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
# Set new output embeddings for the MLM head
self.cls.predictions.decoder = new_embeddings
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,  # input token id sequence, may be None
attention_mask: Optional[torch.FloatTensor] = None,  # attention mask marking which positions are padding
token_type_ids: Optional[torch.LongTensor] = None,  # token type ids, e.g. to distinguish sentence A from sentence B
position_ids: Optional[torch.LongTensor] = None,  # position ids specifying the position of each token
head_mask: Optional[torch.FloatTensor] = None,  # head mask to selectively disable attention heads
inputs_embeds: Optional[torch.FloatTensor] = None,  # precomputed embeddings, can be passed instead of token ids
labels: Optional[torch.LongTensor] = None,  # labels for computing the MLM loss, marking the masked positions
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a dict-like ModelOutput
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # decide whether to return a ModelOutput dict
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]  # sequence output of the base model
prediction_scores = self.cls(sequence_output)  # MLM head projects hidden states to vocabulary logits
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()  # cross-entropy loss for masked language modeling
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))  # compute the MLM loss
if not return_dict:
output = (prediction_scores,) + outputs[1:]  # assemble a plain tuple when not returning a dict
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output  # prepend the loss when available
return MaskedLMOutput(
loss=masked_lm_loss,  # MLM loss
logits=prediction_scores,  # prediction logits
hidden_states=outputs.hidden_states,  # hidden states
attentions=outputs.attentions,  # attention weights
)
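A small fill-mask sketch, assuming the `uw-madison/nystromformer-512` checkpoint, its associated tokenizer, and a made-up example sentence:

```python
import torch
from transformers import AutoTokenizer, NystromformerForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("uw-madison/nystromformer-512")
model = NystromformerForMaskedLM.from_pretrained("uw-madison/nystromformer-512")

inputs = tokenizer("Paris is the [MASK] of France.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the masked token and its most likely replacement
mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
print(tokenizer.decode(logits[0, mask_index].argmax(dim=-1)))
```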
# Head module for sequence-level classification tasks
class NystromformerClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
# Linear layer mapping from config.hidden_size to config.hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Dropout layer that randomly zeroes elements with probability config.hidden_dropout_prob
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Output projection mapping from config.hidden_size to config.num_labels
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
self.config = config
def forward(self, features, **kwargs):
# Take the hidden state at the first position (corresponding to the <CLS> token)
x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
# Apply dropout
x = self.dropout(x)
# Linear transformation
x = self.dense(x)
# Non-linear activation specified in the config
x = ACT2FN[self.config.hidden_act](x)
# Apply dropout again
x = self.dropout(x)
# Final linear projection to the label space
x = self.out_proj(x)
return x
# Nyströmformer model with a sequence classification/regression head on top (a linear layer on the pooled output), e.g. for GLUE tasks
@add_start_docstrings(
"""
Nyströmformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
NYSTROMFORMER_START_DOCSTRING,
)
class NystromformerForSequenceClassification(NystromformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels for the model
self.num_labels = config.num_labels
# Nyströmformer backbone
self.nystromformer = NystromformerModel(config)
# Classification head
self.classifier = NystromformerClassificationHead(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
定义了一个函数签名,表明该函数接受一些输入,并返回一个包含torch.Tensor或SequenceClassifierOutput的元组。
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
用于计算序列分类/回归损失的标签。索引应在 `[0, ..., config.num_labels - 1]` 范围内。
如果 `config.num_labels == 1`,则计算回归损失(均方损失);如果 `config.num_labels > 1`,则计算分类损失(交叉熵)。
"""
# Determine return_dict: fall back to self.config.use_return_dict when None
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Forward pass through the Nyströmformer backbone
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Sequence output of the base model
sequence_output = outputs[0]
# Pass the sequence output through the classification head to get logits
logits = self.classifier(sequence_output)
# Loss defaults to None
loss = None
# Compute the loss when labels are provided
if labels is not None:
# If problem_type is not set, infer it from num_labels and the label dtype
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Compute the appropriate loss for the inferred problem_type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# When return_dict is False, return a plain tuple
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# When return_dict is True, return a SequenceClassifierOutput
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
Nyströmformer Model with a multiple choice classification head on top (a linear layer on top of the pooled output
and a softmax) e.g. for RocStories/SWAG tasks.
"""
# 使用 Nyströmformer 模型,并在其顶部添加一个多选分类头部(即在池化输出之上的线性层和 softmax),例如用于 RocStories/SWAG 任务。
class NystromformerForMultipleChoice(NystromformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Nyströmformer backbone
self.nystromformer = NystromformerModel(config)
# Linear layer used as a pre-classifier
self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
# Final classification layer producing one score per choice
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(
NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass: takes the usual model inputs and returns multiple-choice classification outputs
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# the docstring below documents these parameters and the return value
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
# Fall back to config.use_return_dict when return_dict is None
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# num_choices is the size of the second dimension of input_ids (or inputs_embeds)
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten input_ids to (batch_size * num_choices, seq_len)
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
# Flatten attention_mask to (batch_size * num_choices, seq_len)
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
# Flatten token_type_ids to (batch_size * num_choices, seq_len)
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
# Flatten position_ids to (batch_size * num_choices, seq_len)
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
# Flatten inputs_embeds to (batch_size * num_choices, seq_len, dim)
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# Forward pass through the Nyströmformer backbone
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states from the backbone
hidden_state = outputs[0]  # (bs * num_choices, seq_len, dim)
# Pooled output: keep only the representation of the first token of each sequence
pooled_output = hidden_state[:, 0]  # (bs * num_choices, dim)
# Linear transformation via the pre-classifier
pooled_output = self.pre_classifier(pooled_output)  # (bs * num_choices, dim)
# ReLU activation
pooled_output = nn.ReLU()(pooled_output)  # (bs * num_choices, dim)
# Final logits from the classifier
logits = self.classifier(pooled_output)
# Reshape logits to (batch_size, num_choices)
reshaped_logits = logits.view(-1, num_choices)
# Compute the loss
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# When return_dict is False, return a flat output tuple
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# When return_dict is True, return a MultipleChoiceModelOutput
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
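The reshaping above is the standard multiple-choice bookkeeping: choices are flattened into the batch dimension for the backbone and the per-choice scores are regrouped afterwards. A tiny shape sketch with made-up sizes:

```python
import torch

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (8, 16): one row per (example, choice)
logits = torch.randn(flat_input_ids.size(0), 1)           # one score per flattened choice
reshaped_logits = logits.view(-1, num_choices)            # (2, 4): scores regrouped per example
print(reshaped_logits.shape)
```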
@add_start_docstrings(
"""
Nyströmformer Model with a token classification head on top (a linear layer on top of the hidden-states output)
e.g. for Named-Entity-Recognition (NER) tasks.
""",
NYSTROMFORMER_START_DOCSTRING,
)
class NystromformerForTokenClassification(NystromformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels from the config
self.num_labels = config.num_labels
# Nyströmformer backbone
self.nystromformer = NystromformerModel(config)
# Dropout layer
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Linear classifier mapping hidden states to the label space
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# Fall back to config.use_return_dict when return_dict is None
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Pass the inputs through the Nystromformer backbone and collect its outputs
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Sequence output (last hidden states) of the base model
sequence_output = outputs[0]
# Apply dropout to the sequence output
sequence_output = self.dropout(sequence_output)
# Pass the dropout output to the classifier to get per-token logits
logits = self.classifier(sequence_output)
# Loss defaults to None
loss = None
# Compute a cross-entropy loss when labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
# Flatten logits and labels before computing the loss
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# When not returning a dict, assemble a plain tuple
if not return_dict:
output = (logits,) + outputs[1:]  # include any additional outputs
return ((loss,) + output) if loss is not None else output
# Otherwise wrap the outputs in a TokenClassifierOutput
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Nyströmformer Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
NYSTROMFORMER_START_DOCSTRING,
)
class NystromformerForQuestionAnswering(NystromformerPreTrainedModel):
def __init__(self, config):
# Call the parent class initializer
super().__init__(config)
# Two classification labels: start position and end position
config.num_labels = 2
self.num_labels = config.num_labels
# Nyströmformer backbone and the question-answering output layer
self.nystromformer = NystromformerModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
are not taken into account for computing the loss.
"""
# Decide whether to return a dict based on `return_dict`, falling back to the config default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the inputs through the Nystromformer model
outputs = self.nystromformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the sequence output from the Nystromformer outputs
sequence_output = outputs[0]
# Compute the question-answering logits from the sequence output
logits = self.qa_outputs(sequence_output)
# Split the logits along the last dimension into start_logits and end_logits
start_logits, end_logits = logits.split(1, dim=-1)
# Drop the trailing dimension so start_logits and end_logits have shape (batch_size, sequence_length)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
total_loss = None
# If start_positions and end_positions are given, compute the loss
if start_positions is not None and end_positions is not None:
# On multi-GPU setups an extra dimension may be added; squeeze it away here
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Ignore start/end positions that fall outside the model input
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# Cross-entropy loss that ignores targets equal to ignored_index
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
# Total loss is the average of the start and end losses
total_loss = (start_loss + end_loss) / 2
# If a dict-style output is not requested, return the logits (and loss, if any) as a tuple
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
# Otherwise build and return a QuestionAnsweringModelOutput
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
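The QA head above is a single linear layer that produces two logits per token; splitting and squeezing yields start and end scores, and the training loss averages two cross entropies. A small sketch with dummy tensors (shapes and positions are invented for illustration):

```
import torch
from torch import nn
from torch.nn import CrossEntropyLoss

batch_size, seq_len, hidden_size = 2, 10, 16
sequence_output = torch.randn(batch_size, seq_len, hidden_size)

qa_outputs = nn.Linear(hidden_size, 2)
logits = qa_outputs(sequence_output)          # (batch, seq_len, 2)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)       # (batch, seq_len)
end_logits = end_logits.squeeze(-1)

# Inference: most likely span boundaries per example
start_idx, end_idx = start_logits.argmax(-1), end_logits.argmax(-1)

# Training: average of the two cross-entropy losses
start_positions = torch.tensor([1, 3])
end_positions = torch.tensor([4, 6])
loss_fct = CrossEntropyLoss(ignore_index=seq_len)
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
print(start_idx, end_idx, total_loss.item())
```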
.\models\nystromformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_nystromformer": ["NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "NystromformerConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nystromformer"] = [
"NYSTROMFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"NystromformerForMaskedLM",
"NystromformerForMultipleChoice",
"NystromformerForQuestionAnswering",
"NystromformerForSequenceClassification",
"NystromformerForTokenClassification",
"NystromformerLayer",
"NystromformerModel",
"NystromformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_nystromformer import NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, NystromformerConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nystromformer import (
NYSTROMFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
NystromformerForMaskedLM,
NystromformerForMultipleChoice,
NystromformerForQuestionAnswering,
NystromformerForSequenceClassification,
NystromformerForTokenClassification,
NystromformerLayer,
NystromformerModel,
NystromformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
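`_LazyModule` replaces the package module in `sys.modules` so that submodules such as `modeling_nystromformer` are only imported when one of their exported names is first accessed. A toy version of that idea (not the real `_LazyModule`, just a PEP 562 `__getattr__` sketch for a hypothetical package) could look like:

```
# lazy_pkg/__init__.py -- hypothetical toy package illustrating lazy loading
import importlib

_import_structure = {"heavy_module": ["HeavyClass"]}
_name_to_module = {name: mod for mod, names in _import_structure.items() for name in names}

def __getattr__(name):
    # Import the submodule only when one of its exported names is requested
    if name in _name_to_module:
        module = importlib.import_module(f".{_name_to_module[name]}", __name__)
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```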
.\models\oneformer\configuration_oneformer.py
"""OneFormer模型配置"""
from typing import Dict, Optional
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
ONEFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/oneformer_ade20k_swin_tiny": (
"https://huggingface.co/shi-labs/oneformer_ade20k_swin_tiny/blob/main/config.json"
),
}
class OneFormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`OneFormerModel`]. It is used to instantiate a OneFormer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a configuration similar to that of the OneFormer [shi-labs/oneformer_ade20k_swin_tiny] architecture,
which was trained on the ADE20k-150 dataset.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import OneFormerConfig, OneFormerModel
>>> # Initializing a OneFormer shi-labs/oneformer_ade20k_swin_tiny configuration
>>> configuration = OneFormerConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = OneFormerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "oneformer"
attribute_map = {"hidden_size": "hidden_dim"}
def __init__(
self,
backbone_config: Optional[Dict] = None,
backbone: Optional[str] = None,
use_pretrained_backbone: bool = False,
use_timm_backbone: bool = False,
backbone_kwargs: Optional[Dict] = None,
ignore_value: int = 255,
num_queries: int = 150,
no_object_weight: int = 0.1,
class_weight: float = 2.0,
mask_weight: float = 5.0,
dice_weight: float = 5.0,
contrastive_weight: float = 0.5,
contrastive_temperature: float = 0.07,
train_num_points: int = 12544,
oversample_ratio: float = 3.0,
importance_sample_ratio: float = 0.75,
init_std: float = 0.02,
init_xavier_std: float = 1.0,
layer_norm_eps: float = 1e-05,
is_training: bool = False,
use_auxiliary_loss: bool = True,
output_auxiliary_logits: bool = True,
strides: Optional[list] = [4, 8, 16, 32],
task_seq_len: int = 77,
text_encoder_width: int = 256,
text_encoder_context_length: int = 77,
text_encoder_num_layers: int = 6,
text_encoder_vocab_size: int = 49408,
text_encoder_proj_layers: int = 2,
text_encoder_n_ctx: int = 16,
conv_dim: int = 256,
mask_dim: int = 256,
hidden_dim: int = 256,
encoder_feedforward_dim: int = 1024,
norm: str = "GN",
encoder_layers: int = 6,
decoder_layers: int = 10,
use_task_norm: bool = True,
num_attention_heads: int = 8,
dropout: float = 0.1,
dim_feedforward: int = 2048,
pre_norm: bool = False,
enforce_input_proj: bool = False,
query_dec_layers: int = 2,
common_stride: int = 4,
**kwargs,
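Note the `attribute_map = {"hidden_size": "hidden_dim"}` defined above: reading `config.hidden_size` transparently returns the `hidden_dim` value. A short usage sketch (the overridden parameter values are arbitrary):

```
from transformers import OneFormerConfig

config = OneFormerConfig(num_queries=100, hidden_dim=256, decoder_layers=10)
print(config.hidden_size)  # 256 -- mapped to `hidden_dim` via `attribute_map`
print(config.num_queries)  # 100
```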
.\models\oneformer\convert_to_hf_oneformer.py
"""Convert OneFormer checkpoints from the original repository. URL: https://github.com/SHI-Labs/OneFormer"""
import os
import sys
from argparse import ArgumentParser
from dataclasses import dataclass
from pathlib import Path
from pprint import pformat
from typing import Any, Dict, Iterator, List, Set, Tuple
import requests
import torch
import torchvision.transforms as T
from PIL import Image
from torch import Tensor, nn
try:
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog
from detectron2.projects.deeplab import add_deeplab_config
except ImportError:
pass
from transformers import CLIPTokenizer, DinatConfig, SwinConfig
from transformers.models.oneformer.image_processing_oneformer import OneFormerImageProcessor
from transformers.models.oneformer.modeling_oneformer import (
OneFormerConfig,
OneFormerForUniversalSegmentation,
OneFormerForUniversalSegmentationOutput,
OneFormerModel,
OneFormerModelOutput,
)
from transformers.models.oneformer.processing_oneformer import OneFormerProcessor
from transformers.utils import logging
StateDict = Dict[str, Tensor]
logging.set_verbosity_info()
logger = logging.get_logger()
torch.manual_seed(0)
class TrackedStateDict:
def __init__(self, to_track: Dict):
"""This class "tracks" a python dictionary by keeping track of which item is accessed.
Args:
to_track (Dict): The dictionary we wish to track
"""
self.to_track = to_track
self._seen: Set[str] = set()
def __getitem__(self, key: str) -> Any:
return self.to_track[key]
def __setitem__(self, key: str, item: Any):
self._seen.add(key)
self.to_track[key] = item
def diff(self) -> List[str]:
"""This method returns a set difference between the keys in the tracked state dict and the one we have access so far.
This is an effective method to check if we have update all the keys
Returns:
List[str]: List of keys not yet updated
"""
return set(self.to_track.keys()) - self._seen
def copy(self) -> Dict:
return self.to_track.copy()
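Using the `TrackedStateDict` above, the converter can tell which destination keys were never written: only `__setitem__` marks a key as seen, so `diff()` returns every key that still needs to be copied. For example, with a made-up three-parameter state dict:

```
import torch

dst = TrackedStateDict({"a.weight": torch.zeros(2), "a.bias": torch.zeros(2), "b.weight": torch.zeros(2)})
dst["a.weight"] = torch.ones(2)  # marks "a.weight" as seen
dst["a.bias"] = torch.ones(2)    # marks "a.bias" as seen
print(dst.diff())                # {"b.weight"} -- the key not yet updated
```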
def prepare_img():
url = "https://praeclarumjj3.github.io/files/coco.jpeg"
img_data = requests.get(url, stream=True).raw
im = Image.open(img_data)
return im
@dataclass
class Args:
"""Fake command line arguments needed by oneformer/detectron2 implementation"""
config_file: str
def setup_cfg(args: Args):
cfg = get_cfg()
add_deeplab_config(cfg)
add_common_config(cfg)
add_oneformer_config(cfg)
add_swin_config(cfg)
add_dinat_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.freeze()
return cfg
class OriginalOneFormerConfigToOursConverter:
class OriginalOneFormerConfigToProcessorConverter:
def __call__(self, original_config: object, model_repo: str) -> OneFormerProcessor:
model = original_config.MODEL
model_input = original_config.INPUT
dataset_catalog = MetadataCatalog.get(original_config.DATASETS.TEST_PANOPTIC[0])
if "ade20k" in model_repo:
class_info_file = "ade20k_panoptic.json"
elif "coco" in model_repo:
class_info_file = "coco_panoptic.json"
elif "cityscapes" in model_repo:
class_info_file = "cityscapes_panoptic.json"
else:
raise ValueError("Invalid Dataset!")
image_processor = OneFormerImageProcessor(
image_mean=(torch.tensor(model.PIXEL_MEAN) / 255).tolist(),
image_std=(torch.tensor(model.PIXEL_STD) / 255).tolist(),
size=model_input.MIN_SIZE_TEST,
max_size=model_input.MAX_SIZE_TEST,
num_labels=model.SEM_SEG_HEAD.NUM_CLASSES,
ignore_index=dataset_catalog.ignore_label,
class_info_file=class_info_file,
)
tokenizer = CLIPTokenizer.from_pretrained(model_repo)
return OneFormerProcessor(
image_processor=image_processor,
tokenizer=tokenizer,
task_seq_length=original_config.INPUT.TASK_SEQ_LEN,
max_seq_length=original_config.INPUT.MAX_SEQ_LEN,
)
class OriginalOneFormerCheckpointToOursConverter:
def __init__(self, original_model: nn.Module, config: OneFormerConfig):
self.original_model = original_model
self.config = config
def pop_all(self, renamed_keys: List[Tuple[str, str]], dst_state_dict: StateDict, src_state_dict: StateDict):
for src_key, dst_key in renamed_keys:
dst_state_dict[dst_key] = src_state_dict.pop(src_key)
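`pop_all` is the converter's basic move: given `(src_key, dst_key)` pairs, it pulls tensors out of the original state dict and writes them into the destination under the new name, so anything left in `src_state_dict` at the end was never mapped. A standalone sketch of the same logic on toy dicts (key names are illustrative):

```
import torch

src_state_dict = {"task_mlp.layers.0.weight": torch.randn(4, 4), "unused.bias": torch.randn(4)}
dst_state_dict = {}
renamed_keys = [("task_mlp.layers.0.weight", "task_encoder.task_mlp.layers.0.0.weight")]

for src_key, dst_key in renamed_keys:
    dst_state_dict[dst_key] = src_state_dict.pop(src_key)

print(list(dst_state_dict))  # the renamed key now lives in the destination dict
print(list(src_state_dict))  # ["unused.bias"] -- keys that were never copied
```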
def replace_keys_qkv_transformer_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "transformer_module.decoder.layers"
src_prefix: str = "sem_seg_head.predictor"
for i in range(self.config.decoder_layers - 1):
in_proj_weight = src_state_dict.pop(
f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_weight"
)
in_proj_bias = src_state_dict.pop(
f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.in_proj_bias"
)
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.q_proj.bias"] = in_proj_bias[:256]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.k_proj.bias"] = in_proj_bias[256:512]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
dst_state_dict[f"{dst_prefix}.{i}.self_attn.self_attn.v_proj.bias"] = in_proj_bias[-256:]
def replace_task_mlp(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "task_encoder"
src_prefix: str = "task_mlp"
def rename_keys_for_weight_bias(src_prefix: str, dst_prefix: str):
return [
(f"{src_prefix}.weight", f"{dst_prefix}.weight"),
(f"{src_prefix}.bias", f"{dst_prefix}.bias"),
]
renamed_keys = []
for i in range(2):
renamed_keys.extend(
rename_keys_for_weight_bias(f"{src_prefix}.layers.{i}", f"{dst_prefix}.task_mlp.layers.{i}.0")
)
self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
def replace_text_projector(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "text_mapper.text_projector"
src_prefix: str = "text_projector"
def rename_keys_for_weight_bias(src_prefix: str, dst_prefix: str):
return [
(f"{src_prefix}.weight", f"{dst_prefix}.weight"),
(f"{src_prefix}.bias", f"{dst_prefix}.bias"),
]
renamed_keys = []
for i in range(self.config.text_encoder_config["text_encoder_proj_layers"]):
renamed_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.layers.{i}", f"{dst_prefix}.{i}.0"))
self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
def replace_text_mapper(self, dst_state_dict: StateDict, src_state_dict: StateDict):
dst_prefix: str = "text_mapper.text_encoder"
src_prefix: str = "text_encoder"
self.replace_text_projector(dst_state_dict, src_state_dict)
def rename_keys_for_weight_bias(src_prefix: str, dst_prefix: str):
return [
(f"{src_prefix}.weight", f"{dst_prefix}.weight"),
(f"{src_prefix}.bias", f"{dst_prefix}.bias"),
]
def rename_keys_for_attn(src_prefix: str, dst_prefix: str):
attn_keys = [
(f"{src_prefix}.in_proj_bias", f"{dst_prefix}.in_proj_bias"),
(f"{src_prefix}.in_proj_weight", f"{dst_prefix}.in_proj_weight"),
]
attn_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.out_proj", f"{dst_prefix}.out_proj"))
return attn_keys
def rename_keys_for_layer(src_prefix: str, dst_prefix: str):
resblock_keys = []
resblock_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.mlp.c_fc", f"{dst_prefix}.mlp.fc1"))
resblock_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.mlp.c_proj", f"{dst_prefix}.mlp.fc2"))
resblock_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.ln_1", f"{dst_prefix}.layer_norm1"))
resblock_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.ln_2", f"{dst_prefix}.layer_norm2"))
resblock_keys.extend(rename_keys_for_attn(f"{src_prefix}.attn", f"{dst_prefix}.self_attn"))
return resblock_keys
renamed_keys = [
("prompt_ctx.weight", "text_mapper.prompt_ctx.weight"),
]
renamed_keys.extend(
[
(f"{src_prefix}.positional_embedding", f"{dst_prefix}.positional_embedding"),
(f"{src_prefix}.token_embedding.weight", f"{dst_prefix}.token_embedding.weight"),
]
)
renamed_keys.extend(rename_keys_for_weight_bias(f"{src_prefix}.ln_final", f"{dst_prefix}.ln_final"))
for i in range(self.config.text_encoder_config["text_encoder_num_layers"]):
renamed_keys.extend(
rename_keys_for_layer(
f"{src_prefix}.transformer.resblocks.{i}", f"{dst_prefix}.transformer.layers.{i}"
)
)
self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
def convert(self, oneformer: OneFormerModel, is_swin: bool) -> OneFormerModel:
dst_state_dict = TrackedStateDict(oneformer.state_dict())
src_state_dict = self.original_model.state_dict()
self.replace_pixel_module(dst_state_dict, src_state_dict, is_swin)
self.replace_transformer_module(dst_state_dict, src_state_dict)
self.replace_task_mlp(dst_state_dict, src_state_dict)
if self.config.is_training:
self.replace_text_mapper(dst_state_dict, src_state_dict)
logger.info(f"Missed keys are {pformat(dst_state_dict.diff())}")
logger.info(f"Not copied keys are {pformat(src_state_dict.keys())}")
logger.info("🙌 Done")
oneformer.load_state_dict(dst_state_dict)
return oneformer
@staticmethod
def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[Tuple[object, Path, Path]]:
checkpoints: List[Path] = checkpoints_dir.glob("**/*.pth")
for checkpoint in checkpoints:
logger.info(f"💪 Converting {checkpoint.stem}")
config: Path = config_dir / f"{checkpoint.stem}.yaml"
yield config, checkpoint
def post_process_sem_seg_output(outputs: OneFormerForUniversalSegmentationOutput, target_size: Tuple[int, int]):
class_queries_logits = outputs.class_queries_logits
masks_queries_logits = outputs.masks_queries_logits
if target_size is not None:
masks_queries_logits = torch.nn.functional.interpolate(
masks_queries_logits,
size=target_size,
mode="bilinear",
align_corners=False,
)
masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
masks_probs = masks_queries_logits.sigmoid()
segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
return segmentation
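The post-processing above combines per-query class probabilities (`bqc`) with per-query mask probabilities (`bqhw`) into a dense per-class map (`bchw`); a per-pixel argmax over the class dimension then yields the semantic segmentation. With random tensors of made-up sizes:

```
import torch

batch, queries, classes, h, w = 1, 150, 20, 64, 64
masks_classes = torch.rand(batch, queries, classes).softmax(-1)
masks_probs = torch.rand(batch, queries, h, w).sigmoid()

segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
semantic_map = segmentation.argmax(dim=1)  # (batch, h, w) class index per pixel
print(segmentation.shape, semantic_map.shape)
```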
def test(
original_model,
our_model: OneFormerForUniversalSegmentation,
processor: OneFormerProcessor,
model_repo: str,
):
def _preprocess_text(text_list=None, max_length=77):
if text_list is None:
raise ValueError("tokens cannot be None.")
tokens = tokenizer(text_list, padding="max_length", max_length=max_length, truncation=True)
attention_masks, input_ids = tokens["attention_mask"], tokens["input_ids"]
token_inputs = []
for attn_mask, input_id in zip(attention_masks, input_ids):
token = torch.tensor(attn_mask) * torch.tensor(input_id)
token_inputs.append(token.unsqueeze(0))
token_inputs = torch.cat(token_inputs, dim=0)
return token_inputs
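`_preprocess_text` relies on the fact that multiplying the token ids by the attention mask zeroes out every padding position, producing the kind of input the original OneFormer text encoder expects. A tiny example (the ids are invented):

```
import torch

input_ids = torch.tensor([[49406, 320, 1125, 49407, 49407, 49407]])
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0]])

token_inputs = attention_mask * input_ids  # -> [[49406, 320, 1125, 49407, 0, 0]]
print(token_inputs)
```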
with torch.no_grad():
tokenizer = CLIPTokenizer.from_pretrained(model_repo)
original_model = original_model.eval()
our_model = our_model.eval()
im = prepare_img()
tr = T.Compose(
[
T.Resize((640, 640)),
T.ToTensor(),
T.Normalize(
mean=torch.tensor([123.675, 116.280, 103.530]) / 255.0,
std=torch.tensor([58.395, 57.120, 57.375]) / 255.0,
),
],
)
x = tr(im).unsqueeze(0)
task_input = ["the task is semantic"]
task_token = _preprocess_text(task_input, max_length=processor.task_seq_length)
original_model_backbone_features = original_model.backbone(x.clone())
our_model_output: OneFormerModelOutput = our_model.model(x.clone(), task_token, output_hidden_states=True)
for original_model_feature, our_model_feature in zip(
original_model_backbone_features.values(), our_model_output.encoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=3e-3
), "The backbone features are not the same."
mask_features, _, multi_scale_features, _, _ = original_model.sem_seg_head.pixel_decoder.forward_features(
original_model_backbone_features
)
original_pixel_decoder_features = []
original_pixel_decoder_features.append(mask_features)
for i in range(len(multi_scale_features)):
original_pixel_decoder_features.append(multi_scale_features[i])
for original_model_feature, our_model_feature in zip(
original_pixel_decoder_features, our_model_output.pixel_decoder_hidden_states
):
assert torch.allclose(
original_model_feature, our_model_feature, atol=3e-4
), "The pixel decoder feature are not the same"
tr_complete = T.Compose(
[
T.Resize((640, 640)),
T.ToTensor(),
],
)
y = (tr_complete(im) * 255.0).to(torch.int).float()
original_model_out = original_model([{"image": y.clone(), "task": "The task is semantic"}])
original_segmentation = original_model_out[0]["sem_seg"]
our_model_out: OneFormerForUniversalSegmentationOutput = our_model(
x.clone(), task_token, output_hidden_states=True
)
our_segmentation = post_process_sem_seg_output(our_model_out, target_size=(640, 640))[0]
assert torch.allclose(
original_segmentation, our_segmentation, atol=1e-3
), "The segmentation image is not the same."
logger.info("✅ Test passed!")
def get_name(checkpoint_file: Path):
model_name_raw: str = checkpoint_file.stem
backbone = "swin" if "swin" in model_name_raw else "dinat"
dataset = ""
if "coco" in model_name_raw:
dataset = "coco"
elif "ade20k" in model_name_raw:
dataset = "ade20k"
elif "cityscapes" in model_name_raw:
dataset = "cityscapes"
else:
raise ValueError(
f"{model_name_raw} must be wrong since we didn't find 'coco' or 'ade20k' or 'cityscapes' in it "
)
backbone_types = ["tiny", "large"]
backbone_type = list(filter(lambda x: x in model_name_raw, backbone_types))[0]
model_name = f"oneformer_{dataset}_{backbone}_{backbone_type}"
return model_name
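`get_name` derives the Hub model name purely from the checkpoint filename, so the `.pth` file must mention the backbone (`swin`/`dinat`), the dataset, and a backbone size (`tiny` or `large`). For a hypothetical file name:

```
from pathlib import Path

print(get_name(Path("oneformer_ade20k_swin_tiny.pth")))  # -> "oneformer_ade20k_swin_tiny"
```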
if __name__ == "__main__":
parser = ArgumentParser(
description=(
"Command line to convert the original oneformer models (with swin backbone) to transformers"
" implementation."
)
)
parser.add_argument(
"--checkpoints_dir",
type=Path,
help=(
"A directory containing the model's checkpoints. The directory has to have the following structure:"
" <DIR_NAME>/<DATASET_NAME>/<CONFIG_NAME>.pth; where <CONFIG_NAME> name must follow the"
" following nomenclature: oneformer_<DATASET_NAME>_<BACKBONE>_<BACKBONE_TYPE>"
),
)
parser.add_argument(
"--configs_dir",
type=Path,
help=(
"A directory containing the model's configs, see detectron2 doc. The directory has to have the following"
" structure: <DIR_NAME>/<DATASET_NAME>/<CONFIG_NAME>.yaml; where <CONFIG_NAME> name must follow the"
" following nomenclature: oneformer_<DATASET_NAME>_<BACKBONE>_<BACKBONE_TYPE>"
),
)
parser.add_argument(
"--pytorch_dump_folder_path",
required=True,
type=Path,
help="Path to the folder to output PyTorch models.",
)
parser.add_argument(
"--oneformer_dir",
required=True,
type=Path,
help=(
"A path to OneFormer's original implementation directory. You can download from here: "
"https://github.com/SHI-Labs/OneFormer"
),
)
args = parser.parse_args()
checkpoints_dir: Path = args.checkpoints_dir
config_dir: Path = args.configs_dir
save_directory: Path = args.pytorch_dump_folder_path
oneformer_dir: Path = args.oneformer_dir
if not save_directory.exists():
save_directory.mkdir(parents=True)
for config_file, checkpoint_file in OriginalOneFormerCheckpointToOursConverter.using_dirs(
checkpoints_dir, config_dir
):
processor = OriginalOneFormerConfigToProcessorConverter()(
setup_cfg(Args(config_file=config_file)), os.path.join("shi-labs", config_file.stem)
)
original_config = setup_cfg(Args(config_file=config_file))
oneformer_kwargs = OriginalOneFormer.from_config(original_config)
original_model = OriginalOneFormer(**oneformer_kwargs).eval()
DetectionCheckpointer(original_model).load(str(checkpoint_file))
is_swin = "swin" in config_file.stem
config: OneFormerConfig = OriginalOneFormerConfigToOursConverter()(original_config, is_swin)
oneformer = OneFormerModel(config=config).eval()
converter = OriginalOneFormerCheckpointToOursConverter(original_model, config)
oneformer = converter.convert(oneformer, is_swin)
oneformer_for_universal_segmentation = OneFormerForUniversalSegmentation(config=config).eval()
oneformer_for_universal_segmentation.model = oneformer
test(
original_model,
oneformer_for_universal_segmentation,
processor,
os.path.join("shi-labs", config_file.stem),
)
model_name = get_name(checkpoint_file)
logger.info(f"🪄 Saving {model_name}")
processor.save_pretrained(save_directory / model_name)
oneformer_for_universal_segmentation.save_pretrained(save_directory / model_name)
processor.push_to_hub(
repo_id=os.path.join("shi-labs", config_file.stem),
commit_message="Add configs",
use_temp_dir=True,
)
oneformer_for_universal_segmentation.push_to_hub(
repo_id=os.path.join("shi-labs", config_file.stem),
commit_message="Add model",
use_temp_dir=True,
)
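Putting it together, the script is driven entirely by the four arguments defined above; a hypothetical invocation (all paths are placeholders) might look like:

```
python convert_to_hf_oneformer.py \
    --checkpoints_dir ./checkpoints \
    --configs_dir ./configs \
    --pytorch_dump_folder_path ./converted_models \
    --oneformer_dir ./OneFormer
```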