Transformers Source Code Analysis (Part 102)
.\models\segformer\modeling_segformer.py
""" PyTorch SegFormer model."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput, SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_segformer import SegformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SegformerConfig"
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
_EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16]
_IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"nvidia/segformer-b0-finetuned-ade-512-512",
]
class SegFormerImageClassifierOutput(ImageClassifierOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if `config.num_labels==1`) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if `config.num_labels==1`) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
            plus one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden
            states (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average
            in the self-attention heads.
    """
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
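# A small sketch of what drop_path does (values chosen for illustration, not from the original file):
# in eval mode it is the identity, while in training mode each sample of the batch is either zeroed out
# or rescaled by 1 / keep_prob, so the expected value of the residual branch is preserved.
import torch

x = torch.ones(4, 3, 8, 8)
print(drop_path(x, drop_prob=0.5, training=False).equal(x))  # True: identity outside training
dropped = drop_path(x, drop_prob=0.5, training=True)
print([dropped[i].max().item() for i in range(4)])           # each sample is all 0.0 or all 2.0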
class SegformerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class SegformerOverlapPatchEmbeddings(nn.Module):
"""Construct the overlapping patch embeddings."""
def __init__(self, patch_size, stride, num_channels, hidden_size):
super().__init__()
self.proj = nn.Conv2d(
num_channels,
hidden_size,
kernel_size=patch_size,
stride=stride,
padding=patch_size // 2,
)
self.layer_norm = nn.LayerNorm(hidden_size)
def forward(self, pixel_values):
embeddings = self.proj(pixel_values)
_, _, height, width = embeddings.shape
embeddings = embeddings.flatten(2).transpose(1, 2)
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
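# A hedged shape check for the overlapping patch embedding, using the first-stage values of the default
# (MiT-b0 style) configuration as an assumption: a 7x7 convolution with stride 4 and padding 3 maps a
# (1, 3, 512, 512) image to a (1, 128*128, 32) token sequence.
import torch

patch_embeddings = SegformerOverlapPatchEmbeddings(patch_size=7, stride=4, num_channels=3, hidden_size=32)
tokens, height, width = patch_embeddings(torch.randn(1, 3, 512, 512))
print(tokens.shape, height, width)  # torch.Size([1, 16384, 32]) 128 128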
class SegformerEfficientSelfAttention(nn.Module):
"""SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
paper](https://arxiv.org/abs/2102.12122)."""
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
super().__init__()
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
f"heads ({self.num_attention_heads})"
)
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(self.hidden_size, self.all_head_size)
self.key = nn.Linear(self.hidden_size, self.all_head_size)
self.value = nn.Linear(self.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.sr_ratio = sequence_reduction_ratio
if sequence_reduction_ratio > 1:
self.sr = nn.Conv2d(
hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
)
self.layer_norm = nn.LayerNorm(hidden_size)
def transpose_for_scores(self, hidden_states):
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
hidden_states = hidden_states.view(new_shape)
return hidden_states.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
height,
width,
output_attentions=False,
):
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.sr_ratio > 1:
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = self.sr(hidden_states)
hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
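# A hedged sketch of the sequence reduction: with sequence_reduction_ratio=8 (the first-stage default is
# assumed here), the keys and values are computed from an 8x spatially downsampled copy of the input, so
# the attention matrix is (seq_len x seq_len/64) instead of (seq_len x seq_len).
import torch
from transformers import SegformerConfig

config = SegformerConfig()
attention = SegformerEfficientSelfAttention(config, hidden_size=32, num_attention_heads=1, sequence_reduction_ratio=8)
context, probs = attention(torch.randn(1, 128 * 128, 32), height=128, width=128, output_attentions=True)
print(context.shape)  # torch.Size([1, 16384, 32])
print(probs.shape)    # torch.Size([1, 1, 16384, 256]): 16384 queries attend to only 256 reduced positions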
class SegformerSelfOutput(nn.Module):
def __init__(self, config, hidden_size):
super().__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class SegformerAttention(nn.Module):
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
super().__init__()
self.self = SegformerEfficientSelfAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
)
self.output = SegformerSelfOutput(config, hidden_size=hidden_size)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, height, width, output_attentions=False):
self_outputs = self.self(hidden_states, height, width, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class SegformerDWConv(nn.Module):
def __init__(self, dim=768):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
def forward(self, hidden_states, height, width):
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
hidden_states = self.dwconv(hidden_states)
hidden_states = hidden_states.flatten(2).transpose(1, 2)
return hidden_states
class SegformerMixFFN(nn.Module):
def __init__(self, config, in_features, hidden_features=None, out_features=None):
super().__init__()
out_features = out_features or in_features
self.dense1 = nn.Linear(in_features, hidden_features)
self.dwconv = SegformerDWConv(hidden_features)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dense2 = nn.Linear(hidden_features, out_features)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, height, width):
hidden_states = self.dense1(hidden_states)
hidden_states = self.dwconv(hidden_states, height, width)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense2(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
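# A hedged sketch of the Mix-FFN (first-stage sizes assumed): the 3x3 depthwise convolution between the two
# linear layers mixes neighbouring tokens and provides positional information, which is why SegFormer needs
# no explicit positional embeddings.
import torch
from transformers import SegformerConfig

config = SegformerConfig()
mix_ffn = SegformerMixFFN(config, in_features=32, hidden_features=4 * 32)
out = mix_ffn(torch.randn(1, 128 * 128, 32), height=128, width=128)
print(out.shape)  # torch.Size([1, 16384, 32]): same shape as the input, ready for the residual connection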
class SegformerLayer(nn.Module):
"""This corresponds to the Block class in the original implementation."""
def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
super().__init__()
self.layer_norm_1 = nn.LayerNorm(hidden_size)
self.attention = SegformerAttention(
config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
)
self.drop_path = SegformerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.layer_norm_2 = nn.LayerNorm(hidden_size)
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
def forward(self, hidden_states, height, width, output_attentions=False):
self_attention_outputs = self.attention(
self.layer_norm_1(hidden_states),
height,
width,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
mlp_output = self.drop_path(mlp_output)
layer_output = mlp_output + hidden_states
outputs = (layer_output,) + outputs
return outputs
class SegformerEncoder(nn.Module):
    def __init__(self, config):
super().__init__()
self.config = config
drop_path_decays = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
embeddings = []
for i in range(config.num_encoder_blocks):
embeddings.append(
SegformerOverlapPatchEmbeddings(
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
)
)
self.patch_embeddings = nn.ModuleList(embeddings)
blocks = []
cur = 0
for i in range(config.num_encoder_blocks):
layers = []
if i != 0:
cur += config.depths[i - 1]
for j in range(config.depths[i]):
layers.append(
SegformerLayer(
config,
hidden_size=config.hidden_sizes[i],
num_attention_heads=config.num_attention_heads[i],
drop_path=drop_path_decays[cur + j],
sequence_reduction_ratio=config.sr_ratios[i],
mlp_ratio=config.mlp_ratios[i],
)
)
blocks.append(nn.ModuleList(layers))
self.block = nn.ModuleList(blocks)
self.layer_norm = nn.ModuleList(
[nn.LayerNorm(config.hidden_sizes[i]) for i in range(config.num_encoder_blocks)]
)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = pixel_values.shape[0]
hidden_states = pixel_values
for idx, x in enumerate(zip(self.patch_embeddings, self.block, self.layer_norm)):
embedding_layer, block_layer, norm_layer = x
hidden_states, height, width = embedding_layer(hidden_states)
for i, blk in enumerate(block_layer):
layer_outputs = blk(hidden_states, height, width, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
hidden_states = norm_layer(hidden_states)
if idx != len(self.patch_embeddings) - 1 or (
idx == len(self.patch_embeddings) - 1 and self.config.reshape_last_stage
):
hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
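# A hedged end-to-end check of the hierarchical encoder, with the default (MiT-b0 style) configuration as
# an assumption: four stages downsample the input by 4x, 8x, 16x and 32x while widening the channels.
import torch
from transformers import SegformerConfig

config = SegformerConfig()
encoder = SegformerEncoder(config)
with torch.no_grad():
    out = encoder(torch.randn(1, 3, 512, 512), output_hidden_states=True)
print([tuple(h.shape) for h in out.hidden_states])
# e.g. [(1, 32, 128, 128), (1, 64, 64, 64), (1, 160, 32, 32), (1, 256, 16, 16)]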
class SegformerPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.

    Attributes:
        config_class (SegformerConfig): Configuration class defining the parameters of the model.
        base_model_prefix (str): Prefix used when naming the base model.
        main_input_name (str): Name of the main input expected by the model.
    """

    config_class = SegformerConfig
    base_model_prefix = "segformer"
    main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights of the given module."""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)

@add_start_docstrings(
    "The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
    SEGFORMER_START_DOCSTRING,
)
class SegformerModel(SegformerPreTrainedModel):
    def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = SegformerEncoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
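# A hedged usage sketch of SegformerModel with the nvidia/mit-b0 checkpoint referenced above; a random
# tensor stands in for a preprocessed image (SegformerImageProcessor would normally produce it), and the
# last hidden state matches the documented shape (1, 256, 16, 16) for a 512x512 input.
import torch
from transformers import SegformerModel

model = SegformerModel.from_pretrained("nvidia/mit-b0")
pixel_values = torch.randn(1, 3, 512, 512)  # stand-in for processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 256, 16, 16])
print(len(outputs.hidden_states))       # 4, one feature map per stage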
@add_start_docstrings(
"""
SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden
states) e.g. for ImageNet.
""",
SEGFORMER_START_DOCSTRING,
)
class SegformerForImageClassification(SegformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.segformer = SegformerModel(config)
self.classifier = nn.Linear(config.hidden_sizes[-1], config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=SegFormerImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SegFormerImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.segformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
batch_size = sequence_output.shape[0]
if self.config.reshape_last_stage:
sequence_output = sequence_output.permute(0, 2, 3, 1)
sequence_output = sequence_output.reshape(batch_size, -1, self.config.hidden_sizes[-1])
sequence_output = sequence_output.mean(dim=1)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SegFormerImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
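# A hedged usage sketch for image classification with the _IMAGE_CLASS_CHECKPOINT above; the final feature
# map is flattened and mean-pooled before the linear classifier, so the logits have shape
# (batch_size, config.num_labels), typically (1, 1000) for the ImageNet head of nvidia/mit-b0.
import torch
from transformers import SegformerForImageClassification

model = SegformerForImageClassification.from_pretrained("nvidia/mit-b0")
pixel_values = torch.randn(1, 3, 512, 512)  # stand-in for a preprocessed image
with torch.no_grad():
    logits = model(pixel_values=pixel_values).logits
print(logits.shape)                                   # e.g. torch.Size([1, 1000])
print(model.config.id2label[int(logits.argmax(-1))])  # predicted class name (arbitrary here, the input is random)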
class SegformerMLP(nn.Module):
"""
Linear Embedding.
"""
def __init__(self, config: SegformerConfig, input_dim):
super().__init__()
self.proj = nn.Linear(input_dim, config.decoder_hidden_size)
def forward(self, hidden_states: torch.Tensor):
hidden_states = hidden_states.flatten(2).transpose(1, 2)
hidden_states = self.proj(hidden_states)
return hidden_states
class SegformerDecodeHead(SegformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
mlps = []
for i in range(config.num_encoder_blocks):
mlp = SegformerMLP(config, input_dim=config.hidden_sizes[i])
mlps.append(mlp)
self.linear_c = nn.ModuleList(mlps)
self.linear_fuse = nn.Conv2d(
in_channels=config.decoder_hidden_size * config.num_encoder_blocks,
out_channels=config.decoder_hidden_size,
kernel_size=1,
bias=False,
)
self.batch_norm = nn.BatchNorm2d(config.decoder_hidden_size)
self.activation = nn.ReLU()
self.dropout = nn.Dropout(config.classifier_dropout_prob)
self.classifier = nn.Conv2d(config.decoder_hidden_size, config.num_labels, kernel_size=1)
self.config = config
def forward(self, encoder_hidden_states: torch.FloatTensor) -> torch.Tensor:
batch_size = encoder_hidden_states[-1].shape[0]
all_hidden_states = ()
for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c):
if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3:
height = width = int(math.sqrt(encoder_hidden_state.shape[-1]))
encoder_hidden_state = (
encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
)
height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3]
encoder_hidden_state = mlp(encoder_hidden_state)
encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1)
encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width)
encoder_hidden_state = nn.functional.interpolate(
encoder_hidden_state, size=encoder_hidden_states[0].size()[2:], mode="bilinear", align_corners=False
)
all_hidden_states += (encoder_hidden_state,)
hidden_states = self.linear_fuse(torch.cat(all_hidden_states[::-1], dim=1))
hidden_states = self.batch_norm(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
return logits
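# A hedged sketch of the all-MLP decode head (default config values and 150 ADE20k-style labels assumed):
# each stage's feature map is projected to decoder_hidden_size, upsampled to the stage-1 resolution,
# concatenated and fused by a 1x1 convolution before the per-pixel classifier.
import torch
from transformers import SegformerConfig

config = SegformerConfig(num_labels=150)
decode_head = SegformerDecodeHead(config)
features = [
    torch.randn(1, 32, 128, 128),
    torch.randn(1, 64, 64, 64),
    torch.randn(1, 160, 32, 32),
    torch.randn(1, 256, 16, 16),
]
with torch.no_grad():
    print(decode_head(features).shape)  # torch.Size([1, 150, 128, 128]): logits at 1/4 of the input resolution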
@add_start_docstrings(
"""SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""",
SEGFORMER_START_DOCSTRING,
)
class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.segformer = SegformerModel(config)
self.decode_head = SegformerDecodeHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\segformer\modeling_tf_segformer.py
from __future__ import annotations
import math
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...file_utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ...modeling_tf_outputs import TFBaseModelOutput, TFSemanticSegmenterOutput, TFSequenceClassifierOutput
from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import logging
from .configuration_segformer import SegformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SegformerConfig"
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
_EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16]
_IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"nvidia/segformer-b0-finetuned-ade-512-512",
]
class TFSegformerDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
(1) github.com:rwightman/pytorch-image-models
"""
def __init__(self, drop_path: float, **kwargs):
super().__init__(**kwargs)
self.drop_path = drop_path
def call(self, x: tf.Tensor, training=None):
if training:
keep_prob = 1 - self.drop_path
shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
random_tensor = tf.floor(random_tensor)
return (x / keep_prob) * random_tensor
return x
class TFSegformerOverlapPatchEmbeddings(keras.layers.Layer):
"""Construct the overlapping patch embeddings."""
def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs):
super().__init__(**kwargs)
self.padding = keras.layers.ZeroPadding2D(padding=patch_size // 2)
self.proj = keras.layers.Conv2D(
filters=hidden_size, kernel_size=patch_size, strides=stride, padding="VALID", name="proj"
)
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm")
self.num_channels = num_channels
self.hidden_size = hidden_size
def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]:
embeddings = self.proj(self.padding(pixel_values))
height = shape_list(embeddings)[1]
width = shape_list(embeddings)[2]
hidden_dim = shape_list(embeddings)[3]
embeddings = tf.reshape(embeddings, (-1, height * width, hidden_dim))
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, None, self.num_channels])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.hidden_size])
class TFSegformerEfficientSelfAttention(keras.layers.Layer):
    """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the
    [PvT paper](https://arxiv.org/abs/2102.12122)."""

    def call(
        self,
        hidden_states: tf.Tensor,
height: int,
width: int,
output_attentions: bool = False,
training: bool = False,
) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]:
batch_size = shape_list(hidden_states)[0]
num_channels = shape_list(hidden_states)[2]
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.sr_ratio > 1:
hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
hidden_states = self.sr(hidden_states)
hidden_states = tf.reshape(hidden_states, (batch_size, -1, num_channels))
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
scale = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
attention_scores = tf.divide(attention_scores, scale)
attention_probs = stable_softmax(logits=attention_scores, axis=-1)
attention_probs = self.dropout(attention_probs, training=training)
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size))
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.hidden_size])
if getattr(self, "sr", None) is not None:
with tf.name_scope(self.sr.name):
self.sr.build([None, None, None, self.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.hidden_size])
class TFSegformerSelfOutput(keras.layers.Layer):
def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(hidden_size, name="dense")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.hidden_size = hidden_size
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.hidden_size])
class TFSegformerAttention(keras.layers.Layer):
def __init__(
self,
config: SegformerConfig,
hidden_size: int,
num_attention_heads: int,
sequence_reduction_ratio: int,
**kwargs,
):
super().__init__(**kwargs)
self.self = TFSegformerEfficientSelfAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
name="self",
)
self.dense_output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output")
def call(
self, hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False
) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]:
self_outputs = self.self(hidden_states, height, width, output_attentions)
attention_output = self.dense_output(self_outputs[0])
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFSegformerDWConv(keras.layers.Layer):
def __init__(self, dim: int = 768, **kwargs):
super().__init__(**kwargs)
self.depthwise_convolution = keras.layers.Conv2D(
filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv"
)
self.dim = dim
def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor:
batch_size = shape_list(hidden_states)[0]
num_channels = shape_list(hidden_states)[-1]
hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
hidden_states = self.depthwise_convolution(hidden_states)
new_height = shape_list(hidden_states)[1]
new_width = shape_list(hidden_states)[2]
num_channels = shape_list(hidden_states)[3]
hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels))
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "depthwise_convolution", None) is not None:
with tf.name_scope(self.depthwise_convolution.name):
self.depthwise_convolution.build([None, None, None, self.dim])
class TFSegformerMixFFN(keras.layers.Layer):
def __init__(
self,
config: SegformerConfig,
in_features: int,
hidden_features: int = None,
out_features: int = None,
**kwargs,
):
super().__init__(**kwargs)
out_features = out_features or in_features
self.dense1 = keras.layers.Dense(hidden_features, name="dense1")
self.depthwise_convolution = TFSegformerDWConv(hidden_features, name="dwconv")
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.dense2 = keras.layers.Dense(out_features, name="dense2")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.hidden_features = hidden_features
self.in_features = in_features
def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
hidden_states = self.dense1(hidden_states)
hidden_states = self.depthwise_convolution(hidden_states, height, width)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.dense2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense1", None) is not None:
with tf.name_scope(self.dense1.name):
self.dense1.build([None, None, self.in_features])
if getattr(self, "depthwise_convolution", None) is not None:
with tf.name_scope(self.depthwise_convolution.name):
self.depthwise_convolution.build(None)
if getattr(self, "dense2", None) is not None:
with tf.name_scope(self.dense2.name):
self.dense2.build([None, None, self.hidden_features])
class TFSegformerLayer(keras.layers.Layer):
"""This corresponds to the Block class in the original implementation."""
def __init__(
self,
config,
hidden_size: int,
num_attention_heads: int,
drop_path: float,
sequence_reduction_ratio: int,
mlp_ratio: int,
**kwargs,
):
super().__init__(**kwargs)
self.layer_norm_1 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1")
self.attention = TFSegformerAttention(
config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
name="attention",
)
self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else keras.layers.Activation("linear")
self.layer_norm_2 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2")
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp")
self.hidden_size = hidden_size
def call(
self,
hidden_states: tf.Tensor,
height: int,
width: int,
output_attentions: bool = False,
training: bool = False,
) -> Tuple:
self_attention_outputs = self.attention(
self.layer_norm_1(hidden_states),
height,
width,
output_attentions=output_attentions,
training=training,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output, training=training)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
mlp_output = self.drop_path(mlp_output, training=training)
layer_output = mlp_output + hidden_states
outputs = (layer_output,) + outputs
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm_1", None) is not None:
with tf.name_scope(self.layer_norm_1.name):
self.layer_norm_1.build([None, None, self.hidden_size])
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm_2", None) is not None:
with tf.name_scope(self.layer_norm_2.name):
self.layer_norm_2.build([None, None, self.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
class TFSegformerEncoder(keras.layers.Layer):
def __init__(self, config: SegformerConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
drop_path_decays = [x.numpy() for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
embeddings = []
for i in range(config.num_encoder_blocks):
embeddings.append(
TFSegformerOverlapPatchEmbeddings(
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
name=f"patch_embeddings.{i}",
)
)
self.embeddings = embeddings
blocks = []
cur = 0
for i in range(config.num_encoder_blocks):
layers = []
if i != 0:
cur += config.depths[i - 1]
for j in range(config.depths[i]):
layers.append(
TFSegformerLayer(
config,
hidden_size=config.hidden_sizes[i],
num_attention_heads=config.num_attention_heads[i],
drop_path=drop_path_decays[cur + j],
sequence_reduction_ratio=config.sr_ratios[i],
mlp_ratio=config.mlp_ratios[i],
name=f"block.{i}.{j}",
)
)
blocks.append(layers)
self.block = blocks
self.layer_norms = [
keras.layers.LayerNormalization(epsilon=1e-05, name=f"layer_norm.{i}")
for i in range(config.num_encoder_blocks)
]
def call(
self,
pixel_values: tf.Tensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
training: bool = False,
) -> Union[Tuple, TFBaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = shape_list(pixel_values)[0]
hidden_states = pixel_values
for idx, x in enumerate(zip(self.embeddings, self.block, self.layer_norms)):
embedding_layer, block_layer, norm_layer = x
hidden_states, height, width = embedding_layer(hidden_states)
for i, blk in enumerate(block_layer):
layer_outputs = blk(
hidden_states,
height,
width,
output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
hidden_states = norm_layer(hidden_states)
if idx != len(self.embeddings) - 1 or (idx == len(self.embeddings) - 1 and self.config.reshape_last_stage):
num_channels = shape_list(hidden_states)[-1]
hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norms", None) is not None:
for layer, shape in zip(self.layer_norms, self.config.hidden_sizes):
with tf.name_scope(layer.name):
layer.build([None, None, shape])
if getattr(self, "block", None) is not None:
for block in self.block:
for layer in block:
with tf.name_scope(layer.name):
layer.build(None)
if getattr(self, "embeddings", None) is not None:
for layer in self.embeddings:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFSegformerMainLayer(keras.layers.Layer):
config_class = SegformerConfig
def __init__(self, config: SegformerConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.encoder = TFSegformerEncoder(config, name="encoder")
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[Tuple, TFBaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
encoder_outputs = self.encoder(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = encoder_outputs[0]
sequence_output = tf.transpose(sequence_output, perm=[0, 3, 1, 2])
if output_hidden_states:
hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]])
if not return_dict:
if tf.greater(len(encoder_outputs[1:]), 0):
transposed_encoder_outputs = tuple(tf.transpose(v, perm=[0, 3, 1, 2]) for v in encoder_outputs[1:][0])
return (sequence_output,) + (transposed_encoder_outputs,)
else:
return (sequence_output,) + encoder_outputs[1:]
return TFBaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
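# A hedged sketch of the channel-format handling (default config assumed): the main layer accepts NCHW
# pixel values like the PyTorch model, transposes them to NHWC for the Keras convolutions, and transposes
# the outputs back to NCHW so both frameworks expose the same shapes.
import tensorflow as tf
from transformers import SegformerConfig

main_layer = TFSegformerMainLayer(SegformerConfig())
outputs = main_layer(tf.random.normal((1, 3, 512, 512)))
print(outputs.last_hidden_state.shape)  # (1, 256, 16, 16)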
class TFSegformerPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SegformerConfig
base_model_prefix = "segformer"
main_input_name = "pixel_values"
@property
def input_signature(self):
return {"pixel_values": tf.TensorSpec(shape=(None, self.config.num_channels, 512, 512), dtype=tf.float32)}
"""
定义了 SEGFORMER_START_DOCSTRING,包含了关于模型继承和参数配置的详细描述文档。
"""
SEGFORMER_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
Parameters:
config ([`SegformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
"""
定义了 SEGFORMER_INPUTS_DOCSTRING,包含了模型输入参数的详细描述文档。
"""
SEGFORMER_INPUTS_DOCSTRING = r"""
Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`SegformerImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
"""
添加了模型描述的文档字符串,并调用了 `add_start_docstrings` 装饰器,将模型简介和参数文档串联起来。
"""
@add_start_docstrings(
"The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
SEGFORMER_START_DOCSTRING,
)
class TFSegformerModel(TFSegformerPreTrainedModel):
"""
TFSegformerModel 类继承自 TFSegformerPreTrainedModel,表示一个基础的 SegFormer 编码器(混合Transformer),
输出未经特定顶部处理的原始隐藏状态。
Args:
config (SegformerConfig): 包含模型所有参数的配置类。使用配置文件初始化时,不会加载与模型关联的权重,只会加载配置。
查看 `~TFPreTrainedModel.from_pretrained` 方法以加载模型权重。
"""
def __init__(self, config: SegformerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.config = config
self.segformer = TFSegformerMainLayer(config, name="segformer")
"""
添加了文档字符串到模型的前向传播方法,描述了输入参数的详细信息。
"""
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def call(
self,
pixel_values: tf.Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[Tuple, TFBaseModelOutput]:
outputs = self.segformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
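# A hedged usage sketch of TFSegformerModel, mirroring the PyTorch example above; `from_pt=True` may be
# needed if the hub repository only ships PyTorch weights for this checkpoint.
import tensorflow as tf
from transformers import TFSegformerModel

model = TFSegformerModel.from_pretrained("nvidia/mit-b0")
outputs = model(pixel_values=tf.random.normal((1, 3, 512, 512)))
print(outputs.last_hidden_state.shape)  # (1, 256, 16, 16)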
"""
SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden
states) e.g. for ImageNet.
"""
class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: SegformerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.segformer = TFSegformerMainLayer(config, name="segformer")
self.classifier = keras.layers.Dense(config.num_labels, name="classifier")
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def call(
self,
pixel_values: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TFSequenceClassifierOutput]:
outputs = self.segformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
batch_size = shape_list(sequence_output)[0]
sequence_output = tf.transpose(sequence_output, perm=[0, 2, 3, 1])
sequence_output = tf.reshape(sequence_output, (batch_size, -1, self.config.hidden_sizes[-1]))
sequence_output = tf.reduce_mean(sequence_output, axis=1)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
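# A hedged usage sketch of the TF classification head: the final feature map is mean-pooled over the spatial
# positions before the dense classifier, so the logits have shape (batch_size, config.num_labels).
# `from_pt=True` may be needed if the hub repository only ships PyTorch weights.
import tensorflow as tf
from transformers import TFSegformerForImageClassification

model = TFSegformerForImageClassification.from_pretrained("nvidia/mit-b0")
logits = model(pixel_values=tf.random.normal((1, 3, 512, 512))).logits
print(logits.shape)  # e.g. (1, 1000) for the ImageNet head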
"""
    def __init__(self, input_dim: int, config: SegformerConfig, **kwargs):
        super().__init__(**kwargs)  # call the parent constructor, forwarding any extra keyword arguments
        self.proj = keras.layers.Dense(config.decoder_hidden_size, name="proj")  # dense layer used for the projection
        self.input_dim = input_dim  # record the input dimension

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        height = shape_list(hidden_states)[1]  # height of the hidden_states tensor
        width = shape_list(hidden_states)[2]  # width of the hidden_states tensor
        hidden_dim = shape_list(hidden_states)[-1]  # last (hidden) dimension of the hidden_states tensor
        hidden_states = tf.reshape(hidden_states, (-1, height * width, hidden_dim))  # flatten the spatial dimensions
        hidden_states = self.proj(hidden_states)  # apply the projection
        return hidden_states  # return the projected hidden states

    def build(self, input_shape=None):
        if self.built:  # if the layer has already been built, return immediately
            return
        self.built = True  # mark the layer as built
        if getattr(self, "proj", None) is not None:  # if the projection layer exists
            with tf.name_scope(self.proj.name):  # use the projection layer's name as the name scope
                self.proj.build([None, None, self.input_dim])  # build the projection layer with its input dimension
# Decode head for the Segformer model, inheriting from TFSegformerPreTrainedModel.
class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
    def __init__(self, config: SegformerConfig, **kwargs):
        super().__init__(config, **kwargs)
        # one MLP per encoder block, each projecting that stage's features to decoder_hidden_size
        mlps = []
        for i in range(config.num_encoder_blocks):
            mlp = TFSegformerMLP(config=config, input_dim=config.hidden_sizes[i], name=f"linear_c.{i}")
            mlps.append(mlp)
        self.mlps = mlps
        # fusion layer (the ConvModule of the original implementation): a 1x1 convolution over the concatenated features
        self.linear_fuse = keras.layers.Conv2D(
            filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse"
        )
        # batch normalization, ReLU activation and dropout applied after the fusion
        self.batch_norm = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="batch_norm")
        self.activation = keras.layers.Activation("relu")
        self.dropout = keras.layers.Dropout(config.classifier_dropout_prob)
        # final 1x1 convolution producing one output channel per label
        self.classifier = keras.layers.Conv2D(filters=config.num_labels, kernel_size=1, name="classifier")
        self.config = config
    def call(self, encoder_hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        all_hidden_states = ()
        for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.mlps):
            # if the last stage was not reshaped, the hidden state is still (batch, seq_len, channels);
            # recover the spatial layout before processing it
            if self.config.reshape_last_stage is False and len(shape_list(encoder_hidden_state)) == 3:
                height = tf.math.sqrt(tf.cast(shape_list(encoder_hidden_state)[1], tf.float32))
                height = width = tf.cast(height, tf.int32)
                channel_dim = shape_list(encoder_hidden_state)[-1]
                encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim))
            # move the channel dimension to the last axis (NHWC)
            encoder_hidden_state = tf.transpose(encoder_hidden_state, perm=[0, 2, 3, 1])
            height, width = shape_list(encoder_hidden_state)[1:3]
            # project this stage's features to the decoder hidden size
            encoder_hidden_state = mlp(encoder_hidden_state)
            channel_dim = shape_list(encoder_hidden_state)[-1]
            encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim))
            # upsample to the resolution of the first (highest-resolution) stage
            temp_state = tf.transpose(encoder_hidden_states[0], perm=[0, 2, 3, 1])
            upsample_resolution = shape_list(temp_state)[1:-1]
            encoder_hidden_state = tf.image.resize(encoder_hidden_state, size=upsample_resolution, method="bilinear")
            all_hidden_states += (encoder_hidden_state,)
        # concatenate all stages (in reverse order) and fuse them with the 1x1 convolution
        hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=-1))
        hidden_states = self.batch_norm(hidden_states, training=training)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        # logits of shape (batch_size, height/4, width/4, num_labels)
        logits = self.classifier(hidden_states)
        return logits
    def build(self, input_shape=None):
        # avoid building the layer twice
        if self.built:
            return
        self.built = True
        # build the fusion convolution on the concatenated channel dimension
        if getattr(self, "linear_fuse", None) is not None:
            with tf.name_scope(self.linear_fuse.name):
                self.linear_fuse.build(
                    [None, None, None, self.config.decoder_hidden_size * self.config.num_encoder_blocks]
                )
        # build the batch normalization layer
        if getattr(self, "batch_norm", None) is not None:
            with tf.name_scope(self.batch_norm.name):
                self.batch_norm.build([None, None, None, self.config.decoder_hidden_size])
        # build the classifier
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, None, self.config.decoder_hidden_size])
        # build each per-stage MLP (no specific input shape is required)
        if getattr(self, "mlps", None) is not None:
            for layer in self.mlps:
                with tf.name_scope(layer.name):
                    layer.build(None)
# SegFormer Model transformer with an all-MLP decode head on top, e.g. for ADE20k or CityScapes.
# Inherits from TFSegformerPreTrainedModel.
class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
    def __init__(self, config: SegformerConfig, **kwargs):
        super().__init__(config, **kwargs)
        # SegFormer main layer and the all-MLP decode head
        self.segformer = TFSegformerMainLayer(config, name="segformer")
        self.decode_head = TFSegformerDecodeHead(config, name="decode_head")

    def hf_compute_loss(self, logits, labels):
        # upsample the logits to the original image size; `labels` has shape (batch_size, height, width)
        label_interp_shape = shape_list(labels)[1:]
        upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear")
        # per-pixel sparse cross-entropy, masked afterwards
        loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

        def masked_loss(real, pred):
            unmasked_loss = loss_fct(real, pred)
            # mask out pixels whose label equals self.config.semantic_loss_ignore_index
            mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype)
            masked_loss = unmasked_loss * mask
            # average the loss over the unmasked pixels only
            reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask)
            return tf.reshape(reduced_masked_loss, (1,))

        return masked_loss(labels, upsampled_logits)
@unpack_inputs
# Add the standard forward docstring, formatted for "batch_size, sequence_length" inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# Document the return type as TFSemanticSegmenterOutput, using the _CONFIG_FOR_DOC config class
@replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Fall back to the config defaults when return_dict / output_hidden_states are not provided
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
outputs = self.segformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=True, # we need the intermediate hidden states
return_dict=return_dict,
)
# The segformer call above runs the encoder on pixel_values and always returns the
# intermediate hidden states, which the decode head consumes below
encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
# Compute the segmentation logits from the encoder hidden states
logits = self.decode_head(encoder_hidden_states)
loss = None
if labels is not None:
if not self.config.num_labels > 1:
raise ValueError("The number of labels should be greater than one")
else:
loss = self.hf_compute_loss(logits=logits, labels=labels)
# Transpose the logits to (batch_size, num_labels, height, width) to keep the API consistent
logits = tf.transpose(logits, perm=[0, 3, 1, 2])
if not return_dict:
if output_hidden_states:
output = (logits,) + outputs[1:]
else:
output = (logits,) + outputs[2:]
# When return_dict is False, return the logits plus either the hidden states or the attentions,
# prepending the loss when it was computed
return ((loss,) + output) if loss is not None else output
return TFSemanticSegmenterOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)
# Build method: creates the sub-layers the first time the model is built
def build(self, input_shape=None):
# Return early if the model has already been built
if self.built:
return
self.built = True
# Build the SegFormer main layer, if present
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
# Build the decode head, if present
if getattr(self, "decode_head", None) is not None:
with tf.name_scope(self.decode_head.name):
self.decode_head.build(None)
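For reference, a typical usage sketch of the TensorFlow model (assuming TensorFlow is installed and the `nvidia/segformer-b0-finetuned-ade-512-512` checkpoint can be downloaded):

```python
import requests
from PIL import Image
from transformers import SegformerImageProcessor, TFSegformerForSemanticSegmentation

processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="tf")
outputs = model(**inputs)
print(outputs.logits.shape)  # (1, num_labels, height / 4, width / 4)
```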
.\models\segformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_segformer": ["SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegformerConfig", "SegformerOnnxConfig"]
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_segformer"] = ["SegformerFeatureExtractor"]
_import_structure["image_processing_segformer"] = ["SegformerImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_segformer"] = [
"SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"SegformerDecodeHead",
"SegformerForImageClassification",
"SegformerForSemanticSegmentation",
"SegformerLayer",
"SegformerModel",
"SegformerPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_segformer"] = [
"TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFSegformerDecodeHead",
"TFSegformerForImageClassification",
"TFSegformerForSemanticSegmentation",
"TFSegformerModel",
"TFSegformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig, SegformerOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_segformer import SegformerFeatureExtractor
from .image_processing_segformer import SegformerImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_segformer import (
SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
SegformerDecodeHead,
SegformerForImageClassification,
SegformerForSemanticSegmentation,
SegformerLayer,
SegformerModel,
SegformerPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_segformer import (
TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TFSegformerDecodeHead,
TFSegformerForImageClassification,
TFSegformerForSemanticSegmentation,
TFSegformerModel,
TFSegformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
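A small sketch of what the lazy module buys: importing the package only registers the import structure above, and a heavy submodule (and its backend, e.g. TensorFlow) is imported only when one of its exported names is first accessed:

```python
# Importing the package is cheap; nothing heavy is loaded yet.
from transformers.models import segformer

config_cls = segformer.SegformerConfig  # triggers the import of configuration_segformer
# Accessing segformer.TFSegformerModel would import modeling_tf_segformer (and TensorFlow)
# only at that point, assuming TensorFlow is installed.
```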
.\models\seggpt\configuration_seggpt.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"BAAI/seggpt-vit-large": "https://huggingface.co/BAAI/seggpt-vit-large/resolve/main/config.json",
}
class SegGptConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SegGptModel`]. It is used to instantiate a SegGPT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the SegGPT
[BAAI/seggpt-vit-large](https://huggingface.co/BAAI/seggpt-vit-large) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If a string,
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
image_size (`List[int]`, *optional*, defaults to `[896, 448]`):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries, keys and values.
mlp_dim (`int`, *optional*):
The dimensionality of the MLP layer in the Transformer encoder. If unset, defaults to `hidden_size * 4`.
drop_path_rate (`float`, *optional*, defaults to 0.1):
The drop path rate for the dropout layers.
pretrain_image_size (`int`, *optional*, defaults to 224):
The pretrained size of the absolute position embeddings.
decoder_hidden_size (`int`, *optional*, defaults to 64):
Hidden size for decoder.
use_relative_position_embeddings (`bool`, *optional*, defaults to `True`):
Whether to use relative position embeddings in the attention layers.
merge_index (`int`, *optional*, defaults to 2):
The index of the encoder layer to merge the embeddings.
intermediate_hidden_state_indices (`List[int]`, *optional*, defaults to `[5, 11, 17, 23]`):
The indices of the encoder layers which we store as features for the decoder.
beta (`float`, *optional*, defaults to 0.01):
Regularization factor for SegGptLoss (smooth-l1 loss).
Example:
```
>>> from transformers import SegGptConfig, SegGptModel
>>> # Initializing a SegGPT seggpt-vit-large style configuration
>>> configuration = SegGptConfig()
>>> # Initializing a model (with random weights) from the seggpt-vit-large style configuration
>>> model = SegGptModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
.\models\seggpt\convert_seggpt_to_hf.py
"""Convert SegGPT checkpoints from the original repository.
URL: https://github.com/baaivision/Painter/tree/main/SegGPT
"""
import argparse
import requests
import torch
from PIL import Image
from transformers import SegGptConfig, SegGptForImageSegmentation, SegGptImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("patch_embed.proj.weight", "model.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("patch_embed.proj.bias", "model.embeddings.patch_embeddings.projection.bias"))
rename_keys.append(("mask_token", "model.embeddings.mask_token"))
rename_keys.append(("segment_token_x", "model.embeddings.segment_token_input"))
rename_keys.append(("segment_token_y", "model.embeddings.segment_token_prompt"))
rename_keys.append(("type_token_cls", "model.embeddings.type_token_semantic"))
rename_keys.append(("type_token_ins", "model.embeddings.type_token_instance"))
rename_keys.append(("pos_embed", "model.embeddings.position_embeddings"))
rename_keys.append(("norm.weight", "model.encoder.layernorm.weight"))
rename_keys.append(("norm.bias", "model.encoder.layernorm.bias"))
rename_keys.append(("decoder_embed.weight", "decoder.decoder_embed.weight"))
rename_keys.append(("decoder_embed.bias", "decoder.decoder_embed.bias"))
rename_keys.append(("decoder_pred.0.weight", "decoder.decoder_pred.conv.weight"))
rename_keys.append(("decoder_pred.0.bias", "decoder.decoder_pred.conv.bias"))
rename_keys.append(("decoder_pred.1.weight", "decoder.decoder_pred.layernorm.weight"))
rename_keys.append(("decoder_pred.1.bias", "decoder.decoder_pred.layernorm.bias"))
rename_keys.append(("decoder_pred.3.weight", "decoder.decoder_pred.head.weight"))
rename_keys.append(("decoder_pred.3.bias", "decoder.decoder_pred.head.bias"))
for i in range(config.num_hidden_layers):
rename_keys.append((f"blocks.{i}.attn.qkv.weight", f"model.encoder.layers.{i}.attention.qkv.weight"))
rename_keys.append((f"blocks.{i}.attn.qkv.bias", f"model.encoder.layers.{i}.attention.qkv.bias"))
rename_keys.append((f"blocks.{i}.attn.proj.weight", f"model.encoder.layers.{i}.attention.proj.weight"))
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"model.encoder.layers.{i}.attention.proj.bias"))
rename_keys.append((f"blocks.{i}.attn.rel_pos_h", f"model.encoder.layers.{i}.attention.rel_pos_h"))
rename_keys.append((f"blocks.{i}.attn.rel_pos_w", f"model.encoder.layers.{i}.attention.rel_pos_w"))
rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"model.encoder.layers.{i}.mlp.lin1.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"model.encoder.layers.{i}.mlp.lin1.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"model.encoder.layers.{i}.mlp.lin2.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"model.encoder.layers.{i}.mlp.lin2.bias"))
rename_keys.append((f"blocks.{i}.norm1.weight", f"model.encoder.layers.{i}.layernorm_before.weight"))
rename_keys.append((f"blocks.{i}.norm1.bias", f"model.encoder.layers.{i}.layernorm_before.bias"))
rename_keys.append((f"blocks.{i}.norm2.weight", f"model.encoder.layers.{i}.layernorm_after.weight"))
rename_keys.append((f"blocks.{i}.norm2.bias", f"model.encoder.layers.{i}.layernorm_after.bias"))
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_input():
image_input_url = (
"https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
)
image_prompt_url = (
"https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
)
mask_prompt_url = (
"https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"
)
image_input = Image.open(requests.get(image_input_url, stream=True).raw)
image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw)
return image_input, image_prompt, mask_prompt
@torch.no_grad()
def convert_seggpt_checkpoint(args):
model_name = args.model_name
pytorch_dump_folder_path = args.pytorch_dump_folder_path
verify_logits = args.verify_logits
push_to_hub = args.push_to_hub
config = SegGptConfig()
checkpoint_url = "https://huggingface.co/BAAI/SegGpt/blob/main/seggpt_vit_large.pth"
original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
new_state_dict = original_state_dict.copy()
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(new_state_dict, src, dest)
model = SegGptForImageSegmentation(config)
model.eval()
missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)
input_img, prompt_img, prompt_mask = prepare_input()
image_processor = SegGptImageProcessor()
inputs = image_processor(images=input_img, prompt_images=prompt_img, prompt_masks=prompt_mask, return_tensors="pt")
expected_prompt_pixel_values = torch.tensor(
[
[[-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965]],
[[1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583]],
[[2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088]],
]
)
expected_pixel_values = torch.tensor(
[
[[1.6324, 1.6153, 1.5810], [1.6153, 1.5982, 1.5810], [1.5810, 1.5639, 1.5639]],
[[1.2731, 1.2556, 1.2206], [1.2556, 1.2381, 1.2031], [1.2206, 1.2031, 1.1681]],
[[1.6465, 1.6465, 1.6465], [1.6465, 1.6465, 1.6465], [1.6291, 1.6291, 1.6291]],
]
)
expected_prompt_masks = torch.tensor(
[
[[-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179]],
[[-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357]],
[[-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044]],
]
)
assert torch.allclose(inputs.pixel_values[0, :, :3, :3], expected_pixel_values, atol=1e-4)
assert torch.allclose(inputs.prompt_pixel_values[0, :, :3, :3], expected_prompt_pixel_values, atol=1e-4)
assert torch.allclose(inputs.prompt_masks[0, :, :3, :3], expected_prompt_masks, atol=1e-4)
torch.manual_seed(2)
outputs = model(**inputs)
print(outputs)
if verify_logits:
expected_output = torch.tensor(
[
[[-2.1208, -2.1190, -2.1198], [-2.1237, -2.1228, -2.1227], [-2.1232, -2.1226, -2.1228]],
[[-2.0405, -2.0396, -2.0403], [-2.0434, -2.0434, -2.0433], [-2.0428, -2.0432, -2.0434]],
[[-1.8102, -1.8088, -1.8099], [-1.8131, -1.8126, -1.8129], [-1.8130, -1.8128, -1.8131]],
]
)
assert torch.allclose(outputs.pred_masks[0, :, :3, :3], expected_output, atol=1e-4)
print("Looks good!")
else:
print("Converted without verifying logits")
if pytorch_dump_folder_path is not None:
print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
image_processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing model and processor for {model_name} to hub")
model.push_to_hub(f"EduardoPacheco/{model_name}")
image_processor.push_to_hub(f"EduardoPacheco/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="seggpt-vit-large",
type=str,
choices=["seggpt-vit-large"],
help="Name of the SegGpt model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--verify_logits",
action="store_false",
help="Whether or not to verify the logits against the original implementation.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_seggpt_checkpoint(args)
.\models\seggpt\image_processing_seggpt.py
"""Image processor class for SegGPT."""
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_channel_dimension_axis,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import TensorType, is_torch_available, logging, requires_backends
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
def build_palette(num_labels: int) -> List[Tuple[int, int]]:
base = int(num_labels ** (1 / 3)) + 1
margin = 256 // base
color_list = [(0, 0, 0)]
for location in range(num_labels):
num_seq_r = location // base**2
num_seq_g = (location % base**2) // base
num_seq_b = location % base
R = 255 - num_seq_r * margin
G = 255 - num_seq_g * margin
B = 255 - num_seq_b * margin
color_list.append((R, G, B))
return color_list
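A quick check of the palette construction (a sketch; the values follow directly from the formula above):

```python
# For num_labels=2: base = int(2 ** (1 / 3)) + 1 = 2 and margin = 256 // 2 = 128
print(build_palette(2))
# [(0, 0, 0), (255, 255, 255), (255, 255, 127)]
# index 0 is the reserved background colour; class colours count down from white in steps of `margin`
```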
def get_num_channels(image: np.ndarray, input_data_format: ChannelDimension) -> int:
if image.ndim == 2:
return 0
channel_idx = get_channel_dimension_axis(image, input_data_format)
return image.shape[channel_idx]
def mask_to_rgb(
mask: np.ndarray,
palette: Optional[List[Tuple[int, int]]] = None,
input_data_format: Optional[ChannelDimension] = None,
data_format: Optional[ChannelDimension] = None,
) -> np.ndarray:
if input_data_format is None and mask.ndim > 2:
input_data_format = infer_channel_dimension_format(mask)
data_format = data_format if data_format is not None else input_data_format
num_channels = get_num_channels(mask, input_data_format)
if num_channels == 3:
return to_channel_dimension_format(mask, data_format, input_data_format) if data_format is not None else mask
if palette is not None:
height, width = mask.shape
rgb_mask = np.zeros((3, height, width), dtype=np.uint8)
classes_in_mask = np.unique(mask)
for class_idx in classes_in_mask:
rgb_value = palette[class_idx]
class_mask = (mask == class_idx).astype(np.uint8)
class_mask = np.expand_dims(class_mask, axis=-1)
class_rgb_mask = class_mask * np.array(rgb_value)
class_rgb_mask = np.moveaxis(class_rgb_mask, -1, 0)
rgb_mask += class_rgb_mask.astype(np.uint8)
rgb_mask = np.clip(rgb_mask, 0, 255).astype(np.uint8)
else:
rgb_mask = np.repeat(mask[None, ...], 3, axis=0)
return (
to_channel_dimension_format(rgb_mask, data_format, input_data_format) if data_format is not None else rgb_mask
)
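A minimal sketch of the conversion on a 2x2 single-channel mask with a hypothetical two-entry palette:

```python
import numpy as np

mask = np.array([[0, 1], [1, 0]], dtype=np.uint8)
palette = [(0, 0, 0), (255, 255, 127)]  # hypothetical colours for background and class 1
rgb = mask_to_rgb(mask, palette=palette)
print(rgb.shape)  # (3, 2, 2): class-1 pixels become (255, 255, 127), background stays black
```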
r"""
Constructs a SegGpt image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
size (`dict`, *optional*, defaults to `{"height": 448, "width": 448}`):
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 448, "width": 448}
size = get_size_dict(size)
self.do_resize = do_resize
self.do_rescale = do_rescale
self.do_normalize = do_normalize
self.size = size
self.resample = resample
self.rescale_factor = rescale_factor
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
def get_palette(self, num_labels: int) -> List[Tuple[int, int]]:
"""Build a palette to map the prompt mask from a single channel to a 3 channel RGB.
Args:
num_labels (`int`):
Number of classes in the segmentation task (excluding the background).
Returns:
`List[Tuple[int, int]]`: Palette to map the prompt mask from a single channel to a 3 channel RGB.
"""
return build_palette(num_labels)
def mask_to_rgb(
self,
image: np.ndarray,
palette: Optional[List[Tuple[int, int]]] = None,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Convert a mask to RGB format.
Args:
image (`np.ndarray`):
Mask to convert to RGB format. If the mask is already in RGB format, it will be passed through.
palette (`List[Tuple[int, int]]`, *optional*, defaults to `None`):
Palette to use to convert the mask to RGB format. If unset, the mask is duplicated across the channel
dimension.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
Returns:
`np.ndarray`: The mask in RGB format.
"""
return mask_to_rgb(
image,
palette=palette,
data_format=data_format,
input_data_format=input_data_format,
)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def _preprocess_step(
self,
images: ImageInput,
is_mask: bool = False,
do_resize: Optional[bool] = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
num_labels: Optional[int] = None,
**kwargs,
):
pass
def preprocess(
self,
images: Optional[ImageInput] = None,
prompt_images: Optional[ImageInput] = None,
prompt_masks: Optional[ImageInput] = None,
do_resize: Optional[bool] = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
num_labels: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
pass
def post_process_semantic_segmentation(
self, outputs, target_sizes: Optional[List[Tuple[int, int]]] = None, num_labels: Optional[int] = None
):
pass
.\models\seggpt\modeling_seggpt.py
""" PyTorch SegGpt model."""
import collections.abc
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import functional as F
from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_seggpt import SegGptConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SegGptConfig"
_CHECKPOINT_FOR_DOC = "BAAI/seggpt-vit-large"
_EXPECTED_OUTPUT_SHAPE = [3, 896, 448]
SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"BAAI/seggpt-vit-large",
]
@dataclass
class SegGptEncoderOutput(ModelOutput):
"""
Output type of [`SegGptEncoderOutput`].
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape `(batch_size, patch_height, patch_width, hidden_size)`.
attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
Tuple of *torch.FloatTensor* (one for each layer) of shape
`(batch_size, num_heads, seq_len, seq_len)`.
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.intermediate_hidden_state_indices` is set):
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
Additionaly, each feature passes through a LayerNorm.
"""
last_hidden_state: torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
intermediate_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class SegGptImageSegmentationOutput(ModelOutput):
"""
Output type of [`SegGptImageSegmentationOutput`].
Args:
loss (`torch.FloatTensor`, `optional`, returned when `labels` is provided):
The loss value.
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
The predicted masks.
hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape `(batch_size, patch_height, patch_width, hidden_size)`.
attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape
`(batch_size, num_heads, seq_len, seq_len)`.
"""
loss: Optional[torch.FloatTensor] = None
pred_masks: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class SegGptPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values):
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
return embeddings
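With the default SegGptConfig (image_size=[896, 448], patch_size=16, hidden_size=1024), the shapes work out as in this sketch:

```python
import torch

config = SegGptConfig()
patch_embed = SegGptPatchEmbeddings(config)

pixel_values = torch.randn(1, 3, 896, 448)
embeddings = patch_embed(pixel_values)
print(embeddings.shape)  # torch.Size([1, 56, 28, 1024]) -> (batch, patch_height, patch_width, hidden_size)
```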
class SegGptEmbeddings(nn.Module):
"""
Placeholder for SegGptEmbeddings class definition.
"""
Construct the embeddings from patch, position embeddings for input and prompt.
"""
# 定义一个名为SegGptEmbeddings的类,继承自父类nn.Module
def __init__(self, config: SegGptConfig) -> None:
super().__init__()
# 定义用于掩码的张量参数
self.mask_token = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 定义输入分段标记的张量参数
self.segment_token_input = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 定义提示分段标记的张量参数
self.segment_token_prompt = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 定义语义类型标记的张量参数
# token for seg types
self.type_token_semantic = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 定义实例类型标记的张量参数
self.type_token_instance = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 初始化图像块嵌入对象
self.patch_embeddings = SegGptPatchEmbeddings(config)
# 计算位置嵌入的数量
num_positions = (config.pretrain_image_size // config.patch_size) ** 2 + 1
# 定义位置嵌入的张量参数
self.position_embeddings = nn.Parameter(torch.randn(1, num_positions, config.hidden_size))
# 定义丢弃层
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# 定义一个插值位置编码的方法
def interpolate_pos_encoding(self, height: int, width: int) -> torch.Tensor:
# 获取位置编码中的图像块位置嵌入
patch_pos_embed = self.position_embeddings[:, 1:]
# 计算图像块的数量
num_patches = patch_pos_embed.shape[1]
# 计算预训练图像块大小的平方根
pretrain_patch_size = int(math.sqrt(num_patches))
# 如果预训练图像块大小与给定的高度或宽度不匹配,则进行插值处理
if pretrain_patch_size != height or pretrain_patch_size != width:
# 使用双三次插值方法对位置编码进行插值
patch_pos_embed = F.interpolate(
patch_pos_embed.reshape(1, pretrain_patch_size, pretrain_patch_size, -1).permute(0, 3, 1, 2),
size=(height, width),
mode="bicubic",
align_corners=False,
)
# 将插值后的位置编码张量进行维度调整,并返回
return patch_pos_embed.permute(0, 2, 3, 1)
else:
# 如果不需要插值,则直接返回原始的位置编码张量
return patch_pos_embed.reshape(1, height, width, -1)
# 定义前向传播方法
def forward(
self,
pixel_values: torch.Tensor,
prompt_pixel_values: torch.Tensor,
bool_masked_pos: Optional[torch.BoolTensor] = None,
embedding_type: Optional[str] = None,
# 继续定义其他参数
) -> torch.Tensor:
# 使用self.patch_embeddings方法将像素值转换为输入嵌入
input_embeddings = self.patch_embeddings(pixel_values)
# 使用self.patch_embeddings方法将提示像素值转换为提示嵌入
prompt_embeddings = self.patch_embeddings(prompt_pixel_values)
# 获取输入嵌入的维度信息
batch_size, patch_height, patch_width, _ = input_embeddings.shape
# 扩展mask_token以匹配输入嵌入的形状
mask_token = self.mask_token.expand(batch_size, patch_height, patch_width, -1)
# 使用bool_masked_pos创建一个掩码,将掩码处的视觉标记替换为mask_token
w = bool_masked_pos.unsqueeze(-1).type_as(mask_token).reshape(-1, patch_height, patch_width, 1)
prompt_embeddings = prompt_embeddings * (1 - w) + mask_token * w
# 如果未指定embedding_type,则默认为"instance"
embedding_type = embedding_type if embedding_type is not None else "instance"
# 添加位置编码到每个标记
pos_embed = self.interpolate_pos_encoding(patch_height, patch_width)
# 添加段标记到输入嵌入和提示嵌入
input_embeddings = input_embeddings + self.segment_token_input
prompt_embeddings = prompt_embeddings + self.segment_token_prompt
# 跳过CLS后,添加位置编码到输入嵌入和提示嵌入
input_embeddings = input_embeddings + pos_embed
prompt_embeddings = prompt_embeddings + pos_embed
# 根据embedding_type选择对应的类型嵌入
if embedding_type == "semantic":
type_embedding = self.type_token_semantic
elif embedding_type == "instance":
type_embedding = self.type_token_instance
else:
raise ValueError(f"Embedding type should be either 'semantic' or 'instance', but got {embedding_type}")
# 添加类型嵌入到输入嵌入和提示嵌入
input_embeddings = input_embeddings + type_embedding
prompt_embeddings = prompt_embeddings + type_embedding
# 将输入嵌入和提示嵌入连接起来形成最终的嵌入张量
embeddings = torch.cat((input_embeddings, prompt_embeddings), dim=0)
# 返回最终的嵌入张量
return embeddings
# Method of SegGptAttention:
def add_decomposed_rel_pos(
self,
attn: torch.Tensor,
query: torch.Tensor,
rel_pos_h: torch.Tensor,
rel_pos_w: torch.Tensor,
q_size: Tuple[int, int],
k_size: Tuple[int, int],
) -> torch.Tensor:
"""
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
Args:
attn (`torch.Tensor`):
attention map.
query (`torch.Tensor`):
query q in the attention layer with shape (batch_size, query_height * query_width, channel).
rel_pos_h (`torch.Tensor`):
relative position embeddings (Lh, channel) for height axis.
rel_pos_w (`torch.Tensor`):
relative position embeddings (Lw, channel) for width axis.
q_size (tuple):
spatial sequence size of query q with (query_height, query_width).
k_size (tuple):
spatial sequence size of key k with (key_height, key_width).
Returns:
attn (`torch.Tensor`):
attention map with added relative positional embeddings.
"""
# 解构 q_size 元组,获取查询张量的高度和宽度
query_height, query_width = q_size
# 解构 k_size 元组,获取键张量的高度和宽度
key_height, key_width = k_size
# 获取高度轴的相对位置编码,形状为 (batch_size, query_height, query_width, key_height, channel)
relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
# 获取宽度轴的相对位置编码,形状为 (batch_size, query_height, query_width, key_width, channel)
relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)
# 获取查询张量的批量大小、高度、宽度和维度
batch_size, _, dim = query.shape
# 将查询张量重塑为四维张量 (batch_size, query_height, query_width, dim)
reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
# 计算高度轴的相对位置编码与查询张量的乘积,形状为 (batch_size, query_height, query_width, key_height)
rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
# 计算宽度轴的相对位置编码与查询张量的乘积,形状为 (batch_size, query_height, query_width, key_width)
rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)
# 将注意力图重塑为五维张量 (batch_size, query_height, query_width, key_height, key_width)
attn = attn.reshape(batch_size, query_height, query_width, key_height, key_width)
# 将注意力图与高度轴和宽度轴的相对位置编码相加
attn = attn + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
# 将注意力图重塑为二维张量 (batch_size, query_height * query_width, key_height * key_width)
attn = attn.reshape(batch_size, query_height * query_width, key_height * key_width)
# 返回添加了相对位置编码的注意力图
return attn
def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor:
# 获取隐藏状态的形状信息,batch_size为批大小,height为高度,width为宽度,_为通道数
batch_size, height, width, _ = hidden_states.shape
# 使用self.qkv对隐藏状态进行qkv计算,结果形状为(3, batch_size, num_attention_heads, height * width, embed_dim)
qkv = (
self.qkv(hidden_states)
.reshape(batch_size, height * width, 3, self.num_attention_heads, -1) # 重塑形状以便后续操作
.permute(2, 0, 3, 1, 4) # 转置以便得到q, k, v分量
)
# 将qkv分解为query, key, value三个部分,形状为(batch_size * num_attention_heads, height * width, embed_dim)
query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)
# 计算注意力权重,形状为(batch_size * num_attention_heads, height * width, height * width)
attn_weights = (query * self.scale) @ key.transpose(-2, -1)
# 如果使用相对位置编码,则对注意力权重进行处理
if self.use_relative_position_embeddings:
attn_weights = self.add_decomposed_rel_pos(
attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
)
# 对注意力权重进行softmax操作,保留query的数据类型
attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
# 如果需要输出注意力权重,则进行特定的形状重塑操作,否则attn_weights_reshaped为None
if output_attentions:
# 这个操作有些笨拙,但是需要确保attn_weights保持其梯度。
# 为了做到这一点,attn_weights必须进行两次重塑,并且在接下来的使用中需要重用它们
attn_weights_reshaped = attn_weights.view(batch_size, self.num_attention_heads, height * width, -1)
attn_weights = attn_weights_reshaped.view(batch_size * self.num_attention_heads, height * width, -1)
else:
attn_weights_reshaped = None
# 计算注意力输出,形状为(batch_size, num_attention_heads, height, width, embed_dim)
attn_output = (attn_weights @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
# 调整输出的形状,使其变为(batch_size, height, width, num_attention_heads * embed_dim)
attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
# 对注意力输出进行投影,形状为(batch_size, height, width, embed_dim)
attn_output = self.proj(attn_output)
# 返回注意力输出和注意力权重的重塑形状(如果需要)
return (attn_output, attn_weights_reshaped)
# 从transformers.models.sam.modeling_sam.SamMLPBlock复制到SegGptMlp
class SegGptMlp(nn.Module):
def __init__(self, config):
super().__init__()
# 创建一个线性层,输入维度是config.hidden_size,输出维度是config.mlp_dim
self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
# 创建另一个线性层,输入维度是config.mlp_dim,输出维度是config.hidden_size
self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
# 选择激活函数,根据config.hidden_act从预定义的ACT2FN字典中选择对应的函数
self.act = ACT2FN[config.hidden_act]
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 对输入的hidden_states应用第一个线性层
hidden_states = self.lin1(hidden_states)
# 应用选择的激活函数
hidden_states = self.act(hidden_states)
# 对应用激活函数后的结果应用第二个线性层
hidden_states = self.lin2(hidden_states)
# 返回处理后的hidden_states作为输出
return hidden_states
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample, applied in the main path of residual blocks.
Comment by Ross Wightman: this is the same as the DropConnect implementation created for EfficientNet-style
networks, but the original name is misleading, as 'Drop Connect' is a different form of dropout from a
separate paper. See the discussion: https://github.com/tensorflow/tpu/issues/494
"""
if drop_prob == 0.0 or not training:
# If drop_prob is 0 or we are not training, return the input unchanged
return input
keep_prob = 1 - drop_prob
# One random value per sample, uniform in [keep_prob, 1 + keep_prob), then floored to 0 or 1
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()  # binarize the random tensor
# Scale the kept samples by 1 / keep_prob so the expected value of the output is unchanged
output = input.div(keep_prob) * random_tensor
return output
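A quick sanity sketch of the behaviour: in eval mode drop_path is the identity, while in training mode each sample is either zeroed or rescaled by 1 / keep_prob so the expectation is preserved:

```python
import torch

x = torch.ones(4, 2, 3)
print(torch.equal(drop_path(x, drop_prob=0.5, training=False), x))  # True: identity outside training

out = drop_path(x, drop_prob=0.5, training=True)
# Each of the 4 samples is now either all zeros or all 2.0 (= 1 / keep_prob), chosen independently
print(out[:, 0, 0])
```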
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->SegGpt
class SegGptDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample, applied in the main path of residual blocks."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Delegate to the drop_path function with this module's drop_prob and the current training flag
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class SegGptLayer(nn.Module):
def __init__(self, config: SegGptConfig, drop_path_rate: float) -> None:
super().__init__()
# 创建一个SegGptAttention对象,使用给定的config
self.attention = SegGptAttention(config)
# 创建一个SegGptMlp对象,使用给定的config
self.mlp = SegGptMlp(config)
# 如果drop_path_rate大于0.0,则创建一个SegGptDropPath对象,否则创建一个恒等映射(nn.Identity())
self.drop_path = SegGptDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
# 创建一个LayerNorm层,输入维度是config.hidden_size,epsilon值是config.layer_norm_eps
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义神经网络的前向传播方法,接收多个输入参数并返回一个或两个张量的元组
def forward(
self,
hidden_states: torch.Tensor,
ensemble_cond: int,
feature_ensemble: bool = False,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
# 使用 self.attention 方法进行自注意力计算,先对 hidden_states 进行 layernorm 处理
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states), # 在 SegGpt 中,在进行自注意力计算前先应用 layernorm
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0] # 提取自注意力计算的输出
outputs = self_attention_outputs[1:] # 如果需要输出注意力权重,则将其添加到 outputs 中
# 如果 feature_ensemble 为 True,且满足 ensemble_cond 条件
if feature_ensemble and attention_output.shape[0] // 2 >= ensemble_cond:
# 将 attention_output 拆分为 prompt 和 inputs
prompt, inputs = attention_output.split(attention_output.shape[1] // 2, dim=1)
# 如果 ensemble_cond 等于 2
if ensemble_cond == 2:
num_prompts = attention_output.shape[0] // 2
# 对 inputs 进行形状调整和均值计算
inputs = inputs.reshape(2, num_prompts, -1)
inputs = inputs.mean(dim=1, keepdim=True).expand_as(inputs)
inputs = inputs.reshape(*prompt.shape)
else:
# 对 inputs 进行均值计算和扩展
inputs = inputs.mean(dim=0, keepdim=True).expand_as(inputs)
# 拼接处理后的 prompt 和 inputs,并更新 attention_output
attention_output = torch.cat([prompt, inputs], dim=1)
# 第一个残差连接
hidden_states = self.drop_path(attention_output) + hidden_states
residual = hidden_states # 保存残差连接后的 hidden_states
# 在 self.layernorm_after 后应用 layernorm
hidden_states = self.layernorm_after(hidden_states)
# 通过 MLP 网络进行非线性变换
hidden_states = self.mlp(hidden_states)
# 第二个残差连接
hidden_states = residual + self.drop_path(hidden_states)
outputs = (hidden_states,) + outputs # 更新 outputs,添加最终的 hidden_states
return outputs # 返回前向传播的结果
class SegGptEncoder(nn.Module):
# SegGpt 编码器类,继承自 nn.Module
def __init__(self, config: SegGptConfig) -> None:
super().__init__()
self.config = config
# 生成一个从0到配置的 drop_path_rate 的线性序列,并转换为 Python 列表
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
# 创建包含多个 SegGptLayer 实例的 ModuleList,每个实例使用不同的 drop_path_rate
self.layers = nn.ModuleList([SegGptLayer(config, dpr[i]) for i in range(config.num_hidden_layers)])
# 创建 LayerNorm 层,用于规范化隐藏状态的尺寸,设置 epsilon 为 config.layer_norm_eps
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 是否开启梯度检查点功能,默认为 False
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
feature_ensemble: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, SegGptEncoderOutput]:
# 如果输出隐藏状态,则初始化一个空元组来存储所有的隐藏状态
all_hidden_states = () if output_hidden_states else None
# 如果输出注意力权重,则初始化一个空元组来存储所有的注意力权重
all_self_attentions = () if output_attentions else None
# 用于存储中间隐藏状态的列表
intermediate_hidden_states = []
# 遍历所有层
for i, layer_module in enumerate(self.layers):
# 如果输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 根据当前层的索引判断是否需要多个提示来进行集成
ensemble_cond = 2 if self.config.merge_index > i else 1
# 如果开启梯度检查点功能并且正在训练,则使用梯度检查点函数来执行当前层的调用
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
ensemble_cond,
feature_ensemble,
output_attentions,
)
else:
# 否则直接调用当前层的前向传播方法
layer_outputs = layer_module(hidden_states, ensemble_cond, feature_ensemble, output_attentions)
# 更新隐藏状态为当前层输出的第一个元素
hidden_states = layer_outputs[0]
# 如果当前层的索引等于配置的 merge_index,则执行合并操作
if i == self.config.merge_index:
hidden_states = (
hidden_states[: hidden_states.shape[0] // 2] + hidden_states[hidden_states.shape[0] // 2 :]
) * 0.5
# 如果当前层的索引在配置的 intermediate_hidden_state_indices 中,则将规范化后的隐藏状态添加到中间隐藏状态列表中
if i in self.config.intermediate_hidden_state_indices:
intermediate_hidden_states.append(self.layernorm(hidden_states))
# 如果输出注意力权重,则将当前层的注意力权重添加到 all_self_attentions 中
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# 如果输出隐藏状态,则将最后一个隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不返回字典,则返回一个元组,其中包含所有非空的结果项
if not return_dict:
return tuple(
v
for v in [hidden_states, all_hidden_states, all_self_attentions, intermediate_hidden_states]
if v is not None
)
# 否则返回 SegGptEncoderOutput 对象,包含最后的隐藏状态、所有隐藏状态、所有注意力权重和中间隐藏状态列表
return SegGptEncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
intermediate_hidden_states=intermediate_hidden_states,
)
# Copied from transformers.models.convnext.modeling_convnext.ConvNextLayerNorm with ConvNext->SegGpt
class SegGptLayerNorm(nn.Module):
r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
"""
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
super().__init__()
# Learnable scale and shift parameters
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.eps = eps
self.data_format = data_format
# Only the channels_last and channels_first layouts are supported
if self.data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError(f"Unsupported data format: {self.data_format}")
self.normalized_shape = (normalized_shape,)
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.data_format == "channels_last":
# For channels_last inputs, defer to the built-in functional layer_norm
x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
elif self.data_format == "channels_first":
# For channels_first inputs, normalize over the channel dimension manually
input_dtype = x.dtype
x = x.float()
u = x.mean(1, keepdim=True)  # per-position mean over channels
s = (x - u).pow(2).mean(1, keepdim=True)  # per-position variance over channels
x = (x - u) / torch.sqrt(s + self.eps)  # normalize
x = x.to(dtype=input_dtype)  # restore the original dtype
x = self.weight[:, None, None] * x + self.bias[:, None, None]  # scale and shift
return x
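The channels_first branch should agree with `nn.LayerNorm` applied over a permuted tensor; a small equivalence sketch (both modules start with unit weight and zero bias):

```python
import torch
from torch import nn

x = torch.randn(2, 64, 8, 8)  # (batch, channels, height, width)
seggpt_ln = SegGptLayerNorm(64, eps=1e-6, data_format="channels_first")
reference = nn.LayerNorm(64, eps=1e-6)(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
print(torch.allclose(seggpt_ln(x), reference, atol=1e-5))  # True, up to numerical precision
```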
# 定义一个名为 SegGptDecoderHead 的类,继承自 nn.Module
class SegGptDecoderHead(nn.Module):
# 初始化方法,接收一个 config 参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 定义一个 2D 卷积层,输入和输出通道数都是 config.decoder_hidden_size,卷积核大小为 3x3,填充为 1
self.conv = nn.Conv2d(
config.decoder_hidden_size,
config.decoder_hidden_size,
kernel_size=3,
padding=1,
)
# 初始化一个 SegGptLayerNorm 实例,对输入进行归一化,通道顺序为 "channels_first"
self.layernorm = SegGptLayerNorm(
normalized_shape=config.decoder_hidden_size, eps=config.layer_norm_eps, data_format="channels_first"
)
# 根据配置选择激活函数,ACT2FN 是一个预定义的激活函数字典
self.act_fct = ACT2FN[config.hidden_act]
# 定义一个 1x1 的 2D 卷积层,将隐藏状态映射到 3 个通道,带有偏置
self.head = nn.Conv2d(config.decoder_hidden_size, 3, kernel_size=1, bias=True) # decoder to patch
# 前向传播方法,接收输入 hidden_states
def forward(self, hidden_states: torch.FloatTensor):
# 对隐藏状态进行卷积操作
hidden_states = self.conv(hidden_states)
# 对卷积后的结果进行归一化
hidden_states = self.layernorm(hidden_states)
# 应用预定义的激活函数
hidden_states = self.act_fct(hidden_states)
# 将激活后的结果再次经过一个 1x1 卷积层
hidden_states = self.head(hidden_states)
return hidden_states
# 定义一个名为 SegGptDecoder 的类,继承自 nn.Module
class SegGptDecoder(nn.Module):
# 初始化方法,接收一个 config 参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 定义一个线性层,用于将输入维度转换为 config.patch_size^2 * config.decoder_hidden_size 的输出维度
self.decoder_embed = nn.Linear(
config.hidden_size * len(config.intermediate_hidden_state_indices),
config.patch_size**2 * config.decoder_hidden_size,
bias=True,
)
# 初始化一个 SegGptDecoderHead 的实例,作为解码器的预测头部
self.decoder_pred = SegGptDecoderHead(config)
# 记录 patch 的大小
self.patch_size = config.patch_size
# 记录解码器隐藏层的大小
self.decoder_hidden_size = config.decoder_hidden_size
# 记录配置对象
self.config = config
# 定义一个辅助方法,用于重塑隐藏状态的形状
def _reshape_hidden_states(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
# 获取输入的张量形状信息
batch_size, patch_height, patch_width, _ = hidden_states.shape
# 将输入的张量重塑为新的形状
hidden_states = hidden_states.reshape(
batch_size, patch_height, patch_width, self.patch_size, self.patch_size, self.decoder_hidden_size
)
# 对重塑后的张量进行维度排列变换
hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
# 再次重塑为指定形状
hidden_states = hidden_states.reshape(
shape=(batch_size, -1, patch_height * self.patch_size, patch_width * self.patch_size)
)
return hidden_states
# 前向传播方法,接收输入 hidden_states
def forward(self, hidden_states: torch.FloatTensor):
# 将输入的隐藏状态先经过线性层进行维度转换
hidden_states = self.decoder_embed(hidden_states)
# 调用辅助方法重塑隐藏状态的形状
hidden_states = self._reshape_hidden_states(hidden_states)
# 将重塑后的隐藏状态传入解码器的预测头部进行处理
hidden_states = self.decoder_pred(hidden_states)
return hidden_states
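A shape walk-through with the default config (hidden_size=1024, four intermediate feature maps, patch_size=16, decoder_hidden_size=64), sketched on random inputs:

```python
import torch

config = SegGptConfig()
decoder = SegGptDecoder(config)

# Concatenated intermediate hidden states: (batch, patch_height, patch_width, hidden_size * num_features)
features = torch.randn(1, 56, 28, 1024 * 4)
pred = decoder(features)
print(pred.shape)  # torch.Size([1, 3, 896, 448]) -> a full-resolution, 3-channel prediction
```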
# 定义一个名为 SegGptPreTrainedModel 的类,继承自 PreTrainedModel
class SegGptPreTrainedModel(PreTrainedModel):
"""
一个抽象类,处理权重初始化、预训练模型下载和加载的简单接口。
"""
# 类属性:配置类为 SegGptConfig
config_class = SegGptConfig
# 模型基础名称前缀为 "model"
base_model_prefix = "model"
# 主要输入名称为 "pixel_values"
main_input_name = "pixel_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
# 不拆分的模块列表
_no_split_modules = ["SegGptEmbeddings", "SegGptLayer"]
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
# 从配置中获取初始化的标准差
std = self.config.initializer_range
# 如果模块是线性层或卷积层
if isinstance(module, (nn.Linear, nn.Conv2d)):
# 使用截断正态分布初始化权重,先将权重转换为 float32 类型以避免在 half 精度下出现 `trunc_normal_cpu` 未实现的问题,然后再转回原始的 dtype
module.weight.data = nn.init.trunc_normal_(module.weight.data.to(torch.float32), mean=0.0, std=std).to(
module.weight.dtype
)
# 如果存在偏置,则初始化为零
if module.bias is not None:
module.bias.data.zero_()
# 如果模块是 LayerNorm 层
elif isinstance(module, nn.LayerNorm):
# 初始化偏置为零
module.bias.data.zero_()
# 初始化权重为全 1
module.weight.data.fill_(1.0)
# 如果模块是 SegGptAttention 类型
elif isinstance(module, SegGptAttention):
# 使用截断正态分布初始化相对位置编码的水平方向数据
module.rel_pos_h.data = nn.init.trunc_normal_(
module.rel_pos_h.data.to(torch.float32),
mean=0.0,
std=std,
).to(module.rel_pos_h.dtype)
# 使用截断正态分布初始化相对位置编码的垂直方向数据
module.rel_pos_w.data = nn.init.trunc_normal_(
module.rel_pos_w.data.to(torch.float32),
mean=0.0,
std=std,
).to(module.rel_pos_w.dtype)
# 如果模块是 SegGptEmbeddings 类型
elif isinstance(module, SegGptEmbeddings):
# 使用截断正态分布初始化位置嵌入数据
module.position_embeddings.data = nn.init.trunc_normal_(
module.position_embeddings.data.to(torch.float32),
mean=0.0,
std=std,
).to(module.position_embeddings.dtype)
# 初始化其他特殊令牌的数据,使用正态分布初始化
torch.nn.init.normal_(module.mask_token, std=std)
torch.nn.init.normal_(module.segment_token_input, std=std)
torch.nn.init.normal_(module.segment_token_prompt, std=std)
torch.nn.init.normal_(module.type_token_semantic, std=std)
torch.nn.init.normal_(module.type_token_instance, std=std)
"""
This model is a PyTorch `torch.nn.Module` subclass designed for SegGpt model architecture. Use it
like any regular PyTorch Module and refer to the PyTorch documentation for general usage and behavior.
Parameters:
config (`SegGptConfig`): Model configuration class containing all model parameters.
Initializing with a config file loads the configuration settings only, not the model weights.
Use `PreTrainedModel.from_pretrained` to load weights associated with the model.
"""
"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Input pixel values. These are obtained using `AutoImageProcessor`. See `SegGptImageProcessor.__call__`
for detailed information.
prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Prompt-specific pixel values. These are obtained using `AutoImageProcessor`. See `SegGptImageProcessor.__call__`
for detailed information.
prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Mask applied to prompts. This is obtained using `AutoImageProcessor`. See `SegGptImageProcessor.__call__`
for detailed information.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean tensor indicating masked positions (1 for masked, 0 for not masked).
feature_ensemble (`bool`, *optional*):
Indicates whether to use feature ensemble. If `True`, the model uses feature ensemble when multiple prompts
are present. If `False`, it does not. Relevant for few-shot inference on an input image with more than one prompt.
embedding_type (`str`, *optional*):
Type of embedding used for prompts. Can be 'instance' or 'semantic'.
output_attentions (`bool`, *optional*):
Whether to return the attentions tensors of all attention layers. See `attentions` in returned tensors
for more details.
output_hidden_states (`bool`, *optional*):
Whether to return the hidden states of all layers. See `hidden_states` in returned tensors for more details.
return_dict (`bool`, *optional*):
Whether to return a `utils.ModelOutput` instead of a plain tuple.
"""
@add_start_docstrings(
"The bare SegGpt Model transformer outputting raw hidden-states without any specific head on top.",
SEGGPT_START_DOCSTRING,
)
class SegGptModel(SegGptPreTrainedModel):
def __init__(self, config: SegGptConfig):
super().__init__(config)
self.config = config
self.embeddings = SegGptEmbeddings(config)
self.encoder = SegGptEncoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> SegGptPatchEmbeddings:
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
"""
Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
# Access each layer of the encoder and prune specified heads in the attention mechanism
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SEGGPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SegGptEncoderOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,
prompt_pixel_values: torch.Tensor,
prompt_masks: torch.Tensor,
bool_masked_pos: Optional[torch.BoolTensor] = None,
feature_ensemble: Optional[bool] = None,
embedding_type: Optional[str] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the SegGptModel.
pixel_values: torch.Tensor of shape (batch_size, num_channels, height, width)
Tensor containing the input image pixel values.
prompt_pixel_values: torch.Tensor of shape (batch_size, num_channels, height, width)
Tensor containing the prompt image pixel values.
prompt_masks: torch.Tensor of shape (batch_size, num_channels, height, width)
Segmentation mask associated with the prompt image.
bool_masked_pos: Optional[torch.BoolTensor], optional
Boolean mask for masked positions, by default None.
feature_ensemble: Optional[bool], optional
Whether to use feature ensemble, by default None.
embedding_type: Optional[str], optional
Type of embedding used, by default None.
output_attentions: Optional[bool], optional
Whether to output attentions, by default None.
output_hidden_states: Optional[bool], optional
Whether to output hidden states, by default None.
return_dict: Optional[bool], optional
Whether to return a dictionary, by default None.
Returns:
`SegGptEncoderOutput` or `tuple(torch.Tensor)`:
A `SegGptEncoderOutput` (if `return_dict=True`) or a plain tuple of `torch.Tensor`s containing the
encoder's last hidden state and, when requested, the hidden states and attentions.
"""
# Forward pass logic would be implemented here, detailing how inputs are processed
# through the layers of the model to produce the desired outputs.
# Helper that splits an input image tensor into fixed-size patches and flattens each patch
def patchify(tensor: torch.Tensor, patch_size: int) -> torch.Tensor:
# Unpack the batch size, number of channels, height and width of the tensor
batch_size, num_channels, height, width = tensor.shape
# Number of patches along the height and width axes
patch_height = height // patch_size
patch_width = width // patch_size
# Reshape to (batch_size, num_channels, patch_height, patch_size, patch_width, patch_size)
tensor = tensor.reshape(shape=(batch_size, num_channels, patch_height, patch_size, patch_width, patch_size))
# Permute to (batch_size, patch_height, patch_width, patch_size, patch_size, num_channels)
tensor = tensor.permute(0, 2, 4, 3, 5, 1)
# Reshape again to (batch_size, patch_height * patch_width, patch_size**2 * 3)
tensor = tensor.reshape(shape=(batch_size, patch_height * patch_width, patch_size**2 * 3))
return tensor
# Helper that folds flattened patches back into an image of the original height and width
def unpatchify(tensor: torch.Tensor, patch_height: int, patch_width: int) -> torch.Tensor:
# Batch size
batch_size = tensor.shape[0]
# Infer the patch size from the flattened patch dimension (patch_size**2 * 3 values per patch)
patch_size = int((tensor.shape[-1] / 3) ** 0.5)
# Check that the number of patches matches the given patch_height and patch_width
if patch_height * patch_width != tensor.shape[1]:
raise ValueError(f"Number of patches {tensor.shape[1]} does not match patch height and width.")
# Reshape to (batch_size, patch_height, patch_width, patch_size, patch_size, 3)
tensor = tensor.reshape(shape=(batch_size, patch_height, patch_width, patch_size, patch_size, 3))
# Permute to (batch_size, 3, patch_height, patch_size, patch_width, patch_size)
tensor = tensor.permute(0, 5, 1, 3, 2, 4)
# Collapse back to (batch_size, 3, patch_height * patch_size, patch_width * patch_size)
tensor = tensor.reshape(shape=(batch_size, 3, patch_height * patch_size, patch_width * patch_size))
return tensor
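To make the patch layout concrete, here is a minimal round-trip sketch (the shapes are made up for illustration; it only assumes a 3-channel input whose height and width are divisible by `patch_size`):
```
import torch

patch_size = 16
images = torch.randn(2, 3, 64, 48)                      # (batch, channels, height, width)

patches = patchify(images, patch_size)                  # (2, 4 * 3, 16 * 16 * 3) == (2, 12, 768)
restored = unpatchify(patches, 64 // patch_size, 48 // patch_size)

print(patches.shape)                                    # torch.Size([2, 12, 768])
print(torch.allclose(images, restored))                 # True -- the mapping is lossless
```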
# Loss module used for the SegGpt segmentation objective
class SegGptLoss(nn.Module):
def __init__(self, config):
super().__init__()
# Smooth-L1 beta and the patch size used to expand the patch mask to pixels
self.beta = config.beta
self.patch_size = config.patch_size
# Forward pass that computes the loss
def forward(
self,
pixel_values: torch.FloatTensor,
prompt_pixel_values: torch.FloatTensor,
pred_masks: torch.FloatTensor,
labels: torch.FloatTensor,
bool_masked_pos: torch.BoolTensor,
):
"""
计算预测掩码与实际掩码之间的L1损失。
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
合并的像素值,来自提示图像和输入图像。
prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
来自掩码提示的合并像素值。
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
预测的掩码。
labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
输入图像的实际掩码。
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
布尔掩码位置。指示哪些补丁被掩盖(1),哪些没有(0)。
Returns:
`torch.FloatTensor`: 预测掩码与实际掩码之间的平均L1损失。
"""
# Build a per-pixel mask from the boolean patch positions
mask = bool_masked_pos[:, :, None].repeat(1, 1, self.patch_size**2 * 3)
# Map the patch mask back to the original spatial resolution
mask = unpatchify(mask, pixel_values.shape[1] // self.patch_size, pixel_values.shape[2] // self.patch_size)
# Replace the dummy masks in the mask prompt with the actual label values
prompt_pixel_values = prompt_pixel_values.clone()
prompt_pixel_values[:, :, prompt_pixel_values.shape[2] // 2 :, :] = labels
# Smooth-L1 loss without reduction, then averaged only over the masked pixels
loss = F.smooth_l1_loss(pred_masks, prompt_pixel_values, reduction="none", beta=self.beta)
loss = (loss * mask).sum() / mask.sum()  # mean loss on the removed (masked) patches
return loss
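The masked averaging step can be reproduced in isolation. A minimal sketch with hypothetical shapes, reusing `unpatchify` from above, where only the lower-half patches (the input-image part) contribute to the loss:
```
import torch
import torch.nn.functional as F

patch_size, beta = 16, 0.01
pred = torch.randn(1, 3, 128, 64)                       # predicted masks (prompt half on top, input half below)
target = torch.randn(1, 3, 128, 64)                     # prompt pixel values with labels pasted into the lower half
num_patches = (128 // patch_size) * (64 // patch_size)

bool_masked_pos = torch.zeros(1, num_patches, dtype=torch.bool)
bool_masked_pos[:, num_patches // 2 :] = True           # mask only the lower-half patches

mask = bool_masked_pos[:, :, None].float().repeat(1, 1, patch_size**2 * 3)
mask = unpatchify(mask, 128 // patch_size, 64 // patch_size)  # back to (1, 3, 128, 64)

loss = F.smooth_l1_loss(pred, target, reduction="none", beta=beta)
loss = (loss * mask).sum() / mask.sum()                 # average only over the masked pixels
print(loss.item())
```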
# Class docstring describing SegGptForImageSegmentation and its purpose
@add_start_docstrings(
"SegGpt model with a decoder on top for one-shot image segmentation.",
SEGGPT_START_DOCSTRING,
)
class SegGptForImageSegmentation(SegGptPreTrainedModel):
def __init__(self, config: SegGptConfig):
super().__init__(config)
# Keep the configuration around
self.config = config
# Backbone encoder and the segmentation decoder head
self.model = SegGptModel(config)
self.decoder = SegGptDecoder(config)
# Initialize weights and apply final processing
self.post_init()
# Forward pass of the segmentation model
@add_start_docstrings_to_model_forward(SEGGPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SegGptImageSegmentationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,  # input image pixel values
prompt_pixel_values: torch.Tensor,  # prompt image pixel values
prompt_masks: torch.Tensor,  # prompt segmentation masks
bool_masked_pos: Optional[torch.BoolTensor] = None,  # optional boolean mask over patch positions
feature_ensemble: Optional[bool] = None,  # optional feature-ensemble flag
embedding_type: Optional[str] = None,  # optional embedding type ('instance' or 'semantic')
labels: Optional[torch.FloatTensor] = None,  # optional ground-truth masks
output_attentions: Optional[bool] = None,  # whether to return attentions
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
.\models\seggpt\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule
from ...utils import is_torch_available, is_vision_available
_import_structure = {
"configuration_seggpt": ["SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegGptConfig", "SegGptOnnxConfig"]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_seggpt"] = [
"SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"SegGptModel",
"SegGptPreTrainedModel",
"SegGptForImageSegmentation",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_seggpt"] = ["SegGptImageProcessor"]
if TYPE_CHECKING:
from .configuration_seggpt import SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, SegGptConfig, SegGptOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_seggpt import (
SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
SegGptForImageSegmentation,
SegGptModel,
SegGptPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_seggpt import SegGptImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\sew\configuration_sew.py
logger = logging.get_logger(__name__)
SEW_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"asapp/sew-tiny-100k": "https://huggingface.co/asapp/sew-tiny-100k/resolve/main/config.json",
}
class SEWConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SEWModel`]. It is used to instantiate a SEW
model according to the specified arguments, defining the model architecture. Instantiating a configuration with
the default arguments will yield a configuration similar to that of the asapp/sew-tiny-100k architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import SEWConfig, SEWModel
>>> # Initializing a SEW asapp/sew-tiny-100k style configuration
>>> configuration = SEWConfig()
>>> # Initializing a model (with random weights) from that configuration
>>> model = SEWModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "sew"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
squeeze_factor=2,
hidden_act="gelu",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
feat_extract_norm="group",
feat_extract_activation="gelu",
conv_dim=(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512),
conv_stride=(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1),
conv_kernel=(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.hidden_size = hidden_size
self.feat_extract_norm = feat_extract_norm
self.feat_extract_activation = feat_extract_activation
self.conv_dim = list(conv_dim)
self.conv_stride = list(conv_stride)
self.conv_kernel = list(conv_kernel)
self.conv_bias = conv_bias
self.num_conv_pos_embeddings = num_conv_pos_embeddings
self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
self.num_feat_extract_layers = len(self.conv_dim)
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.squeeze_factor = squeeze_factor
self.hidden_act = hidden_act
self.num_attention_heads = num_attention_heads
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.feat_proj_dropout = feat_proj_dropout
self.final_dropout = final_dropout
self.layerdrop = layerdrop
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.vocab_size = vocab_size
if (
(len(self.conv_stride) != self.num_feat_extract_layers)
or (len(self.conv_kernel) != self.num_feat_extract_layers)
or (len(self.conv_dim) != self.num_feat_extract_layers)
):
raise ValueError(
"Configuration for convolutional layers is incorrect. "
"It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, "
f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) "
f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`."
)
self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
self.use_weighted_layer_sum = use_weighted_layer_sum
self.classifier_proj_size = classifier_proj_size
@property
def inputs_to_logits_ratio(self):
return functools.reduce(operator.mul, self.conv_stride, 1)
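The `inputs_to_logits_ratio` property is simply the product of the convolutional strides, i.e. how many raw audio samples are consumed per extracted feature frame. With the default `conv_stride` shown in `__init__` above:
```
import functools
import operator

conv_stride = (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)   # SEWConfig defaults
print(functools.reduce(operator.mul, conv_stride, 1))    # 320 samples per feature frame
```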
.\models\sew\convert_sew_original_pytorch_checkpoint_to_pytorch.py
"""Convert SEW checkpoint."""
import argparse
import json
import os
import fairseq
import torch
from fairseq.data import Dictionary
from sew_asapp import tasks
from transformers import (
SEWConfig,
SEWForCTC,
SEWModel,
Wav2Vec2CTCTokenizer,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.upsample.0": "encoder.upsample.projection",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.sew.feature_extractor if is_finetuned else hf_model.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = "sew." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "weight" in name:
weight_type = "weight"
elif "bias" in name:
weight_type = "bias"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
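To see how the `*` placeholder in `MAPPING` is resolved, here is a small sketch with a hypothetical fairseq parameter name (not taken from a real checkpoint):
```
name = "encoder.layers.3.self_attn.k_proj.weight"
key, mapped_key = "self_attn.k_proj", "encoder.layers.*.attention.k_proj"

layer_index = name.split(key)[0].split(".")[-2]   # "encoder.layers.3." -> "3"
print(mapped_key.replace("*", layer_index))       # encoder.layers.3.attention.k_proj
```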
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def convert_config(model, is_finetuned):
config = SEWConfig()
if is_finetuned:
fs_config = model.w2v_encoder.w2v_model.cfg
else:
fs_config = model.cfg
config.conv_bias = fs_config.conv_bias
conv_layers = eval(fs_config.conv_feature_layers)
config.conv_dim = [x[0] for x in conv_layers]
config.conv_kernel = [x[1] for x in conv_layers]
config.conv_stride = [x[2] for x in conv_layers]
config.feat_extract_activation = "gelu"
config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
config.final_dropout = 0.0
config.hidden_act = fs_config.activation_fn.name
config.hidden_size = fs_config.encoder_embed_dim
config.initializer_range = 0.02
config.intermediate_size = fs_config.encoder_ffn_embed_dim
config.layer_norm_eps = 1e-5
config.layerdrop = fs_config.encoder_layerdrop
config.num_attention_heads = fs_config.encoder_attention_heads
config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
config.num_conv_pos_embeddings = fs_config.conv_pos
config.num_feat_extract_layers = len(conv_layers)
config.num_hidden_layers = fs_config.encoder_layers
config.squeeze_factor = fs_config.squeeze_factor
if is_finetuned:
fs_config = model.cfg
config.final_dropout = fs_config.final_dropout
config.layerdrop = fs_config.layerdrop
config.activation_dropout = fs_config.activation_dropout
config.apply_spec_augment = fs_config.mask_prob > 0 or fs_config.mask_channel_prob > 0
config.attention_dropout = fs_config.attention_dropout
config.feat_proj_dropout = fs_config.dropout_input
config.hidden_dropout = fs_config.dropout
config.mask_feature_length = fs_config.mask_channel_length
config.mask_feature_prob = fs_config.mask_channel_prob
config.mask_time_length = fs_config.mask_length
config.mask_time_prob = fs_config.mask_prob
config.feature_extractor_type = "Wav2Vec2FeatureExtractor"
config.tokenizer_class = "Wav2Vec2CTCTokenizer"
return config
@torch.no_grad()
def convert_sew_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if is_finetuned:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
)
else:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
if config_path is not None:
config = SEWConfig.from_pretrained(config_path)
else:
config = convert_config(model[0], is_finetuned)
model = model[0].eval()
return_attention_mask = True if config.feat_extract_norm == "layer" else False
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=True,
return_attention_mask=return_attention_mask,
)
if is_finetuned:
if dict_path:
target_dict = Dictionary.load(dict_path)
target_dict.indices[target_dict.bos_word] = target_dict.pad_index
target_dict.indices[target_dict.pad_word] = target_dict.bos_index
config.bos_token_id = target_dict.pad_index
config.pad_token_id = target_dict.bos_index
config.eos_token_id = target_dict.eos_index
config.vocab_size = len(target_dict.symbols)
vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
if not os.path.isdir(pytorch_dump_folder_path):
logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
return
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
json.dump(target_dict.indices, vocab_handle)
tokenizer = Wav2Vec2CTCTokenizer(
vocab_path,
unk_token=target_dict.unk_word,
pad_token=target_dict.pad_word,
bos_token=target_dict.bos_word,
eos_token=target_dict.eos_word,
word_delimiter_token="|",
do_lower_case=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(pytorch_dump_folder_path)
hf_model = SEWForCTC(config)
else:
hf_model = SEWModel(config)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
recursively_load_weights(model, hf_model, is_finetuned)
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--is_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_sew_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, args.is_finetuned
)
.\models\sew\modeling_sew.py
""" PyTorch SEW 模型。"""
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_sew import SEWConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "SEWConfig"
_CHECKPOINT_FOR_DOC = "asapp/sew-tiny-100k-ft-ls100h"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 512]
_CTC_EXPECTED_OUTPUT = (
"'MISTER QUILTER IS THE APPOSTILE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPOLLE'"
)
_CTC_EXPECTED_LOSS = 0.42
_SEQ_CLASS_CHECKPOINT = "anton-l/sew-mid-100k-ft-keyword-spotting"
_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'"
_SEQ_CLASS_EXPECTED_LOSS = 9.52
SEW_PRETRAINED_MODEL_ARCHIVE_LIST = [
"asapp/sew-tiny-100k",
"asapp/sew-small-100k",
"asapp/sew-mid-100k",
]
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method
for ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be
run on CPU as part of the preprocessing during training.
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
# 解包形状参数
batch_size, sequence_length = shape
# 检查 mask_length 是否合法
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
# 检查 mask_length 是否小于 sequence_length
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
# epsilon 用于概率舍入
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
# 计算应该屏蔽的 span 的数量
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
# 确保不低于最小屏蔽数
num_masked_span = max(num_masked_span, min_masks)
# 确保 num_masked_span 不超过 sequence_length
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
# 确保 num_masked span 不超过 input_length - (mask_length - 1)
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
# 计算每个 batch 中的屏蔽 span 的数量
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
# 创建用于 SpecAugment 的屏蔽 mask
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
# 计算最大可能的屏蔽 span 数量
max_num_masked_span = compute_num_masked_span(sequence_length)
# 如果最大屏蔽 span 数量为 0,则直接返回空的 spec_aug_mask
if max_num_masked_span == 0:
return spec_aug_mask
# 遍历输入长度列表中的每个长度
for input_length in input_lengths:
# 计算当前输入长度下的需要屏蔽的片段数量
num_masked_span = compute_num_masked_span(input_length)
# 随机选择要屏蔽的索引
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
# 选择第一个样本索引作为填充向量的虚拟索引,确保所有批次具有相同的维度
# 这是由于概率舍入导致的维度问题的解决方案
if len(spec_aug_mask_idx) == 0:
# 只有在 `input_length` 严格小于 `sequence_length` 时才会发生这种情况,
# 此时最后一个标记必须是填充标记,可以用作虚拟屏蔽标识符
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
# 将虚拟屏蔽索引扩展到匹配的数量,并添加到屏蔽索引列表中
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
# 将屏蔽索引列表转换为 numpy 数组
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
# 将屏蔽索引扩展为屏蔽段
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# 添加偏移量到起始索引,以确保索引现在创建一个完整的屏蔽段
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
# 确保屏蔽索引不会超过序列长度
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
# 使用屏蔽索引在 spec_aug_mask 上进行填充操作
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
# 返回填充后的 spec_aug_mask
return spec_aug_mask
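A quick illustration of the output (the arguments are made up; the exact spans depend on the random state):
```
import numpy as np

np.random.seed(0)
mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10, min_masks=2)

print(mask.shape)          # (2, 100), dtype=bool
print(mask.sum(axis=-1))   # masked steps per example: at most 2 * 10 here, fewer if the spans overlap
```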
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->SEW
class SEWNoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 设置输入卷积维度为上一层的卷积维度或者默认为1(如果是第一层)
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为当前层的卷积维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个卷积层,指定输入和输出维度,卷积核大小,步长和是否有偏置
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 设置激活函数为预定义的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
# 执行卷积操作
hidden_states = self.conv(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->SEW
class SEWLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 设置输入卷积维度为上一层的卷积维度或者默认为1(如果是第一层)
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为当前层的卷积维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个卷积层,指定输入和输出维度,卷积核大小,步长和是否有偏置
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 创建一个LayerNorm层,对输出进行标准化,并可选地进行仿射变换
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
# 设置激活函数为预定义的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
# 执行卷积操作
hidden_states = self.conv(hidden_states)
# 将卷积输出的维度转置以便进行LayerNorm操作
hidden_states = hidden_states.transpose(-2, -1)
# 应用LayerNorm进行标准化
hidden_states = self.layer_norm(hidden_states)
# 再次将维度转置回来
hidden_states = hidden_states.transpose(-2, -1)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->SEW
class SEWGroupNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 设置输入卷积维度为上一层的卷积维度或者默认为1(如果是第一层)
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为当前层的卷积维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个卷积层,指定输入和输出维度,卷积核大小,步长和是否有偏置
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 设置激活函数为预定义的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 创建一个GroupNorm层,指定组数和通道数,对输出进行标准化
self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
def forward(self, hidden_states):
# 执行卷积操作
hidden_states = self.conv(hidden_states)
# 应用GroupNorm进行标准化
hidden_states = self.layer_norm(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
class SEWPositionalConvEmbedding(nn.Module):
# 在此处继续实现其他功能
pass
# 初始化函数,用于初始化类的实例
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 创建一个一维卷积层对象
self.conv = nn.Conv1d(
config.hidden_size, # 输入通道数(隐藏大小)
config.hidden_size, # 输出通道数(隐藏大小,保持不变)
kernel_size=config.num_conv_pos_embeddings, # 卷积核大小
padding=config.num_conv_pos_embeddings // 2, # 填充大小
groups=config.num_conv_pos_embedding_groups, # 分组卷积的组数
stride=config.squeeze_factor, # 步长
)
# 如果启用了Deepspeed的zero3模式
if is_deepspeed_zero3_enabled():
import deepspeed
# 使用Deepspeed的分布式参数收集功能
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
# 对卷积层进行权重归一化,并命名为"weight",dim=2表示在输出通道维度上进行归一化
self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
# 注册卷积层权重的外部参数
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
# 对卷积层进行权重归一化,并命名为"weight",dim=2表示在输出通道维度上进行归一化
self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
# 创建一个与卷积层同样大小的填充层对象
self.padding = SEWSamePadLayer(config.num_conv_pos_embeddings)
# 根据配置选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 前向传播函数,定义了数据如何通过网络层流动
def forward(self, hidden_states):
# 经过一维卷积层
hidden_states = self.conv(hidden_states)
# 经过填充层
hidden_states = self.padding(hidden_states)
# 经过激活函数
hidden_states = self.activation(hidden_states)
# 返回处理后的隐藏状态
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer 复制代码,并将 Wav2Vec2 更改为 SEW
class SEWSamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
# 根据卷积位置嵌入的数量确定是否需要移除一个填充元素
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
if self.num_pad_remove > 0:
# 如果需要移除填充元素,则截取掉隐藏状态的末尾
hidden_states = hidden_states[:, :, :-self.num_pad_remove]
return hidden_states
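The trimming is needed because a stride-1 1-D convolution over length `L` with kernel size `k` and padding `k // 2` produces `L + 1` frames when `k` is even (and `L` frames when `k` is odd). A small sketch with made-up channel counts:
```
import torch
from torch import nn

k = 128                                              # num_conv_pos_embeddings in the default config
conv = nn.Conv1d(4, 4, kernel_size=k, padding=k // 2)
x = torch.randn(1, 4, 50)

print(conv(x).shape)                                 # torch.Size([1, 4, 51]) -> one frame too long
print(SEWSamePadLayer(k)(conv(x)).shape)             # torch.Size([1, 4, 50]) -> matches the input again
```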
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder 复制代码,并将 Wav2Vec2 更改为 SEW
class SEWFeatureEncoder(nn.Module):
"""从原始音频波形中构造特征"""
def __init__(self, config):
super().__init__()
# 根据配置选择不同的特征提取归一化方式
if config.feat_extract_norm == "group":
# 如果是组归一化,则使用 SEWGroupNormConvLayer 作为第一层,其余层为 SEWNoLayerNormConvLayer
conv_layers = [SEWGroupNormConvLayer(config, layer_id=0)] + [
SEWNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
# 如果是层归一化,则所有层均使用 SEWLayerNormConvLayer
conv_layers = [SEWLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
else:
# 如果归一化方式不是 'group' 或 'layer',则抛出异常
raise ValueError(
f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
)
# 将所有的卷积层组成一个模块列表
self.conv_layers = nn.ModuleList(conv_layers)
self.gradient_checkpointing = False
self._requires_grad = True
def _freeze_parameters(self):
# 冻结模型的所有参数,使其不可训练
for param in self.parameters():
param.requires_grad = False
self._requires_grad = False
def forward(self, input_values):
# 将输入的张量扩展维度,增加一个维度,用于卷积操作
hidden_states = input_values[:, None]
# 如果需要计算梯度并且处于训练模式,则将 hidden_states 设置为需要梯度计算
if self._requires_grad and self.training:
hidden_states.requires_grad = True
# 遍历所有的卷积层进行前向传播
for conv_layer in self.conv_layers:
# 如果需要计算梯度、启用了梯度检查点功能并且处于训练模式,则使用梯度检查点函数进行前向传播
if self._requires_grad and self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__,
hidden_states,
)
else:
# 否则直接调用卷积层进行前向传播计算
hidden_states = conv_layer(hidden_states)
# 返回最终的隐藏状态张量
return hidden_states
class SEWFeatureExtractor(SEWFeatureEncoder):
# SEWFeatureExtractor 类继承自 SEWFeatureEncoder 类
def __init__(self, config):
# 初始化函数,接受一个配置参数 config
super().__init__(config)
# 调用父类 SEWFeatureEncoder 的初始化方法
# 发出警告,提示 SEWFeatureExtractor 类已过时,建议使用 SEWFeatureEncoder 类
warnings.warn(
f"The class `{self.__class__.__name__}` has been depreciated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
# 从 transformers.models.bart.modeling_bart.BartAttention 复制并修改为 SEWAttention
class SEWAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[SEWConfig] = None,
):
# 初始化函数,定义注意力机制的参数
super().__init__()
self.embed_dim = embed_dim # 注意力机制的输入维度
self.num_heads = num_heads # 注意力头的数量
self.dropout = dropout # Dropout 概率
self.head_dim = embed_dim // num_heads # 每个头的维度
self.config = config # SEW 的配置对象
# 检查 embed_dim 是否可以被 num_heads 整除,否则抛出错误
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5 # 缩放因子
self.is_decoder = is_decoder # 是否为解码器
self.is_causal = is_causal # 是否是因果的
# 线性变换层,用于计算 Q、K、V 矩阵
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# 将输入张量 tensor 重新形状为 (bsz, seq_len, num_heads, head_dim) 并转置
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# 前向传播函数,接受多个输入参数并进行注意力计算
pass # 此处未实现具体功能,需要根据具体的注意力机制进行实现
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward 复制并修改为 SEWFeedForward
class SEWFeedForward(nn.Module):
def __init__(self, config):
# 初始化函数,接受一个配置参数 config
super().__init__()
self.intermediate_dropout = nn.Dropout(config.activation_dropout)
# 中间层线性变换,用于激活函数前的线性变换
self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# 输出层线性变换,用于最终输出
self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.output_dropout = nn.Dropout(config.hidden_dropout)
# Forward pass taking the hidden states as input
def forward(self, hidden_states):
# Project the hidden states with the intermediate dense layer
hidden_states = self.intermediate_dense(hidden_states)
# Apply the intermediate activation function
hidden_states = self.intermediate_act_fn(hidden_states)
# Intermediate dropout after the activation
hidden_states = self.intermediate_dropout(hidden_states)
# Project back to the hidden size with the output dense layer
hidden_states = self.output_dense(hidden_states)
# Output dropout before returning
hidden_states = self.output_dropout(hidden_states)
return hidden_states
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer复制而来,将Wav2Vec2替换为SEW
class SEWEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 初始化注意力机制,使用SEWAttention类
self.attention = SEWAttention(
embed_dim=config.hidden_size, # 设置嵌入维度为隐藏大小
num_heads=config.num_attention_heads, # 设置注意力头数
dropout=config.attention_dropout, # 设置注意力机制的dropout率
is_decoder=False,
)
# 随机失活层,使用隐藏dropout率
self.dropout = nn.Dropout(config.hidden_dropout)
# 层归一化,使用隐藏大小和层归一化epsilon值
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 前馈神经网络,使用SEWFeedForward类
self.feed_forward = SEWFeedForward(config)
# 最终层归一化,使用隐藏大小和层归一化epsilon值
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
# 注意力残差连接
attn_residual = hidden_states
# 执行注意力计算
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
# 应用dropout
hidden_states = self.dropout(hidden_states)
# 添加注意力残差到隐藏状态
hidden_states = attn_residual + hidden_states
# 应用层归一化
hidden_states = self.layer_norm(hidden_states)
# 添加前馈神经网络输出到隐藏状态
hidden_states = hidden_states + self.feed_forward(hidden_states)
# 最终层归一化
hidden_states = self.final_layer_norm(hidden_states)
# 构建输出元组
outputs = (hidden_states,)
# 如果需要输出注意力权重,添加到输出元组中
if output_attentions:
outputs += (attn_weights,)
return outputs
class SEWEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# SEW位置卷积嵌入
self.pos_conv_embed = SEWPositionalConvEmbedding(config)
# 平均池化层,使用squeeze因子配置
self.pool = nn.AvgPool1d(config.squeeze_factor, config.squeeze_factor)
# 层归一化,使用隐藏大小和层归一化epsilon值
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 随机失活层,使用隐藏dropout率
self.dropout = nn.Dropout(config.hidden_dropout)
# SEW编码器层列表,根据隐藏层数配置
self.layers = nn.ModuleList([SEWEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# SEW上采样
self.upsample = SEWUpsampling(config)
# 梯度检查点,默认为关闭
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
pass # 此处省略forward方法实现,仅给出了方法签名
class SEWPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = SEWConfig  # configuration class for the SEW model
base_model_prefix = "sew"  # prefix of the base model attributes
main_input_name = "input_values"  # name of the main model input
supports_gradient_checkpointing = True  # gradient checkpointing is supported
def _init_weights(self, module):
"""Initialize the weights""" # 定义一个初始化权重的函数,参数是一个神经网络模块
if isinstance(module, SEWPositionalConvEmbedding):
# 对 SEWPositionalConvEmbedding 类型的模块,使用正态分布初始化卷积层的权重,均值为 0,标准差为根据核大小和输入通道数计算的值
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
# 将卷积层的偏置初始化为 0
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, nn.Linear):
# 对线性层,使用正态分布初始化权重,均值为 0,标准差为配置中的初始化范围
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
# 对 LayerNorm 和 GroupNorm 模块,偏置初始化为 0,权重初始化为 1
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
if is_deepspeed_zero3_enabled():
import deepspeed # 导入 deepspeed 库
# 如果启用了 DeepSpeed 的 Zero-3,且卷积层有 weight_v 和 weight_g 属性,使用 GatheredParameters 初始化权重
if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
nn.init.kaiming_normal_(module.weight.data)
else:
with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
nn.init.kaiming_normal_(module.weight.data)
else:
# 否则使用 He 初始化方法,适用于 ReLU 激活函数
nn.init.kaiming_normal_(module.weight.data)
# 对于 Linear 和 Conv1d 类型的模块,且具有偏置的,偏置初始化为 0
if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
module.bias.data.zero_()
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the convolutional layers
""" # 定义一个计算卷积层输出长度的函数,输入是一个长度张量或整数
def _conv_out_length(input_length, kernel_size, stride):
# 计算 1D 卷积层输出长度的公式,取自 PyTorch 文档
# input_length 是输入长度,kernel_size 是卷积核大小,stride 是步幅
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
# 遍历配置中的卷积核大小和步幅,依次计算输出长度
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
# 返回计算得到的输出长度
return input_lengths
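With the default kernel and stride schedule from `SEWConfig`, this recurrence yields roughly one feature frame per 320 input samples; a quick sanity check assuming the defaults listed earlier:
```
length = 16000                                       # 1 second of 16 kHz audio
for kernel_size, stride in zip(
    (10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1),        # default conv_kernel
    (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1),         # default conv_stride
):
    length = (length - kernel_size) // stride + 1
print(length)                                        # 49 feature frames
```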
# 定义一个方法用于生成特征向量的注意力掩码
def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
# 计算输出长度,即根据注意力掩码每个样本的有效长度来确定输出的长度
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# 获取批次大小
batch_size = attention_mask.shape[0]
# 初始化一个全零的注意力掩码张量,形状为(batch_size, feature_vector_length)
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
# 将输出长度前的所有位置设为1,以确保在这些位置之前的所有值都被注意到
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
# 将注意力掩码张量沿最后一个维度翻转,累加并再次翻转,并转换为布尔类型,以生成正确的注意力掩码
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
# 返回生成的注意力掩码张量
return attention_mask
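The flip → cumsum → flip trick marks only the last valid frame of each example and then back-fills everything before it, turning a single 1 into a full prefix mask. A toy illustration with made-up lengths:
```
import torch

attention_mask = torch.zeros(2, 6, dtype=torch.long)
output_lengths = torch.tensor([3, 5])
attention_mask[(torch.arange(2), output_lengths - 1)] = 1
print(attention_mask)
# tensor([[0, 0, 1, 0, 0, 0],
#         [0, 0, 0, 0, 1, 0]])

print(attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool())
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])
```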
"""
SEW was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech
Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger,
Yoav Artzi.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving etc.).
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) subclass. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`SEWConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare SEW Model transformer outputting raw hidden-states without any specific head on top.",
SEW_START_DOCSTRING,
)
class SEWModel(SEWPreTrainedModel):
# 初始化函数,接收一个SEWConfig类型的配置对象作为参数
def __init__(self, config: SEWConfig):
# 调用父类的初始化方法,传入配置对象作为参数
super().__init__(config)
# 将配置对象保存到实例变量self.config中
self.config = config
# 使用配置对象创建SEWFeatureEncoder类型的特征提取器实例,并保存到self.feature_extractor中
self.feature_extractor = SEWFeatureEncoder(config)
# 创建一个LayerNorm层,用于归一化最后一个卷积层的输出,参数为config.conv_dim[-1],eps为config.layer_norm_eps
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
# 判断是否需要对特征进行投影
self.project_features = config.conv_dim[-1] != config.hidden_size
if self.project_features:
# 如果需要投影,创建一个Linear层,将config.conv_dim[-1]维的特征投影到config.hidden_size维
self.feature_projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
# 创建一个Dropout层,用于特征投影后的dropout,dropout率为config.feat_proj_dropout
self.feature_dropout = nn.Dropout(config.feat_proj_dropout)
# 如果配置中mask_time_prob或mask_feature_prob大于0,创建一个参数化的特征嵌入张量
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
# 使用配置对象创建SEWEncoder类型的编码器实例,并保存到self.encoder中
self.encoder = SEWEncoder(config)
# 初始化权重并进行最终处理
self.post_init()
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states复制而来
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
# compute mask indices for time axis if not provided
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
# apply SpecAugment along time axis using computed mask indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
# expand feature mask indices to match the shape of hidden_states and apply
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
# 如果未提供输出注意力机制,则使用配置中的默认值
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果未提供输出隐藏状态,则使用配置中的默认值
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未提供返回字典选项,则使用配置中的默认值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 提取输入特征
extract_features = self.feature_extractor(input_values)
# 调整特征的维度顺序
extract_features = extract_features.transpose(1, 2)
# 应用层归一化到特征
extract_features = self.layer_norm(extract_features)
# 如果需要对特征进行投影
if self.project_features:
extract_features = self.feature_projection(extract_features)
# 使用特征丢弃层处理特征
hidden_states = self.feature_dropout(extract_features)
# 如果提供了注意力掩码
if attention_mask is not None:
# 计算与特征向量对应的减少注意力掩码
attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
# 对隐藏状态进行掩码处理,根据时间索引进行掩码
hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
# 编码器处理隐藏状态
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 取编码器输出的第一个元素作为隐藏状态
hidden_states = encoder_outputs[0]
# 如果不需要返回字典形式的输出
if not return_dict:
# 返回元组形式的结果,包含隐藏状态和额外的编码器输出
return (hidden_states,) + encoder_outputs[1:]
# 返回基础模型输出对象,包含最后的隐藏状态、所有隐藏状态和注意力权重
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"""SEW Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
SEW_START_DOCSTRING,
)
# The `add_start_docstrings` decorator above documents this as a SEW model with a `language modeling` head on top
# for Connectionist Temporal Classification (CTC); SEW_START_DOCSTRING is the predefined shared model docstring.
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->SEW, wav2vec2->sew, WAV_2_VEC_2->SEW
class SEWForCTC(SEWPreTrainedModel):
def __init__(self, config, target_lang: Optional[str] = None):
super().__init__(config)
# 初始化SEW模型
self.sew = SEWModel(config)
# 使用config中的final_dropout创建一个dropout层
self.dropout = nn.Dropout(config.final_dropout)
self.target_lang = target_lang
# 如果config中未定义语言模型头的词汇量大小,则抛出异常
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `SEWForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
# 根据config的设置选择输出隐藏大小
output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
# 创建一个线性层作为语言模型头,连接隐藏大小和词汇量大小
self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
# 初始化权重并应用最终处理
self.post_init()
def tie_weights(self):
"""
Overrides `~PreTrainedModel.tie_weights` so that adapter weights can be correctly loaded when passing
`target_lang=...` to `from_pretrained(...)`. This method is not supposed to be called by the user and is
prone to be changed in the future.
"""
# Note that `tie_weights` is usually used to tie input and output embedding weights. It is re-purposed here so
# that SEW can load adapter layers without introducing a new API. While slightly hacky, SEW never has to tie
# its input and output embeddings, so re-using this function is fine.
target_lang = self.target_lang
# 如果target_lang不为None,并且config中未定义adapter_attn_dim,则抛出异常
if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
# 如果target_lang为None,并且config中定义了adapter_attn_dim,则记录信息提示,默认将target_lang设置为'eng'
elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
logger.info("By default `target_lang` is set to 'eng'.")
# 如果target_lang不为None,则加载适配器
elif target_lang is not None:
self.load_adapter(target_lang, force_load=True)
# 调用此函数将禁用特征编码器的梯度计算,使其参数在训练过程中不会更新
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 发出警告,提醒方法`freeze_feature_extractor`已过时,并将在 Transformers v5 中移除。请使用等效的`freeze_feature_encoder`方法。
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用等效的`freeze_feature_encoder`方法,冻结特征编码器的参数
self.freeze_feature_encoder()
# 调用此函数将禁用特征编码器的梯度计算,使其参数在训练过程中不会更新
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 冻结特征编码器的参数
self.sew.feature_extractor._freeze_parameters()
# 调用此函数将禁用基础模型的梯度计算,使其参数在训练过程中不会更新。只有分类头部会更新。
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 遍历self.sew对象的参数,并设置requires_grad为False,以禁用梯度计算
for param in self.sew.parameters():
param.requires_grad = False
# 使用add_start_docstrings_to_model_forward和add_code_sample_docstrings为模型的forward方法添加文档字符串
@add_start_docstrings_to_model_forward(SEW_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
# 定义模型的forward方法,用于执行前向传播
def forward(
self,
input_values: Optional[torch.Tensor], # 输入值,类型为可选的torch.Tensor
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码,类型为可选的torch.Tensor,默认为None
output_attentions: Optional[bool] = None, # 是否输出注意力权重,类型为可选的bool,默认为None
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,类型为可选的bool,默认为None
return_dict: Optional[bool] = None, # 是否返回字典形式的结果,类型为可选的bool,默认为None
labels: Optional[torch.Tensor] = None, # 标签,类型为可选的torch.Tensor,默认为None
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
# Decide whether to return a dictionary or not based on the provided argument or configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Perform sequence to sequence processing using the model's encoder-decoder structure
outputs = self.sew(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states from the model's output and apply dropout regularization
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
# Generate logits from the processed hidden states using the language model head
logits = self.lm_head(hidden_states)
# Initialize loss as None; compute CTC (Connectionist Temporal Classification) loss if labels are provided
loss = None
if labels is not None:
# Validate label values to ensure they are within the vocabulary size
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
# Calculate input lengths based on the attention mask
attention_mask = (
attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# Determine which labels are valid and compute target lengths
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# Apply log softmax to logits and transpose dimensions for CTC loss computation
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
# Compute CTC loss with adjustments for padding and configuration settings
with torch.backends.cudnn.flags(enabled=False):
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
# If return_dict is False, format the output tuple accordingly
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, construct and return a CausalLMOutput object with relevant attributes
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
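The label handling and the CTC call above can be exercised on toy tensors. A hedged sketch (every shape and id here is made up; `-100` marks padded label positions exactly as in the forward pass):
```
import torch
from torch import nn

vocab_size, pad_token_id = 32, 0
logits = torch.randn(2, 50, vocab_size)                   # (batch, frames, vocab)
labels = torch.tensor([[7, 8, 9, -100], [4, 5, -100, -100]])

labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)                      # tensor([3, 2])
flattened_targets = labels.masked_select(labels_mask)     # tensor([7, 8, 9, 4, 5])
input_lengths = torch.tensor([50, 50])                    # frames per example

log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
loss = nn.functional.ctc_loss(
    log_probs, flattened_targets, input_lengths, target_lengths,
    blank=pad_token_id, reduction="mean", zero_infinity=False,
)
print(loss.item())
```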
# SEW 模型,顶部带有一个序列分类头(在汇总输出之上的线性层),用于诸如 SUPERB 关键词识别等任务。
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification 复制而来,将 Wav2Vec2 改为 SEW,wav2vec2 改为 sew,WAV_2_VEC_2 改为 SEW。
class SEWForSequenceClassification(SEWPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 如果配置允许添加适配器且为真,则引发值错误,因为序列分类不支持使用 SEW 适配器(config.add_adapter=True)。
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of SEW adapters (config.add_adapter=True)"
)
# 创建 SEW 模型对象
self.sew = SEWModel(config)
# 计算层数:变压器层数 + 输入嵌入层
num_layers = config.num_hidden_layers + 1
# 如果配置使用加权层求和,则初始化层权重参数为均匀值
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 线性投影层,将隐藏状态大小映射到分类器投影大小
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
# 分类器层,将分类器投影大小映射到类别数量
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
# 初始化权重并应用最终处理
self.post_init()
# 冻结特征提取器,不再计算特征编码器的梯度,使其在训练期间不更新
def freeze_feature_extractor(self):
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
# 冻结特征编码器,不再计算特征编码器的梯度,使其在训练期间不更新
def freeze_feature_encoder(self):
self.sew.feature_extractor._freeze_parameters()
# 冻结基础模型,不再计算基础模型的梯度,使其在训练期间不更新,仅更新分类头
def freeze_base_model(self):
for param in self.sew.parameters():
param.requires_grad = False
# 在模型前向方法上添加文档字符串注释,详细描述输入和输出
@add_start_docstrings_to_model_forward(SEW_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_SEQ_CLASS_CHECKPOINT,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 设置是否返回字典形式的输出,默认从模型配置中获取
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 根据配置决定是否输出隐藏状态
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 调用序列编码器模块进行前向传播
outputs = self.sew(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果配置要求使用加权层求和,则进行相应的处理
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
hidden_states = torch.stack(hidden_states, dim=1)
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
hidden_states = outputs[0]
# 将处理后的隐藏状态传递给投影层进行处理
hidden_states = self.projector(hidden_states)
# 如果没有提供注意力掩码,则对隐藏状态进行均值池化
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1)
else:
# 根据注意力掩码生成填充掩码,并对隐藏状态进行相应的处理
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
hidden_states[~padding_mask] = 0.0
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
# 对池化后的输出应用分类器以获得最终的分类预测
logits = self.classifier(pooled_output)
# 如果提供了标签,则计算损失
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
# 根据是否返回字典形式的输出进行结果的返回
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# 返回序列分类器的输出,包括损失、预测 logits、隐藏状态和注意力权重
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
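The attention-masked mean pooling used right before the classifier can be reproduced on its own. A small sketch with toy shapes:
```
import torch

hidden_states = torch.randn(2, 5, 8)                      # (batch, frames, hidden)
padding_mask = torch.tensor([[1, 1, 1, 0, 0],
                             [1, 1, 1, 1, 1]]).bool()     # valid frames per example

hidden_states[~padding_mask] = 0.0
pooled = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
print(pooled.shape)                                       # torch.Size([2, 8]); padded frames do not dilute the mean
```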