Transformers 源码解析（七十七）

`.\models\mobilenet_v2\modeling_mobilenet_v2.py`

# coding=utf-8
# Copyright 2022 Apple Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch MobileNetV2 model."""


from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_mobilenet_v2 import MobileNetV2Config


# Module-level logger obtained from the library's logging helper.
logger = logging.get_logger(__name__)


# General docstring
# Passed as `config_class` to the code-sample docstring decorator on the model's forward.
_CONFIG_FOR_DOC = "MobileNetV2Config"

# Base docstring
# Checkpoint name and expected output shape handed to the code-sample docstring decorator
# of the base model's forward.
_CHECKPOINT_FOR_DOC = "google/mobilenet_v2_1.0_224"
_EXPECTED_OUTPUT_SHAPE = [1, 1280, 7, 7]

# Image classification docstring
# NOTE(review): presumably consumed by the classification head's docstring decorator,
# which is not visible in this part of the file -- confirm.
_IMAGE_CLASS_CHECKPOINT = "google/mobilenet_v2_1.0_224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"


# Names of the pretrained MobileNetV2 checkpoints listed for this architecture.
MOBILENET_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/mobilenet_v2_1.4_224",
    "google/mobilenet_v2_1.0_224",
    "google/mobilenet_v2_0.37_160",
    "google/mobilenet_v2_0.35_96",
    # See all MobileNetV2 models at https://huggingface.co/models?filter=mobilenet_v2
]


def _build_tf_to_pytorch_map(model, config, tf_weights=None):
    """
    A map of modules from TF to PyTorch.

    Args:
        model: Either a bare MobileNetV2 backbone or one of the head models
            (`MobileNetV2ForImageClassification`, `MobileNetV2ForSemanticSegmentation`).
        config: Model configuration. Unused here; kept so the signature stays stable for callers.
        tf_weights: Container of TensorFlow variable names (typically the dict of loaded arrays).
            Only membership is tested. `None` is treated as "no variables known".

    Returns:
        `dict` mapping a TensorFlow variable name to the PyTorch tensor that should receive it.
        Entries are inserted layer by layer in the order weights, beta, gamma, moving_mean,
        moving_variance.
    """
    tf_to_pt_map = {}

    # `tf_weights` defaults to None; fall back to an empty container so `in` checks do not raise.
    known_tf_names = tf_weights if tf_weights is not None else ()

    # Head models keep the backbone in `.mobilenet_v2`; a bare backbone is used as is.
    if isinstance(model, (MobileNetV2ForImageClassification, MobileNetV2ForSemanticSegmentation)):
        backbone = model.mobilenet_v2
    else:
        backbone = model

    def ema(x):
        # Use the ExponentialMovingAverage copy of a variable when the checkpoint has one.
        averaged = x + "/ExponentialMovingAverage"
        return averaged if averaged in known_tf_names else x

    def plain(x):
        # Used for layers whose variables are always looked up under their plain name.
        return x

    def map_conv_layer(prefix, conv_layer, weights_name="weights", rename=ema):
        # Register the convolution kernel and the batch-norm tensors of one conv layer.
        # The moving statistics are always looked up under their plain name.
        tf_to_pt_map[rename(prefix + weights_name)] = conv_layer.convolution.weight
        tf_to_pt_map[rename(prefix + "BatchNorm/beta")] = conv_layer.normalization.bias
        tf_to_pt_map[rename(prefix + "BatchNorm/gamma")] = conv_layer.normalization.weight
        tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = conv_layer.normalization.running_mean
        tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = conv_layer.normalization.running_var

    # Convolutional stem: first conv, depthwise 3x3 and 1x1 projection.
    stem = backbone.conv_stem
    map_conv_layer("MobilenetV2/Conv/", stem.first_conv)
    map_conv_layer("MobilenetV2/expanded_conv/depthwise/", stem.conv_3x3, weights_name="depthwise_weights")
    map_conv_layer("MobilenetV2/expanded_conv/project/", stem.reduce_1x1)

    # The 16 inverted-residual blocks. TF numbering starts at 1 because the un-numbered
    # "expanded_conv" scope is mapped onto the stem above.
    for pt_index in range(16):
        tf_index = pt_index + 1
        block = backbone.layer[pt_index]
        block_prefix = f"MobilenetV2/expanded_conv_{tf_index}/"

        map_conv_layer(block_prefix + "expand/", block.expand_1x1)
        map_conv_layer(block_prefix + "depthwise/", block.conv_3x3, weights_name="depthwise_weights")
        map_conv_layer(block_prefix + "project/", block.reduce_1x1)

    # Final 1x1 convolution of the backbone.
    map_conv_layer("MobilenetV2/Conv_1/", backbone.conv_1x1)

    # Image-classification head: a single classifier with weight and bias.
    if isinstance(model, MobileNetV2ForImageClassification):
        prefix = "MobilenetV2/Logits/Conv2d_1c_1x1/"
        tf_to_pt_map[ema(prefix + "weights")] = model.classifier.weight
        tf_to_pt_map[ema(prefix + "biases")] = model.classifier.bias

    # Semantic-segmentation head. The pooling, ASPP and projection layers are looked up
    # under their plain names; only the final classifier goes through `ema`.
    if isinstance(model, MobileNetV2ForSemanticSegmentation):
        head = model.segmentation_head
        map_conv_layer("image_pooling/", head.conv_pool, rename=plain)
        map_conv_layer("aspp0/", head.conv_aspp, rename=plain)
        map_conv_layer("concat_projection/", head.conv_projection, rename=plain)

        prefix = "logits/semantic/"
        tf_to_pt_map[ema(prefix + "weights")] = head.classifier.convolution.weight
        tf_to_pt_map[ema(prefix + "biases")] = head.classifier.convolution.bias

    return tf_to_pt_map
# Copy the weights stored in a TensorFlow checkpoint into a PyTorch model.
def load_tf_weights_in_mobilenet_v2(model, config, tf_checkpoint_path):
    """
    Load a TensorFlow checkpoint into `model` and return the model.

    Every variable of the checkpoint is read, matched against the name map produced by
    `_build_tf_to_pytorch_map`, transposed into the PyTorch layout where its name requires it,
    shape-checked and assigned to the matching tensor's `.data`.

    Raises:
        ImportError: if NumPy or TensorFlow cannot be imported (logged, then re-raised).
        ValueError: if a converted array does not have the shape of its target tensor.
    """
    try:
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise

    # Read every variable of the checkpoint into memory, keyed by its TF name.
    tf_weights = {}
    for name, shape in tf.train.list_variables(tf_checkpoint_path):
        logger.info(f"Loading TF weight {name} with shape {shape}")
        tf_weights[name] = tf.train.load_variable(tf_checkpoint_path, name)

    # TF variable name -> PyTorch tensor that should receive it.
    tf_to_pt_map = _build_tf_to_pytorch_map(model, config, tf_weights)

    # Name suffixes dropped from the leftover report once a variable has been imported.
    consumed_suffixes = ("", "/RMSProp", "/RMSProp_1", "/ExponentialMovingAverage", "/Momentum")

    for name, pointer in tf_to_pt_map.items():
        logger.info(f"Importing {name}")

        if name not in tf_weights:
            logger.info(f"{name} not in tf pre-trained weights, skipping")
            continue

        array = tf_weights[name]

        if "depthwise_weights" in name:
            # Depthwise kernels: move the last two TF axes to the front.
            logger.info("Transposing depthwise")
            array = np.transpose(array, (2, 3, 0, 1))
        elif "weights" in name:
            logger.info("Transposing")
            is_linear_target = len(pointer.shape) == 2  # copying into a linear layer
            array = array.squeeze().transpose() if is_linear_target else np.transpose(array, (3, 2, 0, 1))

        if pointer.shape != array.shape:
            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")

        logger.info(f"Initialize PyTorch weight {name} {array.shape}")
        pointer.data = torch.from_numpy(array)

        # Forget the imported variable and its training-only companions.
        for suffix in consumed_suffixes:
            tf_weights.pop(name + suffix, None)

    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
    return model


def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int:
    """
    Round `value` to a multiple of `divisor` so that every layer gets a channel count divisible by `divisor`.

    The result is never smaller than `min_value` (which falls back to `divisor` when omitted), and one extra
    `divisor` is added whenever rounding would shrink `value` by more than 10%. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    lower_bound = divisor if min_value is None else min_value

    # Round to the nearest multiple of `divisor` (halves round up).
    nearest_multiple = int(value + divisor / 2) // divisor * divisor
    rounded = max(lower_bound, nearest_multiple)

    # Make sure that rounding down does not remove more than 10% of the original value.
    if rounded < 0.9 * value:
        rounded += divisor

    return int(rounded)


def apply_depth_multiplier(config: MobileNetV2Config, channels: int) -> int:
    """
    Scale `channels` by `config.depth_multiplier` and round the result with `make_divisible`,
    using `config.depth_divisible_by` as the divisor and `config.min_depth` as the lower bound.
    """
    return make_divisible(int(round(channels * config.depth_multiplier)), config.depth_divisible_by, config.min_depth)


def apply_tf_padding(features: torch.Tensor, conv_layer: nn.Conv2d) -> torch.Tensor:
    """
    Apply TensorFlow-style "SAME" padding to a convolution layer. See the notes at:
    https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2

    Args:
        features: Input tensor; its last two dimensions are read as height and width.
        conv_layer: Convolution whose `stride`, `kernel_size` and `dilation` determine the padding.

    Returns:
        `features` zero-padded on its last two dimensions. When the total padding along an axis
        is odd, the extra element goes to the right / bottom side.
    """
    in_height = int(features.shape[-2])
    in_width = int(features.shape[-1])
    stride_height, stride_width = conv_layer.stride
    kernel_height, kernel_width = conv_layer.kernel_size
    dilation_height, dilation_width = conv_layer.dilation

    # Total padding per axis; it depends on whether the input size is a multiple of the stride.
    if in_height % stride_height == 0:
        pad_along_height = max(kernel_height - stride_height, 0)
    else:
        pad_along_height = max(kernel_height - (in_height % stride_height), 0)

    if in_width % stride_width == 0:
        pad_along_width = max(kernel_width - stride_width, 0)
    else:
        pad_along_width = max(kernel_width - (in_width % stride_width), 0)

    # Split the total between the two sides of each axis.
    pad_left = pad_along_width // 2
    pad_right = pad_along_width - pad_left
    pad_top = pad_along_height // 2
    pad_bottom = pad_along_height - pad_top

    # Each side is scaled by the dilation of its axis. The tuple is ordered
    # (left, right, top, bottom), i.e. the last dimension first.
    padding = (
        pad_left * dilation_width,
        pad_right * dilation_width,
        pad_top * dilation_height,
        pad_bottom * dilation_height,
    )
    return nn.functional.pad(features, padding, "constant", 0.0)
    ) -> None:
        super().__init__()
        self.config = config

        # 检查输入通道数是否能被分组数整除，如果不能则抛出错误
        if in_channels % groups != 0:
            raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.")
        # 检查输出通道数是否能被分组数整除，如果不能则抛出错误
        if out_channels % groups != 0:
            raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")

        # 根据配置计算填充数，如果不使用 TensorFlow 填充则按照公式计算
        padding = 0 if config.tf_padding else int((kernel_size - 1) / 2) * dilation

        # 创建卷积层对象
        self.convolution = nn.Conv2d(
            in_channels=in_channels,        # 输入通道数
            out_channels=out_channels,      # 输出通道数
            kernel_size=kernel_size,        # 卷积核大小
            stride=stride,                  # 步长
            padding=padding,                # 填充数
            dilation=dilation,              # 空洞卷积率
            groups=groups,                  # 分组数
            bias=bias,                      # 是否使用偏置
            padding_mode="zeros",           # 填充模式
        )

        # 如果需要使用归一化
        if use_normalization:
            # 创建批归一化层对象
            self.normalization = nn.BatchNorm2d(
                num_features=out_channels,                                      # 输入特征数
                eps=config.layer_norm_eps if layer_norm_eps is None else layer_norm_eps,  # 归一化的 epsilon
                momentum=0.997,                                                 # 动量
                affine=True,                                                    # 是否使用仿射变换
                track_running_stats=True,                                       # 是否追踪运行时统计信息
            )
        else:
            self.normalization = None

        # 如果需要使用激活函数
        if use_activation:
            # 根据配置或者传入的激活函数名称选择激活函数
            if isinstance(use_activation, str):
                self.activation = ACT2FN[use_activation]
            elif isinstance(config.hidden_act, str):
                self.activation = ACT2FN[config.hidden_act]
            else:
                self.activation = config.hidden_act
        else:
            self.activation = None

    # 前向传播函数定义
    def forward(self, features: torch.Tensor) -> torch.Tensor:
        # 如果配置要求使用 TensorFlow 填充，则应用 TensorFlow 填充函数
        if self.config.tf_padding:
            features = apply_tf_padding(features, self.convolution)
        # 对特征进行卷积操作
        features = self.convolution(features)
        # 如果定义了归一化层，则对特征进行归一化处理
        if self.normalization is not None:
            features = self.normalization(features)
        # 如果定义了激活函数，则对特征进行激活函数处理
        if self.activation is not None:
            features = self.activation(features)
        # 返回处理后的特征
        return features
# Inverted residual block: 1x1 expansion -> depthwise 3x3 -> 1x1 reduction.
class MobileNetV2InvertedResidual(nn.Module):
    """
    One inverted-residual block built from three `MobileNetV2ConvLayer`s.

    The input is added back to the output only when `stride` is 1 and `in_channels` equals
    `out_channels`.

    Raises:
        ValueError: if `stride` is neither 1 nor 2.
    """

    def __init__(
        self, config: MobileNetV2Config, in_channels: int, out_channels: int, stride: int, dilation: int = 1
    ) -> None:
        super().__init__()

        # Width of the expanded representation: the input width times `config.expand_ratio`,
        # rounded with the divisor and lower bound taken from the config.
        hidden_channels = make_divisible(
            int(round(in_channels * config.expand_ratio)), config.depth_divisible_by, config.min_depth
        )

        if stride not in (1, 2):
            raise ValueError(f"Invalid stride {stride}.")

        # A skip connection is only possible when the block keeps both resolution and width.
        self.use_residual = stride == 1 and in_channels == out_channels

        # 1x1 convolution that widens the input to `hidden_channels`.
        self.expand_1x1 = MobileNetV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=hidden_channels,
            kernel_size=1,
        )

        # 3x3 convolution with one group per channel; carries the stride and dilation.
        self.conv_3x3 = MobileNetV2ConvLayer(
            config,
            in_channels=hidden_channels,
            out_channels=hidden_channels,
            kernel_size=3,
            stride=stride,
            groups=hidden_channels,
            dilation=dilation,
        )

        # 1x1 convolution back down to `out_channels`, without an activation.
        self.reduce_1x1 = MobileNetV2ConvLayer(
            config,
            in_channels=hidden_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_activation=False,
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Apply the three convolutions and, when enabled, add the input back to the result."""
        transformed = self.expand_1x1(features)
        transformed = self.conv_3x3(transformed)
        transformed = self.reduce_1x1(transformed)

        if self.use_residual:
            return features + transformed
        return transformed


# Network stem: strided 3x3 conv, optional 1x1 expansion, depthwise 3x3 and 1x1 reduction.
class MobileNetV2Stem(nn.Module):
    """
    The first layers of the network.

    A regular 3x3 convolution with stride 2 maps `in_channels` to `expanded_channels`. It is
    followed by an optional 1x1 convolution (omitted when `config.first_layer_is_expansion` is
    set), a 3x3 convolution with one group per channel, and a 1x1 convolution without activation
    that produces `out_channels`.
    """

    def __init__(self, config: MobileNetV2Config, in_channels: int, expanded_channels: int, out_channels: int) -> None:
        super().__init__()

        # Regular 3x3 convolution with stride 2.
        self.first_conv = MobileNetV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=expanded_channels,
            kernel_size=3,
            stride=2,
        )

        # The separate 1x1 layer is skipped when the first convolution already acts as the expansion.
        self.expand_1x1 = (
            None
            if config.first_layer_is_expansion
            else MobileNetV2ConvLayer(
                config, in_channels=expanded_channels, out_channels=expanded_channels, kernel_size=1
            )
        )

        # 3x3 convolution with stride 1 and one group per channel.
        self.conv_3x3 = MobileNetV2ConvLayer(
            config,
            in_channels=expanded_channels,
            out_channels=expanded_channels,
            kernel_size=3,
            stride=1,
            groups=expanded_channels,
        )

        # 1x1 convolution down to `out_channels`, without an activation.
        self.reduce_1x1 = MobileNetV2ConvLayer(
            config,
            in_channels=expanded_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_activation=False,
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run the stem layers in order, skipping the 1x1 expansion when it is absent."""
        hidden = self.first_conv(features)
        if self.expand_1x1 is not None:
            hidden = self.expand_1x1(hidden)
        hidden = self.conv_3x3(hidden)
        return self.reduce_1x1(hidden)
# Abstract base class, derived from `PreTrainedModel`, shared by all MobileNetV2 models.
class MobileNetV2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class used by this model family.
    config_class = MobileNetV2Config
    # Function used to import weights from a TensorFlow checkpoint.
    load_tf_weights = load_tf_weights_in_mobilenet_v2
    # Attribute name under which head models store the backbone.
    base_model_prefix = "mobilenet_v2"
    # Name of the main model input.
    main_input_name = "pixel_values"
    # Gradient checkpointing is not supported.
    supports_gradient_checkpointing = False

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d]) -> None:
        """
        Initialize the parameters of a single module in place.

        Linear and convolution layers get normally distributed weights (mean 0, standard deviation
        `config.initializer_range`) and a zero bias when they have one. Batch-norm layers get zero bias
        and unit weight. Any other module is left untouched.
        """
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.BatchNorm2d):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

# Introductory docstring text for the MobileNetV2 model classes; it is passed to the
# `add_start_docstrings` class decorator.
MOBILENET_V2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileNetV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring text describing the forward inputs; it is passed to the
# `add_start_docstrings_to_model_forward` decorator.
MOBILENET_V2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileNetV2ImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# The class docstring is assembled by `add_start_docstrings` from the one-line summary below
# and `MOBILENET_V2_START_DOCSTRING`.
@add_start_docstrings(
    "The bare MobileNetV2 model outputting raw hidden-states without any specific head on top.",
    MOBILENET_V2_START_DOCSTRING,
)
class MobileNetV2Model(MobileNetV2PreTrainedModel):
    # Layout: `conv_stem` -> 16 inverted-residual blocks in `layer` -> final 1x1 convolution
    # `conv_1x1` -> optional global average pooling in `pooler`.
    def __init__(self, config: MobileNetV2Config, add_pooling_layer: bool = True):
        super().__init__(config)
        self.config = config

        # Output channels for the projection layers
        channels = [16, 24, 24, 32, 32, 32, 64, 64, 64, 64, 96, 96, 96, 160, 160, 160, 320]
        channels = [apply_depth_multiplier(config, x) for x in channels]

        # Strides for the depthwise layers
        strides = [2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]

        self.conv_stem = MobileNetV2Stem(
            config,
            in_channels=config.num_channels,
            expanded_channels=apply_depth_multiplier(config, 32),
            out_channels=channels[0],
        )

        current_stride = 2  # first conv layer has stride 2
        dilation = 1

        self.layer = nn.ModuleList()
        for i in range(16):
            # Keep making the feature maps smaller or use dilated convolution?
            # Once the accumulated stride reaches `config.output_stride`, blocks stop
            # downsampling and the strides they would have applied become dilation instead.
            if current_stride == config.output_stride:
                layer_stride = 1
                layer_dilation = dilation
                dilation *= strides[i]  # larger dilation starts in next block
            else:
                layer_stride = strides[i]
                layer_dilation = 1
                current_stride *= layer_stride

            self.layer.append(
                MobileNetV2InvertedResidual(
                    config,
                    in_channels=channels[i],
                    out_channels=channels[i + 1],
                    stride=layer_stride,
                    dilation=layer_dilation,
                )
            )

        # With `finegrained_output` and a depth multiplier below 1 the last layer keeps its
        # full 1280 channels; otherwise it is scaled like every other layer.
        if config.finegrained_output and config.depth_multiplier < 1.0:
            output_channels = 1280
        else:
            output_channels = apply_depth_multiplier(config, 1280)

        self.conv_1x1 = MobileNetV2ConvLayer(
            config,
            in_channels=channels[-1],
            out_channels=output_channels,
            kernel_size=1,
        )

        self.pooler = nn.AdaptiveAvgPool2d((1, 1)) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError

    # No hand-written docstring here: the two decorators below attach the argument
    # documentation (`MOBILENET_V2_INPUTS_DOCSTRING`) and a code sample to this method.
    @add_start_docstrings_to_model_forward(MOBILENET_V2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
        # Arguments left as None fall back to the values stored in the config.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.conv_stem(pixel_values)

        # Collect the output of every inverted-residual block only when requested.
        all_hidden_states = () if output_hidden_states else None

        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        last_hidden_state = self.conv_1x1(hidden_states)

        if self.pooler is not None:
            # Pool to 1x1 and flatten everything after the batch dimension.
            pooled_output = torch.flatten(self.pooler(last_hidden_state), start_dim=1)
        else:
            pooled_output = None

        if not return_dict:
            # Tuple form: entries that are None are dropped, so positions shift accordingly.
            return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None)

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=all_hidden_states,
        )
``
    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 如果 return_dict 不为 None，则使用指定的 return_dict，否则使用 self.config.use_return_dict
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 使用 MobileNetV2 模型处理像素值，返回输出结果，可以包含隐藏状态
        outputs = self.mobilenet_v2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        # 如果 return_dict 为 True，则从 outputs 中获取 pooler_output；否则从 outputs 的第二个元素获取
        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        # 对池化后的输出进行 dropout 和分类器处理，得到 logits
        logits = self.classifier(self.dropout(pooled_output))

        # 初始化损失为 None
        loss = None
        # 如果 labels 不为 None，则计算损失
        if labels is not None:
            # 根据配置确定问题类型
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            # 根据问题类型计算相应的损失
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # 如果只有一个标签，使用 squeeze 处理 logits 和 labels 后计算损失
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                # 单标签分类问题，使用交叉熵损失函数
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                # 多标签分类问题，使用带 logits 的二元交叉熵损失函数
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        # 如果 return_dict 为 False，则按照指定格式返回结果
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 如果 return_dict 为 True，则返回 ImageClassifierOutputWithNoAttention 对象
        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )
class MobileNetV2DeepLabV3Plus(nn.Module):
    """
    The neural network from the paper "Encoder-Decoder with Atrous Separable Convolution for Semantic Image
    Segmentation" https://arxiv.org/abs/1802.02611

    The head combines a globally pooled branch with a 1x1-convolution branch, projects the concatenation back to
    256 channels and classifies every spatial position into `config.num_labels` channels.
    """

    def __init__(self, config: MobileNetV2Config) -> None:
        super().__init__()

        # Settings shared by the three 1x1 feature convolutions (pool branch, ASPP branch, projection).
        pointwise_kwargs = {
            "kernel_size": 1,
            "stride": 1,
            "use_normalization": True,
            "use_activation": "relu",
            "layer_norm_eps": 1e-5,
        }

        # Pool branch: collapse every feature map to a single 1x1 value, then convolve to 256 channels.
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size=1)
        self.conv_pool = MobileNetV2ConvLayer(
            config,
            in_channels=apply_depth_multiplier(config, 320),
            out_channels=256,
            **pointwise_kwargs,
        )

        # ASPP branch: 1x1 convolution applied directly to the incoming features.
        self.conv_aspp = MobileNetV2ConvLayer(
            config,
            in_channels=apply_depth_multiplier(config, 320),
            out_channels=256,
            **pointwise_kwargs,
        )

        # Projection: the two 256-channel branches are concatenated (512 channels) and reduced to 256.
        self.conv_projection = MobileNetV2ConvLayer(
            config,
            in_channels=512,
            out_channels=256,
            **pointwise_kwargs,
        )

        self.dropout = nn.Dropout2d(config.classifier_dropout_prob)

        # Final 1x1 convolution with bias and without normalization/activation: one channel per label.
        self.classifier = MobileNetV2ConvLayer(
            config,
            in_channels=256,
            out_channels=config.num_labels,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
            bias=True,
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run the segmentation head on `features` and return the per-position class logits."""
        height_width = features.shape[-2:]

        # Pool branch, resized back to the spatial size of the incoming features.
        pooled = self.conv_pool(self.avg_pool(features))
        pooled = nn.functional.interpolate(pooled, size=height_width, mode="bilinear", align_corners=True)

        aspp = self.conv_aspp(features)

        # Concatenate along the channel axis, project, regularize and classify.
        fused = self.conv_projection(torch.cat([pooled, aspp], dim=1))
        return self.classifier(self.dropout(fused))


@add_start_docstrings(
    """
    MobileNetV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    """,
    MOBILENET_V2_START_DOCSTRING,
)
class MobileNetV2ForSemanticSegmentation(MobileNetV2PreTrainedModel):
    def __init__(self, config: MobileNetV2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        # Backbone is built without its pooling layer; the head below consumes the spatial feature map.
        self.mobilenet_v2 = MobileNetV2Model(config, add_pooling_layer=False)
        self.segmentation_head = MobileNetV2DeepLabV3Plus(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MOBILENET_V2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. The logits are upsampled to the last
            two dimensions of `labels` and a cross-entropy loss is computed, ignoring positions equal to
            `config.semantic_loss_ignore_index`. Requires `config.num_labels > 1`.

        Returns:
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilenet_v2(
            pixel_values,
            output_hidden_states=True,  # we need the intermediate hidden states
            return_dict=return_dict,
        )

        # In tuple mode the hidden states sit at index 1 of the backbone output.
        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # The head only consumes the last hidden state.
        logits = self.segmentation_head(encoder_hidden_states[-1])

        loss = None
        if labels is not None:
            if self.config.num_labels == 1:
                raise ValueError("标签数量应大于1")
            else:
                # Upsample logits to the spatial size of the labels before computing the loss.
                upsampled_logits = nn.functional.interpolate(
                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
                )
                loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
                loss = loss_fct(upsampled_logits, labels)

        if not return_dict:
            if output_hidden_states:
                # Caller asked for hidden states: keep everything after the backbone's first output.
                output = (logits,) + outputs[1:]
            else:
                # Hidden states were only requested internally: drop them from the returned tuple.
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )

`.\models\mobilenet_v2\__init__.py`

# TYPE_CHECKING is True only while a static type checker analyses the module.
from typing import TYPE_CHECKING

# Helpers for optional dependencies and for the lazy module installed at the bottom of this file.
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# Maps each submodule name to the public names it provides; passed to _LazyModule below.
_import_structure = {
    "configuration_mobilenet_v2": [
        "MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "MobileNetV2Config",
        "MobileNetV2OnnxConfig",
    ],
}

# Register the vision-dependent submodules only when the vision backend is available.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Vision backend present: expose the feature extractor and the image processor.
    _import_structure["feature_extraction_mobilenet_v2"] = ["MobileNetV2FeatureExtractor"]
    _import_structure["image_processing_mobilenet_v2"] = ["MobileNetV2ImageProcessor"]

# Register the modeling submodule only when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Torch present: expose the model classes and the TF weight loader.
    _import_structure["modeling_mobilenet_v2"] = [
        "MOBILENET_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "MobileNetV2ForImageClassification",
        "MobileNetV2ForSemanticSegmentation",
        "MobileNetV2Model",
        "MobileNetV2PreTrainedModel",
        "load_tf_weights_in_mobilenet_v2",
    ]

# Under a type checker, perform the real imports so the names resolve statically.
if TYPE_CHECKING:
    # Configuration names have no optional dependency.
    from .configuration_mobilenet_v2 import (
        MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP,
        MobileNetV2Config,
        MobileNetV2OnnxConfig,
    )

    # Same vision availability gate as above, this time guarding the actual imports.
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Vision backend present: import the feature extractor and the image processor.
        from .feature_extraction_mobilenet_v2 import MobileNetV2FeatureExtractor
        from .image_processing_mobilenet_v2 import MobileNetV2ImageProcessor

    # Same torch availability gate as above, this time guarding the actual imports.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Torch present: import the model classes and the TF weight loader.
        from .modeling_mobilenet_v2 import (
            MOBILENET_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
            MobileNetV2ForImageClassification,
            MobileNetV2ForSemanticSegmentation,
            MobileNetV2Model,
            MobileNetV2PreTrainedModel,
            load_tf_weights_in_mobilenet_v2,
        )

# At runtime, replace this module in sys.modules with a _LazyModule built from _import_structure.
else:
    import sys

    # The replacement receives this module's name, file path, import structure and module spec.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\mobilevit\configuration_mobilevit.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
# 上面是版权声明和编码声明

# 导入所需的库和模块
from collections import OrderedDict  # 导入有序字典模块
from typing import Mapping  # 导入 Mapping 类型的声明

from packaging import version  # 导入版本信息的包

from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...onnx import OnnxConfig  # 导入 ONNX 配置类
from ...utils import logging  # 导入日志工具模块

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# Maps every pretrained MobileViT checkpoint name to the URL of its config.json.
# See all MobileViT models at https://huggingface.co/models?filter=mobilevit
MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/config.json"
    for checkpoint in (
        "apple/mobilevit-small",
        "apple/mobilevit-x-small",
        "apple/mobilevit-xx-small",
        "apple/deeplabv3-mobilevit-small",
        "apple/deeplabv3-mobilevit-x-small",
        "apple/deeplabv3-mobilevit-xx-small",
    )
}

# Configuration container for MobileViT models; all behavior beyond storing values comes from PretrainedConfig.
class MobileViTConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MobileViTModel`]. It is used to instantiate a
    MobileViT model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MobileViT
    [apple/mobilevit-small](https://huggingface.co/apple/mobilevit-small) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    This class only stores the values below; the descriptions follow the parameter names, so consult the modeling
    code for the exact way each value is used.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        image_size (`int`, *optional*, defaults to 256):
            Size (resolution) of the input images.
        patch_size (`int`, *optional*, defaults to 2):
            Size (resolution) of each patch.
        hidden_sizes (`List[int]`, *optional*, defaults to `[144, 192, 240]`):
            Hidden sizes of the Transformer blocks. Stored as a copy of the given sequence.
        neck_hidden_sizes (`List[int]`, *optional*, defaults to `[16, 32, 64, 96, 128, 160, 640]`):
            Channel counts of the backbone feature maps. Stored as a copy of the given sequence.
        num_attention_heads (`int`, *optional*, defaults to 4):
            Number of attention heads.
        mlp_ratio (`float`, *optional*, defaults to 2.0):
            Expansion ratio of the MLP.
        expand_ratio (`float`, *optional*, defaults to 4.0):
            Expansion ratio of the convolutional blocks.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Name of the activation function.
        conv_kernel_size (`int`, *optional*, defaults to 3):
            Convolution kernel size.
        output_stride (`int`, *optional*, defaults to 32):
            Output stride of the network.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            Dropout probability for hidden layers.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            Dropout probability for attention probabilities.
        classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
            Dropout probability for the classifier.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Range used for weight initialization.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the layer normalization layers.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether the query/key/value projections use a bias.
        aspp_out_channels (`int`, *optional*, defaults to 256):
            Number of output channels of the ASPP module of the semantic segmentation head.
        atrous_rates (`List[int]`, *optional*, defaults to `[6, 12, 18]`):
            Atrous (dilation) rates of the ASPP module. Stored as a copy of the given sequence.
        aspp_dropout_prob (`float`, *optional*, defaults to 0.1):
            Dropout probability of the ASPP module.
        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
            Label index ignored by the semantic segmentation loss.

    Example:

    ```python
    >>> from transformers import MobileViTConfig, MobileViTModel

    >>> # Initializing a mobilevit-small style configuration
    >>> configuration = MobileViTConfig()

    >>> # Initializing a model from the mobilevit-small style configuration
    >>> model = MobileViTModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "mobilevit"

    def __init__(
        self,
        num_channels=3,
        image_size=256,
        patch_size=2,
        hidden_sizes=[144, 192, 240],
        neck_hidden_sizes=[16, 32, 64, 96, 128, 160, 640],
        num_attention_heads=4,
        mlp_ratio=2.0,
        expand_ratio=4.0,
        hidden_act="silu",
        conv_kernel_size=3,
        output_stride=32,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.0,
        classifier_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        qkv_bias=True,
        aspp_out_channels=256,
        atrous_rates=[6, 12, 18],
        aspp_dropout_prob=0.1,
        semantic_loss_ignore_index=255,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        # The list defaults in the signature are created once and shared by every call. Copying them keeps
        # an in-place edit on one config (e.g. `config.hidden_sizes[0] = ...`) from leaking into the
        # defaults and into every other config built afterwards.
        self.hidden_sizes = list(hidden_sizes)
        self.neck_hidden_sizes = list(neck_hidden_sizes)
        self.num_attention_heads = num_attention_heads
        self.mlp_ratio = mlp_ratio
        self.expand_ratio = expand_ratio
        self.hidden_act = hidden_act
        self.conv_kernel_size = conv_kernel_size
        self.output_stride = output_stride
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.classifier_dropout_prob = classifier_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.qkv_bias = qkv_bias

        # Settings of the semantic segmentation head.
        self.aspp_out_channels = aspp_out_channels
        self.atrous_rates = list(atrous_rates)
        self.aspp_dropout_prob = aspp_dropout_prob
        self.semantic_loss_ignore_index = semantic_loss_ignore_index
# ONNX export settings for MobileViT, layered on top of the generic OnnxConfig.
class MobileViTOnnxConfig(OnnxConfig):
    """Describes the named inputs/outputs and validation tolerance used for MobileViT ONNX export."""

    # Parsed version "1.11", exposed as the minimum torch version for ONNX export.
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from input name to the names of its axes."""
        pixel_axes = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        return OrderedDict([("pixel_values", pixel_axes)])

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from output name to its named axes; the set of outputs depends on `self.task`."""
        if self.task == "image-classification":
            output_names = ("logits",)
        else:
            output_names = ("last_hidden_state", "pooler_output")
        # Every output declares only its leading axis, named "batch".
        return OrderedDict((output_name, {0: "batch"}) for output_name in output_names)

    @property
    def atol_for_validation(self) -> float:
        """Absolute tolerance (1e-4) reported for validation."""
        return 1e-4

`.\models\mobilevit\convert_mlcvnets_to_pytorch.py`

# 设置脚本的编码格式为UTF-8
# 版权声明，使用 Apache License, Version 2.0 许可协议
# 详细许可信息可以在 http://www.apache.org/licenses/LICENSE-2.0 找到
# 本脚本用于从 ml-cvnets 库中转换 MobileViT 模型检查点

# 引入必要的库和模块
import argparse  # 用于解析命令行参数
import json  # 用于处理 JSON 数据
from pathlib import Path  # 提供处理文件和目录路径的类和函数

import requests  # 用于发送 HTTP 请求
import torch  # PyTorch 深度学习框架
from huggingface_hub import hf_hub_download  # 下载模型文件的辅助函数
from PIL import Image  # Python Imaging Library，处理图像的库

# 从 transformers 库中导入相关模块和函数
from transformers import (
    MobileViTConfig,  # MobileViT 模型配置类
    MobileViTForImageClassification,  # MobileViT 图像分类模型
    MobileViTForSemanticSegmentation,  # MobileViT 语义分割模型
    MobileViTImageProcessor,  # MobileViT 图像处理器
)
from transformers.utils import logging  # transformers 模块的日志记录工具

# 设置日志记录器的详细程度为 INFO
logging.set_verbosity_info()
# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)


def get_mobilevit_config(mobilevit_name):
    """
    Build the `MobileViTConfig` matching an original ml-cvnets checkpoint name.

    Args:
        mobilevit_name (`str`): Checkpoint name. The substrings `mobilevit_s`, `mobilevit_xs` and `mobilevit_xxs`
            select the size-specific settings; a `deeplabv3_` prefix selects the segmentation settings.

    Returns:
        `MobileViTConfig`: Configuration with size-specific fields, `num_labels`, `id2label` and `label2id` set.
        The label mapping is downloaded from the `huggingface/label-files` dataset repository.
    """
    config = MobileViTConfig()

    # Size-specific architecture settings.
    if "mobilevit_s" in mobilevit_name:
        config.hidden_sizes = [144, 192, 240]
        config.neck_hidden_sizes = [16, 32, 64, 96, 128, 160, 640]
    elif "mobilevit_xs" in mobilevit_name:
        config.hidden_sizes = [96, 120, 144]
        config.neck_hidden_sizes = [16, 32, 48, 64, 80, 96, 384]
    elif "mobilevit_xxs" in mobilevit_name:
        config.hidden_sizes = [64, 80, 96]
        config.neck_hidden_sizes = [16, 16, 24, 48, 64, 80, 320]
        config.hidden_dropout_prob = 0.05
        config.expand_ratio = 2.0

    # Task-specific settings: segmentation checkpoints use 21 labels, classification ones use 1000.
    if mobilevit_name.startswith("deeplabv3_"):
        config.image_size = 512
        config.output_stride = 16
        config.num_labels = 21
        filename = "pascal-voc-id2label.json"
    else:
        config.num_labels = 1000
        filename = "imagenet-1k-id2label.json"

    repo_id = "huggingface/label-files"
    label_file_path = hf_hub_download(repo_id, filename, repo_type="dataset")
    # Use a context manager so the file handle is closed deterministically instead of being leaked.
    with open(label_file_path, "r", encoding="utf-8") as label_file:
        id2label = json.load(label_file)
    # JSON object keys are strings; the config is given integer ids.
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    return config


def rename_key(name, base_model=False):
    """
    Translate one parameter name from the original checkpoint layout to the layout used by this script's models.

    The substitutions are applied strictly in sequence, so later rules see the output of earlier ones.

    Args:
        name (`str`): Parameter name to translate.
        base_model (`bool`, *optional*, defaults to `False`): When `False`, names that belong neither to the
            classifier (`classifier.fc.`) nor to the segmentation head get the `mobilevit.` prefix.

    Returns:
        `str`: The translated parameter name.
    """
    # `str.replace` leaves the string untouched when the pattern is absent, so no membership test is needed
    # for the unconditional substitutions below.
    for stage in range(1, 6):
        name = name.replace(f"layer_{stage}.", f"encoder.layer.{stage - 1}.")

    early_substitutions = (
        ("conv_1.", "conv_stem."),
        (".block.", "."),
        ("exp_1x1", "expand_1x1"),
        ("red_1x1", "reduce_1x1"),
        (".local_rep.conv_3x3.", ".conv_kxk."),
        (".local_rep.conv_1x1.", ".conv_1x1."),
        (".norm.", ".normalization."),
        (".conv.", ".convolution."),
        (".conv_proj.", ".conv_projection."),
    )
    for old, new in early_substitutions:
        name = name.replace(old, new)

    # ".i.j." with i in {0, 1}: insert a "layer" level between the two indices.
    for outer in range(0, 2):
        for inner in range(0, 4):
            name = name.replace(f".{outer}.{inner}.", f".{outer}.layer.{inner}.")

    # ".i.j." with i in 2..5: drop the inner index and, only when that happened, move the three convolution
    # names under "downsampling_layer".
    for outer in range(2, 6):
        for inner in range(0, 4):
            marker = f".{outer}.{inner}."
            if marker not in name:
                continue
            name = name.replace(marker, f".{outer}.")
            for conv_name in ("expand_1x1", "conv_3x3", "reduce_1x1"):
                name = name.replace(conv_name, f"downsampling_layer.{conv_name}")

    # ".global_rep.<i>.weight/bias" for i in 2..4 becomes the layernorm weight/bias.
    for index in range(2, 5):
        for suffix in ("weight", "bias"):
            name = name.replace(f".global_rep.{index}.{suffix}", f".layernorm.{suffix}")

    late_substitutions = (
        (".global_rep.", ".transformer."),
        (".pre_norm_mha.0.", ".layernorm_before."),
        (".pre_norm_mha.1.out_proj.", ".attention.output.dense."),
        (".pre_norm_ffn.0.", ".layernorm_after."),
        (".pre_norm_ffn.1.", ".intermediate.dense."),
        (".pre_norm_ffn.4.", ".output.dense."),
        (".transformer.", ".transformer.layer."),
        (".aspp_layer.", "."),
        (".aspp_pool.", "."),
        ("seg_head.", "segmentation_head."),
        ("segmentation_head.classifier.classifier.", "segmentation_head.classifier."),
    )
    for old, new in late_substitutions:
        name = name.replace(old, new)

    # Classifier weights never receive the "mobilevit." prefix.
    if "classifier.fc." in name:
        return name.replace("classifier.fc.", "classifier.")
    if not base_model and "segmentation_head." not in name:
        return "mobilevit." + name
    return name
# Rewrite an original checkpoint state dict, in place, into the key layout expected by the MobileViT models.
def convert_state_dict(orig_state_dict, model, base_model=False):
    """
    Rename every entry of `orig_state_dict` and split fused qkv entries into query/key/value.

    Args:
        orig_state_dict: Mapping from original parameter names to values; it is modified in place.
        model: Target model; only `get_submodule` is used, to read each attention layer's `all_head_size`.
        base_model (`bool`, *optional*, defaults to `False`): When `False`, the `mobilevit.` prefix is used for
            submodule lookups and qkv keys, and `rename_key` is called with `base_model=False`.

    Returns:
        The same `orig_state_dict` object, now holding the converted keys.
    """
    model_prefix = "" if base_model else "mobilevit."

    # Snapshot the keys first: the dict is mutated while we walk over it.
    for original_key in list(orig_state_dict):
        value = orig_state_dict.pop(original_key)

        # Drop a leading "encoder." (8 characters) when present.
        key = original_key[8:] if original_key[:8] == "encoder." else original_key

        if "qkv" not in key:
            orig_state_dict[rename_key(key, base_model)] = value
            continue

        # Key looks like "layer_<n>.<...>.<...>.<transformer index>....": pull out both indices.
        parts = key.split(".")
        layer_num = int(parts[0][6:]) - 1
        transformer_num = int(parts[3])

        encoder_layer = model.get_submodule(f"{model_prefix}encoder.layer.{layer_num}")
        dim = encoder_layer.transformer.layer[transformer_num].attention.attention.all_head_size

        prefix = (
            f"{model_prefix}encoder.layer.{layer_num}.transformer.layer.{transformer_num}.attention.attention."
        )

        # The fused tensor is cut along its first axis: first `dim` rows -> query, next `dim` -> key,
        # last `dim` -> value.
        row_slices = (
            ("query", slice(None, dim)),
            ("key", slice(dim, dim * 2)),
            ("value", slice(-dim, None)),
        )
        is_weight = "weight" in key
        for projection, rows in row_slices:
            if is_weight:
                orig_state_dict[prefix + projection + ".weight"] = value[rows, :]
            else:
                orig_state_dict[prefix + projection + ".bias"] = value[rows]

    return orig_state_dict


# We will verify our results on an image of cute cats
def prepare_img():
    """
    Download and open the sample image used to verify the converted model.

    `convert_movilevit_checkpoint` calls this helper, but it was missing from the file, so every conversion
    failed with a NameError.
    """
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # A timeout keeps the conversion from hanging forever on a stalled connection.
    return Image.open(requests.get(url, stream=True, timeout=30).raw)


@torch.no_grad()
def convert_movilevit_checkpoint(mobilevit_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub=False):
    """
    Copy/paste/tweak model's weights to our MobileViT structure.

    Args:
        mobilevit_name (`str`): Name of the original model; a `deeplabv3_` prefix selects the segmentation model.
        checkpoint_path (`str`): Path of the original state dict, loaded with `torch.load` on CPU.
        pytorch_dump_folder_path (`str`): Directory the converted model and image processor are saved to; it is
            created (including missing parents) when it does not exist.
        push_to_hub (`bool`, *optional*, defaults to `False`): Whether to also push the model and image processor
            to the hub under the "apple" organization.

    Raises:
        ValueError: If `mobilevit_name` has no reference logits to verify against.
        AssertionError: If the logits of the converted model do not match the reference shape or values.
    """
    config = get_mobilevit_config(mobilevit_name)

    # Load the original state dict.
    state_dict = torch.load(checkpoint_path, map_location="cpu")

    # Pick the model class matching the task of the checkpoint.
    if mobilevit_name.startswith("deeplabv3_"):
        model = MobileViTForSemanticSegmentation(config).eval()
    else:
        model = MobileViTForImageClassification(config).eval()

    new_state_dict = convert_state_dict(state_dict, model)
    model.load_state_dict(new_state_dict)

    # Run the converted model on the sample image prepared by MobileViTImageProcessor.
    image_processor = MobileViTImageProcessor(crop_size=config.image_size, size=config.image_size + 32)
    encoding = image_processor(images=prepare_img(), return_tensors="pt")
    outputs = model(**encoding)
    logits = outputs.logits

    if mobilevit_name.startswith("deeplabv3_"):
        assert logits.shape == (1, 21, 32, 32)

        # Reference values for the top-left 3x3 patch of the first three label channels.
        if mobilevit_name == "deeplabv3_mobilevit_s":
            expected_logits = torch.tensor(
                [
                    [[6.2065, 6.1292, 6.2070], [6.1079, 6.1254, 6.1747], [6.0042, 6.1071, 6.1034]],
                    [[-6.9253, -6.8653, -7.0398], [-7.3218, -7.3983, -7.3670], [-7.1961, -7.2482, -7.1569]],
                    [[-4.4723, -4.4348, -4.3769], [-5.3629, -5.4632, -5.4598], [-5.1587, -5.3402, -5.5059]],
                ]
            )
        elif mobilevit_name == "deeplabv3_mobilevit_xs":
            expected_logits = torch.tensor(
                [
                    [[5.4449, 5.5733, 5.6314], [5.1815, 5.3930, 5.5963], [5.1656, 5.4333, 5.4853]],
                    [[-9.4423, -9.7766, -9.6714], [-9.1581, -9.5720, -9.5519], [-9.1006, -9.6458, -9.5703]],
                    [[-7.7721, -7.3716, -7.1583], [-8.4599, -8.0624, -7.7944], [-8.4172, -7.8366, -7.5025]],
                ]
            )
        elif mobilevit_name == "deeplabv3_mobilevit_xxs":
            expected_logits = torch.tensor(
                [
                    [[6.9811, 6.9743, 7.3123], [7.1777, 7.1931, 7.3938], [7.5633, 7.8050, 7.8901]],
                    [[-10.5536, -10.2332, -10.2924], [-10.2336, -9.8624, -9.5964], [-10.8840, -10.8158, -10.6659]],
                    [[-3.4938, -3.0631, -2.8620], [-3.4205, -2.8135, -2.6875], [-3.4179, -2.7945, -2.8750]],
                ]
            )
        else:
            raise ValueError(f"Unknown mobilevit_name: {mobilevit_name}")

        assert torch.allclose(logits[0, :3, :3, :3], expected_logits, atol=1e-4)
    else:
        assert logits.shape == (1, 1000)

        # Reference values for the first three class logits.
        if mobilevit_name == "mobilevit_s":
            expected_logits = torch.tensor([-0.9866, 0.2392, -1.1241])
        elif mobilevit_name == "mobilevit_xs":
            expected_logits = torch.tensor([-2.4761, -0.9399, -1.9587])
        elif mobilevit_name == "mobilevit_xxs":
            expected_logits = torch.tensor([-1.9364, -1.2327, -0.4653])
        else:
            raise ValueError(f"Unknown mobilevit_name: {mobilevit_name}")

        assert torch.allclose(logits[0, :3], expected_logits, atol=1e-4)

    # parents=True lets a nested output path be created in one go instead of failing on a missing parent.
    Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model {mobilevit_name} to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    print(f"Saving image processor to {pytorch_dump_folder_path}")
    image_processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        # Original checkpoint name -> repository name on the hub.
        model_mapping = {
            "mobilevit_s": "mobilevit-small",
            "mobilevit_xs": "mobilevit-x-small",
            "mobilevit_xxs": "mobilevit-xx-small",
            "deeplabv3_mobilevit_s": "deeplabv3-mobilevit-small",
            "deeplabv3_mobilevit_xs": "deeplabv3-mobilevit-x-small",
            "deeplabv3_mobilevit_xxs": "deeplabv3-mobilevit-xx-small",
        }

        print("Pushing to the hub...")
        model_name = model_mapping[mobilevit_name]
        image_processor.push_to_hub(model_name, organization="apple")
        model.push_to_hub(model_name, organization="apple")


if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion.
    arg_parser = argparse.ArgumentParser()

    # Which original model to convert; optional, falls back to "mobilevit_s".
    arg_parser.add_argument(
        "--mobilevit_name",
        type=str,
        default="mobilevit_s",
        help=(
            "Name of the MobileViT model you'd like to convert. Should be one of 'mobilevit_s', 'mobilevit_xs',"
            " 'mobilevit_xxs', 'deeplabv3_mobilevit_s', 'deeplabv3_mobilevit_xs', 'deeplabv3_mobilevit_xxs'."
        ),
    )

    # Required: where the original state dict lives.
    arg_parser.add_argument(
        "--checkpoint_path", type=str, required=True, help="Path to the original state dict (.pt file)."
    )

    # Required: where the converted model is written.
    arg_parser.add_argument(
        "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory."
    )

    # Flag: present means True.
    arg_parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )

    cli_args = arg_parser.parse_args()

    convert_movilevit_checkpoint(
        mobilevit_name=cli_args.mobilevit_name,
        checkpoint_path=cli_args.checkpoint_path,
        pytorch_dump_folder_path=cli_args.pytorch_dump_folder_path,
        push_to_hub=cli_args.push_to_hub,
    )

`.\models\mobilevit\feature_extraction_mobilevit.py`

# 设置文件编码为 UTF-8
# 版权声明和许可证信息，声明代码版权归 HuggingFace Inc. 团队所有
#
# 根据 Apache License, Version 2.0 许可证使用本文件
# 除非符合许可证要求，否则不得使用本文件
# 可以从以下链接获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按"原样"分发软件
# 没有任何明示或暗示的担保或条件
# 请查阅许可证了解具体的法律权利和限制
"""MobileViT 的特征提取器类。"""

# 导入警告模块
import warnings

# 从 utils 模块导入 logging
from ...utils import logging
# 从 image_processing_mobilevit 模块导入 MobileViTImageProcessor 类
from .image_processing_mobilevit import MobileViTImageProcessor

# 获取 logger 对象用于记录日志
logger = logging.get_logger(__name__)


class MobileViTFeatureExtractor(MobileViTImageProcessor):
    """Deprecated name for `MobileViTImageProcessor`; emits a `FutureWarning` on construction."""

    def __init__(self, *args, **kwargs) -> None:
        deprecation_message = (
            "The class MobileViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use MobileViTImageProcessor instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        # Everything else is delegated unchanged to the image processor.
        super().__init__(*args, **kwargs)

`.\models\mobilevit\image_processing_mobilevit.py`

# coding=utf-8
# 定义文件编码格式为 UTF-8

# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
# 版权声明，保留所有权利

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 许可证授权

# you may not use this file except in compliance with the License.
# 除非符合许可证，否则不得使用此文件

# You may obtain a copy of the License at
# 您可以在以下网址获取许可证副本

#     http://www.apache.org/licenses/LICENSE-2.0
#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非法律要求或书面同意，否则软件依"原样"分发，不附任何明示或暗示的担保或条件

# See the License for the specific language governing permissions and
# limitations under the License.
# 请查阅许可证获取详细的权限和限制条款

"""Image processor class for MobileViT."""
# MobileViT 的图像处理器类

from typing import Dict, List, Optional, Tuple, Union
# 引入必要的类型提示模块

import numpy as np
# 导入 NumPy 库

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
# 从图像处理工具中导入基础图像处理器、批量特征和获取尺寸字典函数

from ...image_transforms import flip_channel_order, get_resize_output_image_size, resize, to_channel_dimension_format
# 从图像变换模块导入反转通道顺序、获取调整后图像大小、调整大小和转换通道维度格式函数

from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
# 从图像工具模块导入相关函数和枚举类型

from ...utils import TensorType, is_torch_available, is_torch_tensor, is_vision_available, logging
# 从工具模块导入张量类型、检查是否有 Torch 库、是否为 Torch 张量、是否可用 Vision 模块和日志记录函数

if is_vision_available():
    import PIL
    # 如果 Vision 可用，导入 PIL 库

if is_torch_available():
    import torch
    # 如果 Torch 可用，导入 Torch 库

logger = logging.get_logger(__name__)
# 获取当前模块的日志记录器对象


class MobileViTImageProcessor(BaseImageProcessor):
    r"""
    Constructs a MobileViT image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`.
            Can be overridden by the `do_resize` parameter in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
            Controls the size of the output image after resizing.
            Can be overridden by the `size` parameter in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Defines the resampling filter to use if resizing the image.
            Can be overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
            Can be overridden by the `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
            Can be overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to crop the input at the center.
            Can be overridden by the `do_center_crop` parameter in the `preprocess` method.
        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`):
            Desired output size `(size["height"], size["width"])` when applying center-cropping.
            Can be overridden by the `crop_size` parameter in the `preprocess` method.
        do_flip_channel_order (`bool`, *optional*, defaults to `True`):
            Whether to flip the color channels from RGB to BGR.
            Can be overridden by the `do_flip_channel_order` parameter in the `preprocess` method.
    """

    # Names of the model inputs for this processor; here only "pixel_values".
    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_center_crop: bool = True,
        crop_size: Optional[Dict[str, int]] = None,
        do_flip_channel_order: bool = True,
        **kwargs,
    ) -> None:
        """
        Store the default preprocessing settings on the instance.

        Args:
            do_resize (`bool`, *optional*, defaults to `True`):
                Default for whether images are resized.
            size (`Dict[str, int]`, *optional*):
                Resize target. When `None`, `{"shortest_edge": 224}` is used. The value is normalized through
                `get_size_dict(..., default_to_square=False)`.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Default resampling filter.
            do_rescale (`bool`, *optional*, defaults to `True`):
                Default for whether pixel values are rescaled.
            rescale_factor (`int` or `float`, *optional*, defaults to `1 / 255`):
                Default rescale factor.
            do_center_crop (`bool`, *optional*, defaults to `True`):
                Default for whether images are center-cropped.
            crop_size (`Dict[str, int]`, *optional*):
                Center-crop target. When `None`, `{"height": 256, "width": 256}` is used. The value is normalized
                through `get_size_dict(..., param_name="crop_size")`.
            do_flip_channel_order (`bool`, *optional*, defaults to `True`):
                Default for whether the channel order is flipped.
            **kwargs:
                Forwarded unchanged to the parent class initializer.
        """
        super().__init__(**kwargs)

        # Fill in the defaults here rather than in the signature so no mutable dict is shared between instances.
        size = size if size is not None else {"shortest_edge": 224}
        size = get_size_dict(size, default_to_square=False)
        crop_size = crop_size if crop_size is not None else {"height": 256, "width": 256}
        crop_size = get_size_dict(crop_size, param_name="crop_size")

        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_flip_channel_order = do_flip_channel_order
        # Keyword names recorded as valid processor arguments.
        # NOTE(review): presumably consumed by `validate_kwargs` in `preprocess`, which is not visible here.
        self._valid_processor_keys = [
            "images",
            "segmentation_maps",
            "do_resize",
            "size",
            "resample",
            "do_rescale",
            "rescale_factor",
            "do_center_crop",
            "crop_size",
            "do_flip_channel_order",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize with PILImageResampling.BICUBIC->PILImageResampling.BILINEAR
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image. When `size` has a `"shortest_edge"` key, the shortest edge is resized to that value and the
        longest edge is resized to keep the input aspect ratio. Otherwise `size` must provide both `"height"` and
        `"width"`, which are used as the target size.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Raises:
            ValueError: If `size` has neither `"shortest_edge"` nor both `"height"` and `"width"`.
        """
        # "shortest_edge" takes precedence when both kinds of key are present.
        if "shortest_edge" in size:
            requested_size, square_by_default = size["shortest_edge"], False
        elif "height" in size and "width" in size:
            requested_size, square_by_default = (size["height"], size["width"]), True
        else:
            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")

        output_size = get_resize_output_image_size(
            image,
            size=requested_size,
            default_to_square=square_by_default,
            input_data_format=input_data_format,
        )
        # `resize` here is the module-level transform, not this method.
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def flip_channel_order(
        self,
        image: np.ndarray,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Flip the color channels from RGB to BGR or vice versa.

        Thin method wrapper around the module-level `flip_channel_order` transform.

        Args:
            image (`np.ndarray`):
                The image, represented as a numpy array.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # Inside the method body the name resolves to the imported function, not to this method.
        flipped_image = flip_channel_order(image, data_format=data_format, input_data_format=input_data_format)
        return flipped_image

    def __call__(self, images, segmentation_maps=None, **kwargs):
        """
        Preprocesses a batch of images and optionally segmentation maps.

        Overrides the parent `__call__` so that both images and segmentation maps can be passed in as positional
        arguments; the maps are handed on to the parent as a keyword argument.
        """
        parent_call = super().__call__
        return parent_call(images, segmentation_maps=segmentation_maps, **kwargs)
    def _preprocess(
        self,
        image: ImageInput,
        do_resize: bool,
        do_rescale: bool,
        do_center_crop: bool,
        do_flip_channel_order: bool,
        size: Optional[Dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        rescale_factor: Optional[float] = None,
        crop_size: Optional[Dict[str, int]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Apply the enabled transforms to one image, in a fixed order: resize, rescale, center crop, channel flip.

        Each step runs only when its `do_*` flag is truthy, and each step receives the output of the previous one.

        Args:
            image: The image to transform.
            do_resize: Whether to call `self.resize` with `size` and `resample`.
            do_rescale: Whether to call `self.rescale` with `rescale_factor`.
            do_center_crop: Whether to call `self.center_crop` with `crop_size`.
            do_flip_channel_order: Whether to call `self.flip_channel_order`.
            size: Passed to `self.resize` as `size`.
            resample: Passed to `self.resize` as `resample`.
            rescale_factor: Passed to `self.rescale` as `scale`.
            crop_size: Passed to `self.center_crop` as `size`.
            input_data_format: Channel dimension format of `image`, forwarded to every step.

        Returns:
            The transformed image.
        """
        # NOTE(review): `rescale` and `center_crop` are not defined in the visible part of this class;
        # presumably inherited from `BaseImageProcessor` - confirm.
        if do_resize:
            image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)

        if do_rescale:
            image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)

        if do_center_crop:
            image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)

        if do_flip_channel_order:
            image = self.flip_channel_order(image, input_data_format=input_data_format)

        return image

    def _preprocess_image(
        self,
        image: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_center_crop: Optional[bool] = None,
        crop_size: Optional[Dict[str, int]] = None,
        do_flip_channel_order: Optional[bool] = None,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Preprocesses a single image.

        The image is converted to a numpy array, run through `self._preprocess` with the given flags and settings,
        and finally converted to the channel layout requested by `data_format`.

        Args:
            image: The image to preprocess.
            do_resize, size, resample, do_rescale, rescale_factor, do_center_crop, crop_size, do_flip_channel_order:
                Forwarded unchanged to `self._preprocess`.
            data_format: Channel dimension format of the returned image.
            input_data_format: Channel dimension format of `image`; inferred from the array when `None`.

        Returns:
            The preprocessed image.
        """
        # All downstream transforms operate on numpy arrays.
        image = to_numpy_array(image)

        # Warn (once) when rescaling was requested for an image that already looks rescaled.
        if is_scaled_image(image) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        # Infer the channel layout only when the caller did not specify it.
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

        image = self._preprocess(
            image=image,
            do_resize=do_resize,
            size=size,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_center_crop=do_center_crop,
            crop_size=crop_size,
            do_flip_channel_order=do_flip_channel_order,
            input_data_format=input_data_format,
        )

        # Convert from the input channel layout to the requested output layout.
        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)

        return image

    def _preprocess_mask(
        self,
        segmentation_map: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_center_crop: Optional[bool] = None,
        crop_size: Optional[Dict[str, int]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Preprocesses a single mask.

        The mask goes through `self._preprocess` with rescaling and channel flipping disabled and with
        `PILImageResampling.NEAREST` as the resampling filter, then is cast to `np.int64`.

        Args:
            segmentation_map: The mask to preprocess.
            do_resize: Whether to resize the mask.
            size: Resize target, forwarded to `self._preprocess`.
            do_center_crop: Whether to center-crop the mask.
            crop_size: Crop target, forwarded to `self._preprocess`.
            input_data_format: Channel dimension format of the mask. Ignored for 2-D masks, which get a leading
                channel axis; inferred (assuming one channel) for other masks when `None`.

        Returns:
            The preprocessed mask as an `np.int64` array. A 2-D input has its temporary channel axis removed again.
        """
        segmentation_map = to_numpy_array(segmentation_map)

        # Add a channel dimension if it is missing; the transforms below are called with an explicit channel layout.
        if segmentation_map.ndim == 2:
            added_channel_dim = True
            segmentation_map = segmentation_map[None, ...]
            input_data_format = ChannelDimension.FIRST
        else:
            added_channel_dim = False
            if input_data_format is None:
                input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1)

        segmentation_map = self._preprocess(
            image=segmentation_map,
            do_resize=do_resize,
            size=size,
            resample=PILImageResampling.NEAREST,
            do_rescale=False,
            do_center_crop=do_center_crop,
            crop_size=crop_size,
            do_flip_channel_order=False,
            input_data_format=input_data_format,
        )

        # Remove the extra channel dimension if it was added only for processing.
        if added_channel_dim:
            segmentation_map = segmentation_map.squeeze(0)

        segmentation_map = segmentation_map.astype(np.int64)
        return segmentation_map
    def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
        """
        Converts the output of [`MobileViTForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.

        Args:
            outputs ([`MobileViTForSemanticSegmentation`]):
                Raw outputs of the model.
            target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
                predictions will not be resized.

        Returns:
            semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
            segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
            specified). Each entry of each `torch.Tensor` correspond to a semantic class id.

        Raises:
            ValueError: If `target_sizes` is given but its length differs from the batch dimension of the logits.
        """
        # TODO: add support for other frameworks

        logits = outputs.logits

        # Without target sizes: argmax over the class dimension, then split the batch into a list.
        if target_sizes is None:
            class_ids = logits.argmax(dim=1)
            return [class_ids[sample] for sample in range(class_ids.shape[0])]

        if len(logits) != len(target_sizes):
            raise ValueError(
                "Make sure that you pass in as many target sizes as the batch dimension of the logits"
            )

        if is_torch_tensor(target_sizes):
            target_sizes = target_sizes.numpy()

        semantic_segmentation = []
        for idx, sample_logits in enumerate(logits):
            # Interpolate each sample on its own since every sample may have a different target size.
            upsampled = torch.nn.functional.interpolate(
                sample_logits.unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
            )
            semantic_segmentation.append(upsampled[0].argmax(dim=0))

        return semantic_segmentation

`.\models\mobilevit\modeling_mobilevit.py`

# coding=utf-8
# Copyright 2022 Apple Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original license: https://github.com/apple/ml-cvnets/blob/main/LICENSE
""" PyTorch MobileViT model."""

# 导入数学库
import math
# 导入类型提示
from typing import Dict, Optional, Set, Tuple, Union

# 导入 PyTorch 库
import torch
# 导入 PyTorch 中的模型定义相关模块
import torch.utils.checkpoint
from torch import nn
# 导入损失函数
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# 导入激活函数映射
from ...activations import ACT2FN
# 导入模型输出定义
from ...modeling_outputs import (
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
    SemanticSegmenterOutput,
)
# 导入模型工具函数
from ...modeling_utils import PreTrainedModel
# 导入 PyTorch 工具函数
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
# 导入通用工具函数
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)

# Module-level logger
logger = logging.get_logger(__name__)


# General docstring
_CONFIG_FOR_DOC = "MobileViTConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "apple/mobilevit-small"
_EXPECTED_OUTPUT_SHAPE = [1, 640, 8, 8]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "apple/mobilevit-small"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"


# Identifiers of the pretrained MobileViT checkpoints
MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "apple/mobilevit-small",
    "apple/mobilevit-x-small",
    "apple/mobilevit-xx-small",
    "apple/deeplabv3-mobilevit-small",
    "apple/deeplabv3-mobilevit-x-small",
    "apple/deeplabv3-mobilevit-xx-small",
    # See all MobileViT models at https://huggingface.co/models?filter=mobilevit
]


def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int:
    """
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py

    Args:
        value: The channel count to round.
        divisor: The result is a multiple of this number (when `min_value` does not take over).
        min_value: Lower bound for the result; falls back to `divisor` when `None`.

    Returns:
        `value` rounded to the nearest multiple of `divisor`, clamped from below by the lower bound, and bumped up
        by one `divisor` if that result is below 90% of `value`.
    """
    lower_bound = divisor if min_value is None else min_value
    # Adding half a divisor before the floor division rounds to the nearest multiple.
    nearest_multiple = int(value + divisor / 2) // divisor * divisor
    result = max(lower_bound, nearest_multiple)
    # Make sure that rounding down does not go down by more than 10%.
    if result < 0.9 * value:
        result += divisor
    return int(result)


class MobileViTConvLayer(nn.Module):
    """
    A 2D convolution optionally followed by batch normalization and an activation function.

    Args:
        config: Model configuration; only `config.hidden_act` is read, and only when `use_activation` is truthy
            and is not itself a string.
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        groups: Number of convolution groups; must divide both `in_channels` and `out_channels`.
        bias: Whether the convolution has a bias term.
        dilation: Convolution dilation.
        use_normalization: Whether to append a `nn.BatchNorm2d` layer.
        use_activation: `False` for no activation, a string to look the activation up in `ACT2FN`, or `True` to use
            `config.hidden_act` (looked up in `ACT2FN` when it is a string, used directly otherwise).

    Raises:
        ValueError: If `in_channels` or `out_channels` is not divisible by `groups`.
    """

    def __init__(
        self,
        config: MobileViTConfig,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        groups: int = 1,
        bias: bool = False,
        dilation: int = 1,
        use_normalization: bool = True,
        use_activation: Union[bool, str] = True,
    ) -> None:
        super().__init__()

        padding = int((kernel_size - 1) / 2) * dilation

        if in_channels % groups != 0:
            raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.")
        if out_channels % groups != 0:
            raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")

        self.convolution = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            padding_mode="zeros",
        )

        self.normalization = (
            nn.BatchNorm2d(
                num_features=out_channels,
                eps=1e-5,
                momentum=0.1,
                affine=True,
                track_running_stats=True,
            )
            if use_normalization
            else None
        )

        # Resolve the activation: disabled, named explicitly, or taken from the config.
        if not use_activation:
            self.activation = None
        elif isinstance(use_activation, str):
            self.activation = ACT2FN[use_activation]
        elif isinstance(config.hidden_act, str):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run the convolution, then the normalization and activation stages that are present."""
        features = self.convolution(features)
        for optional_stage in (self.normalization, self.activation):
            if optional_stage is not None:
                features = optional_stage(features)
        return features
class MobileViTInvertedResidual(nn.Module):
    """
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381

    The block chains a 1x1 expansion, a 3x3 convolution with one group per expanded channel, and a 1x1 reduction
    without activation. The input is added back to the output only when `stride == 1` and
    `in_channels == out_channels`.

    Raises:
        ValueError: If `stride` is not 1 or 2.
    """

    def __init__(
        self, config: MobileViTConfig, in_channels: int, out_channels: int, stride: int, dilation: int = 1
    ) -> None:
        super().__init__()
        # Width of the expanded representation, rounded to a multiple of 8.
        expanded_channels = make_divisible(int(round(in_channels * config.expand_ratio)), 8)

        if stride not in [1, 2]:
            raise ValueError(f"Invalid stride {stride}.")

        # A skip connection is only shape-compatible when resolution and channel count are unchanged.
        self.use_residual = stride == 1 and in_channels == out_channels

        self.expand_1x1 = MobileViTConvLayer(
            config, in_channels=in_channels, out_channels=expanded_channels, kernel_size=1
        )

        self.conv_3x3 = MobileViTConvLayer(
            config,
            in_channels=expanded_channels,
            out_channels=expanded_channels,
            kernel_size=3,
            stride=stride,
            groups=expanded_channels,
            dilation=dilation,
        )

        self.reduce_1x1 = MobileViTConvLayer(
            config,
            in_channels=expanded_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_activation=False,
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Apply expand -> 3x3 -> reduce, adding the input back when the residual connection is enabled."""
        transformed = self.reduce_1x1(self.conv_3x3(self.expand_1x1(features)))
        if self.use_residual:
            return features + transformed
        return transformed


class MobileViTMobileNetLayer(nn.Module):
    """
    A stack of `num_stages` `MobileViTInvertedResidual` blocks applied one after another.

    Only the first block receives `in_channels` and `stride`; every later block maps `out_channels` to
    `out_channels` with stride 1.
    """

    def __init__(
        self, config: MobileViTConfig, in_channels: int, out_channels: int, stride: int = 1, num_stages: int = 1
    ) -> None:
        super().__init__()

        self.layer = nn.ModuleList()
        for stage_index in range(num_stages):
            is_first = stage_index == 0
            block = MobileViTInvertedResidual(
                config,
                in_channels=in_channels if is_first else out_channels,
                out_channels=out_channels,
                stride=stride if is_first else 1,
            )
            self.layer.append(block)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Feed `features` through every block in order."""
        for block in self.layer:
            features = block(features)
        return features


class MobileViTSelfAttention(nn.Module):
    """
    Multi-head scaled dot-product self-attention.

    Args:
        config: Model configuration; reads `num_attention_heads`, `qkv_bias` and `attention_probs_dropout_prob`.
        hidden_size: Size of the last dimension of the input; must be a multiple of `config.num_attention_heads`.

    Raises:
        ValueError: If `hidden_size` is not a multiple of `config.num_attention_heads`.
    """

    def __init__(self, config: MobileViTConfig, hidden_size: int) -> None:
        super().__init__()

        if hidden_size % config.num_attention_heads != 0:
            # Format `hidden_size` itself; a trailing comma inside the braces would render it as a 1-tuple.
            raise ValueError(
                f"The hidden size {hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into (heads, head_size) and move the head axis to position 1."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Compute self-attention over `hidden_states`.

        Returns a tensor whose leading dimensions match the input and whose last dimension is `all_head_size`.
        """
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores,
        # scaled by the square root of the per-head size.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # Dropout is applied to the attention probabilities themselves.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)

        # Move the head axis back next to the per-head features and merge the two into one dimension.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer
class MobileViTSelfOutput(nn.Module):
    """
    Output projection applied after self-attention: a `hidden_size -> hidden_size` linear layer followed by
    dropout with probability `config.hidden_dropout_prob`.
    """

    def __init__(self, config: MobileViTConfig, hidden_size: int) -> None:
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states` and apply dropout."""
        return self.dropout(self.dense(hidden_states))


class MobileViTAttention(nn.Module):
    """
    Self-attention followed by its output projection, with support for pruning attention heads.
    """

    def __init__(self, config: MobileViTConfig, hidden_size: int) -> None:
        super().__init__()
        self.attention = MobileViTSelfAttention(config, hidden_size)
        self.output = MobileViTSelfOutput(config, hidden_size)
        # Heads removed so far by `prune_heads`.
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        """Remove the given attention heads from the projections and update the head bookkeeping."""
        if len(heads) == 0:
            return

        attention = self.attention
        heads, index = find_pruneable_heads_and_indices(
            heads, attention.num_attention_heads, attention.attention_head_size, self.pruned_heads
        )

        # Prune the query/key/value projections, then the output projection along dim 1.
        for projection_name in ("query", "key", "value"):
            pruned_projection = prune_linear_layer(getattr(attention, projection_name), index)
            setattr(attention, projection_name, pruned_projection)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads.
        attention.num_attention_heads = attention.num_attention_heads - len(heads)
        attention.all_head_size = attention.attention_head_size * attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Run self-attention and pass its result through the output projection."""
        return self.output(self.attention(hidden_states))


class MobileViTIntermediate(nn.Module):
    """
    A `hidden_size -> intermediate_size` linear layer followed by an activation function.

    The activation is `config.hidden_act`: looked up in `ACT2FN` when it is a string, used directly otherwise.
    """

    def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int) -> None:
        super().__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states` and apply the activation."""
        return self.intermediate_act_fn(self.dense(hidden_states))


class MobileViTOutput(nn.Module):
    """
    An `intermediate_size -> hidden_size` linear layer followed by dropout and a residual addition.
    """

    def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int) -> None:
        super().__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states`, apply dropout, and add `input_tensor` as the residual."""
        projected = self.dropout(self.dense(hidden_states))
        return projected + input_tensor
class MobileViTTransformerLayer(nn.Module):
    """
    One transformer block: layer norm -> attention -> residual add, then layer norm -> intermediate -> output,
    where the output module adds the second residual.
    """

    def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int) -> None:
        super().__init__()
        self.attention = MobileViTAttention(config, hidden_size)
        self.intermediate = MobileViTIntermediate(config, hidden_size, intermediate_size)
        self.output = MobileViTOutput(config, hidden_size, intermediate_size)
        # Layer norms applied before the attention and before the intermediate layer, respectively.
        self.layernorm_before = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the attention sub-block and the feed-forward sub-block, each with a residual connection."""
        # First residual: attention over the normalized input, added to the raw input.
        attended = self.attention(self.layernorm_before(hidden_states)) + hidden_states

        # Second residual is added inside `self.output`, which receives `attended` as its `input_tensor`.
        feed_forward = self.intermediate(self.layernorm_after(attended))
        return self.output(feed_forward, attended)


class MobileViTTransformer(nn.Module):
    """A stack of `num_stages` `MobileViTTransformerLayer` modules applied one after another."""

    def __init__(self, config: MobileViTConfig, hidden_size: int, num_stages: int) -> None:
        """
        Args:
            config: Model configuration; `mlp_ratio` is read here and the object is forwarded to each layer.
            hidden_size: Token feature size used by every layer.
            num_stages: Number of transformer layers to create.
        """
        super().__init__()

        # Each layer's feed-forward width is `hidden_size * mlp_ratio`, truncated to an int.
        self.layer = nn.ModuleList(
            [
                MobileViTTransformerLayer(
                    config,
                    hidden_size=hidden_size,
                    intermediate_size=int(hidden_size * config.mlp_ratio),
                )
                for _ in range(num_stages)
            ]
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Pass `hidden_states` through every layer in order and return the final output."""
        for block in self.layer:
            hidden_states = block(hidden_states)
        return hidden_states


class MobileViTLayer(nn.Module):
    """
    MobileViT block: https://arxiv.org/abs/2110.02178

    Optional downsampling, local convolutions, a transformer over unfolded patches, and a fusion
    convolution that merges the transformer output with the block input.
    """

    def __init__(
        self,
        config: MobileViTConfig,
        in_channels: int,
        out_channels: int,
        stride: int,
        hidden_size: int,
        num_stages: int,
        dilation: int = 1,
    ) -> None:
        """
        Args:
            config: Model configuration; `patch_size`, `conv_kernel_size` and `layer_norm_eps` are read here.
            in_channels: Channel count of the incoming feature map.
            out_channels: Channel count after the downsampling stage (only used when `stride == 2`).
            stride: A downsampling stage is created only when this equals 2.
            hidden_size: Token feature size used inside the transformer.
            num_stages: Number of transformer layers.
            dilation: Dilation handed to the downsampling stage (see the comment in the body).
        """
        super().__init__()
        # Patches are square: both sides come from `config.patch_size`.
        self.patch_width = self.patch_height = config.patch_size

        if stride != 2:
            # No downsampling stage; the rest of the block keeps working on `in_channels`.
            self.downsampling_layer = None
        else:
            # When dilation > 1 the stride is dropped to 1 and half of the dilation is forwarded instead.
            self.downsampling_layer = MobileViTInvertedResidual(
                config,
                in_channels=in_channels,
                out_channels=out_channels,
                stride=1 if dilation != 1 else stride,
                dilation=1 if dilation <= 1 else dilation // 2,
            )
            # Everything after the downsampling stage sees its output channel count.
            in_channels = out_channels

        # Local representation: k x k convolution followed by a bare 1x1 projection to `hidden_size`.
        self.conv_kxk = MobileViTConvLayer(
            config,
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=config.conv_kernel_size,
        )
        self.conv_1x1 = MobileViTConvLayer(
            config,
            in_channels=in_channels,
            out_channels=hidden_size,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
        )

        # Global representation: transformer over patches plus a final LayerNorm.
        self.transformer = MobileViTTransformer(
            config,
            hidden_size=hidden_size,
            num_stages=num_stages,
        )
        self.layernorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)

        # Project back to `in_channels`, then fuse with the residual (hence 2 * in_channels inputs).
        self.conv_projection = MobileViTConvLayer(
            config, in_channels=hidden_size, out_channels=in_channels, kernel_size=1
        )
        self.fusion = MobileViTConvLayer(
            config, in_channels=2 * in_channels, out_channels=in_channels, kernel_size=config.conv_kernel_size
        )

    def unfolding(self, features: torch.Tensor) -> Tuple[torch.Tensor, Dict]:
        """
        Rearrange a `(batch_size, channels, height, width)` feature map into patches.

        If height or width is not a multiple of the patch size, the map is first resized (bilinear) up to
        the next multiple.

        Returns:
            A tuple `(patches, info_dict)`: `patches` has shape
            `(batch_size * patch_area, num_patches, channels)` and `info_dict` holds what `folding` needs to
            invert the operation.
        """
        patch_height, patch_width = self.patch_height, self.patch_width
        patch_area = int(patch_width * patch_height)

        batch_size, channels, orig_height, orig_width = features.shape

        # Round both spatial sizes up to the nearest multiple of the patch size.
        new_height = int(math.ceil(orig_height / patch_height) * patch_height)
        new_width = int(math.ceil(orig_width / patch_width) * patch_width)

        interpolate = False
        if new_width != orig_width or new_height != orig_height:
            # Note: Padding can be done, but then it needs to be handled in attention function.
            features = nn.functional.interpolate(
                features, size=(new_height, new_width), mode="bilinear", align_corners=False
            )
            interpolate = True

        # Patch counts along each axis.
        num_patch_width = new_width // patch_width
        num_patch_height = new_height // patch_height
        num_patches = num_patch_height * num_patch_width

        # Split the spatial axes into (patch row, in-patch row, patch column, in-patch column), then move
        # the patch-column axis in front of the in-patch row axis.
        patches = features.reshape(
            batch_size * channels * num_patch_height, patch_height, num_patch_width, patch_width
        ).transpose(1, 2)
        # Collapse to (batch, channels, num_patches, patch_area) and swap channels with patch_area.
        patches = patches.reshape(batch_size, channels, num_patches, patch_area).transpose(1, 3)
        # Merge batch and patch_area so each in-patch position becomes its own sequence of patches.
        patches = patches.reshape(batch_size * patch_area, num_patches, -1)

        info_dict = {
            "orig_size": (orig_height, orig_width),
            "batch_size": batch_size,
            "channels": channels,
            "interpolate": interpolate,
            "num_patches": num_patches,
            "num_patches_width": num_patch_width,
            "num_patches_height": num_patch_height,
        }
        return patches, info_dict

    def folding(self, patches: torch.Tensor, info_dict: Dict) -> torch.Tensor:
        """
        Inverse of `unfolding`: turn `(batch_size * patch_area, num_patches, channels)` patches back into a
        `(batch_size, channels, height, width)` feature map, resizing to `info_dict["orig_size"]` when
        `unfolding` had to interpolate.
        """
        patch_height, patch_width = self.patch_height, self.patch_width
        patch_area = int(patch_width * patch_height)

        batch_size = info_dict["batch_size"]
        channels = info_dict["channels"]
        num_patches = info_dict["num_patches"]
        num_patch_height = info_dict["num_patches_height"]
        num_patch_width = info_dict["num_patches_width"]

        # Undo the batch/patch_area merge and put channels back on axis 1.
        features = patches.contiguous().view(batch_size, patch_area, num_patches, -1).transpose(1, 3)
        # Re-expose the patch grid and swap patch column with in-patch row again.
        features = features.reshape(
            batch_size * channels * num_patch_height, num_patch_width, patch_height, patch_width
        ).transpose(1, 2)
        # Merge patch and in-patch axes back into plain height and width.
        features = features.reshape(
            batch_size, channels, num_patch_height * patch_height, num_patch_width * patch_width
        )

        if info_dict["interpolate"]:
            # Restore the spatial size the map had before `unfolding` resized it.
            features = nn.functional.interpolate(
                features, size=info_dict["orig_size"], mode="bilinear", align_corners=False
            )

        return features

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run the full block on `features` and return the fused feature map."""
        # Reduce spatial dimensions if a downsampling stage was created.
        if self.downsampling_layer:
            features = self.downsampling_layer(features)

        # Kept for the fusion step at the end.
        residual = features

        # Local representation.
        features = self.conv_1x1(self.conv_kxk(features))

        # Feature map -> patches.
        patches, info_dict = self.unfolding(features)

        # Global representation.
        patches = self.layernorm(self.transformer(patches))

        # Patches -> feature map.
        features = self.folding(patches, info_dict)

        features = self.conv_projection(features)
        # Concatenate along the channel axis, residual first, and fuse.
        return self.fusion(torch.cat((residual, features), dim=1))
class MobileViTEncoder(nn.Module):
    """
    Five-stage encoder: two `MobileViTMobileNetLayer` stages followed by three `MobileViTLayer` stages.
    """

    def __init__(self, config: MobileViTConfig) -> None:
        """
        Args:
            config: Model configuration; `output_stride`, `neck_hidden_sizes` and `hidden_sizes` are read here.
        """
        super().__init__()
        self.config = config
        self.gradient_checkpointing = False

        # output_stride 8 dilates stages 4 and 5, 16 dilates only stage 5, any other value dilates nothing.
        output_stride = config.output_stride
        dilate_layer_4 = output_stride == 8
        dilate_layer_5 = output_stride in (8, 16)

        # Dilation accumulates: it doubles at every stage that is flagged for dilation.
        layer_4_dilation = 2 if dilate_layer_4 else 1
        layer_5_dilation = layer_4_dilation * 2 if dilate_layer_5 else layer_4_dilation

        neck_sizes = config.neck_hidden_sizes
        transformer_sizes = config.hidden_sizes

        self.layer = nn.ModuleList(
            [
                MobileViTMobileNetLayer(
                    config,
                    in_channels=neck_sizes[0],
                    out_channels=neck_sizes[1],
                    stride=1,
                    num_stages=1,
                ),
                MobileViTMobileNetLayer(
                    config,
                    in_channels=neck_sizes[1],
                    out_channels=neck_sizes[2],
                    stride=2,
                    num_stages=3,
                ),
                MobileViTLayer(
                    config,
                    in_channels=neck_sizes[2],
                    out_channels=neck_sizes[3],
                    stride=2,
                    hidden_size=transformer_sizes[0],
                    num_stages=2,
                ),
                MobileViTLayer(
                    config,
                    in_channels=neck_sizes[3],
                    out_channels=neck_sizes[4],
                    stride=2,
                    hidden_size=transformer_sizes[1],
                    num_stages=4,
                    dilation=layer_4_dilation,
                ),
                MobileViTLayer(
                    config,
                    in_channels=neck_sizes[4],
                    out_channels=neck_sizes[5],
                    stride=2,
                    hidden_size=transformer_sizes[2],
                    num_stages=3,
                    dilation=layer_5_dilation,
                ),
            ]
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutputWithNoAttention]:
        """
        Run every stage in order.

        Args:
            hidden_states: Input to the first stage.
            output_hidden_states: When truthy, the output of every stage is collected and returned.
            return_dict: When truthy, return a `BaseModelOutputWithNoAttention`; otherwise a plain tuple
                without `None` entries.
        """
        all_hidden_states = () if output_hidden_states else None

        for stage in self.layer:
            if self.gradient_checkpointing and self.training:
                # Checkpointing is only used while training.
                hidden_states = self._gradient_checkpointing_func(stage.__call__, hidden_states)
            else:
                hidden_states = stage(hidden_states)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if return_dict:
            return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)

        # Tuple form drops the hidden-state slot entirely when it was not requested.
        return tuple(item for item in [hidden_states, all_hidden_states] if item is not None)
class MobileViTPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with these models.
    config_class = MobileViTConfig
    # Prefix under which the base model is stored.
    base_model_prefix = "mobilevit"
    # Name of the primary forward() input.
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, nn.LayerNorm):
            # LayerNorm starts out with zero bias and unit weight.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Bias is optional on these layers; zero it only when present.
            if module.bias is not None:
                module.bias.data.zero_()


# Class-level docstring fragment; passed to `add_start_docstrings` on the model classes in this file.
MOBILEVIT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Forward-argument docstring fragment; passed to `add_start_docstrings_to_model_forward` on the `forward` methods.
MOBILEVIT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare MobileViT model outputting raw hidden-states without any specific head on top.",
    MOBILEVIT_START_DOCSTRING,
)
class MobileViTModel(MobileViTPreTrainedModel):
    """
    Backbone made of a convolutional stem, a `MobileViTEncoder`, and (when `expand_output` is true) a final
    1x1 expansion convolution whose output is also global-average-pooled.
    """

    def __init__(self, config: MobileViTConfig, expand_output: bool = True):
        # config: model configuration; `num_channels` and `neck_hidden_sizes` are read here.
        # expand_output: when true, add the 1x1 expansion convolution and return a pooled output from `forward`.
        super().__init__(config)
        self.config = config
        self.expand_output = expand_output

        # Stem: 3x3 convolution with stride 2 from the image channels to the first neck size.
        self.conv_stem = MobileViTConvLayer(
            config,
            in_channels=config.num_channels,
            out_channels=config.neck_hidden_sizes[0],
            kernel_size=3,
            stride=2,
        )

        self.encoder = MobileViTEncoder(config)

        if self.expand_output:
            # Only created on demand; `forward` checks the same flag before using it.
            self.conv_1x1_exp = MobileViTConvLayer(
                config,
                in_channels=config.neck_hidden_sizes[5],
                out_channels=config.neck_hidden_sizes[6],
                kernel_size=1,
            )

        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        """
        for layer_index, heads in heads_to_prune.items():
            mobilevit_layer = self.encoder.layer[layer_index]
            # Encoder stages that are not `MobileViTLayer` are skipped; they are not searched for attention heads.
            if isinstance(mobilevit_layer, MobileViTLayer):
                for transformer_layer in mobilevit_layer.transformer.layer:
                    transformer_layer.attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
        # No docstring here on purpose: the decorators above build this method's documentation.
        # Raises ValueError when `pixel_values` is None.

        # Per-call arguments win; otherwise fall back to the values stored on the config.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.conv_stem(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.expand_output:
            last_hidden_state = self.conv_1x1_exp(encoder_outputs[0])

            # global average pooling: (batch_size, channels, height, width) -> (batch_size, channels)
            pooled_output = torch.mean(last_hidden_state, dim=[-2, -1], keepdim=False)
        else:
            last_hidden_state = encoder_outputs[0]
            pooled_output = None

        if not return_dict:
            # The pooled slot is omitted (not set to None) when there is no pooled output.
            output = (last_hidden_state, pooled_output) if pooled_output is not None else (last_hidden_state,)
            # Append whatever the encoder returned after its first element.
            return output + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )
@add_start_docstrings(
    """
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    MOBILEVIT_START_DOCSTRING,
)
class MobileViTForImageClassification(MobileViTPreTrainedModel):
    def __init__(self, config: MobileViTConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.mobilevit = MobileViTModel(config)

        # Classifier head: in-place dropout, then a linear layer (or identity when there are no labels).
        self.dropout = nn.Dropout(config.classifier_dropout_prob, inplace=True)
        if config.num_labels > 0:
            self.classifier = nn.Linear(config.neck_hidden_sizes[-1], config.num_labels)
        else:
            self.classifier = nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilevit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        # The pooled features are the second element in tuple form.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(self.dropout(pooled_output))

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Infer the problem type once and store it on the config for later calls.
                has_integer_labels = labels.dtype == torch.long or labels.dtype == torch.int
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and has_integer_labels:
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)
            # Any other problem_type leaves the loss as None.

        if return_dict:
            return ImageClassifierOutputWithNoAttention(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
            )

        # Tuple form: logits, then everything the backbone returned after its first two elements;
        # the loss is prepended only when it was computed.
        output = (logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
class MobileViTASPPPooling(nn.Module):
    """
    Image-level branch: global average pool to 1x1, 1x1 convolution, then bilinear resize back to the input's
    spatial size.
    """

    def __init__(self, config: MobileViTConfig, in_channels: int, out_channels: int) -> None:
        """
        Args:
            config: Model configuration, forwarded to the convolution layer.
            in_channels: Channel count of the incoming feature map.
            out_channels: Channel count produced by the 1x1 convolution.
        """
        super().__init__()

        # Collapses every channel to a single 1x1 value.
        self.global_pool = nn.AdaptiveAvgPool2d(output_size=1)

        self.conv_1x1 = MobileViTConvLayer(
            config,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_normalization=True,
            use_activation="relu",
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Return the pooled-and-projected features resized to the spatial size of `features`."""
        # Remember the last two dimensions so the pooled map can be resized to match them.
        target_size = features.shape[-2:]
        pooled = self.conv_1x1(self.global_pool(features))
        return nn.functional.interpolate(pooled, size=target_size, mode="bilinear", align_corners=False)


class MobileViTASPP(nn.Module):
    """
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    """

    def __init__(self, config: MobileViTConfig) -> None:
        """
        Args:
            config: Model configuration; `neck_hidden_sizes`, `aspp_out_channels`, `atrous_rates` and
                `aspp_dropout_prob` are read here.

        Raises:
            ValueError: If `config.atrous_rates` does not contain exactly 3 values.
        """
        super().__init__()

        in_channels = config.neck_hidden_sizes[-2]
        out_channels = config.aspp_out_channels

        if len(config.atrous_rates) != 3:
            raise ValueError("Expected 3 values for atrous_rates")

        # Branch 1: plain 1x1 projection.
        branches = [
            MobileViTConvLayer(
                config,
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                use_activation="relu",
            )
        ]
        # Branches 2-4: one 3x3 convolution per atrous rate.
        branches += [
            MobileViTConvLayer(
                config,
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                dilation=rate,
                use_activation="relu",
            )
            for rate in config.atrous_rates
        ]
        # Branch 5: image-level pooling.
        branches.append(MobileViTASPPPooling(config, in_channels, out_channels))
        self.convs = nn.ModuleList(branches)

        # The five branch outputs are concatenated on the channel axis, hence 5 * out_channels inputs.
        self.project = MobileViTConvLayer(
            config, in_channels=5 * out_channels, out_channels=out_channels, kernel_size=1, use_activation="relu"
        )

        self.dropout = nn.Dropout(p=config.aspp_dropout_prob)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Apply every branch to `features`, concatenate on the channel axis, project, and apply dropout."""
        pyramid = torch.cat([branch(features) for branch in self.convs], dim=1)
        return self.dropout(self.project(pyramid))


class MobileViTDeepLabV3(nn.Module):
    """
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    """

    def __init__(self, config: MobileViTConfig) -> None:
        """
        Args:
            config: Model configuration; `classifier_dropout_prob`, `aspp_out_channels` and `num_labels` are
                read here, and the object is forwarded to the sub-modules.
        """
        super().__init__()
        self.aspp = MobileViTASPP(config)

        self.dropout = nn.Dropout2d(config.classifier_dropout_prob)

        # Bare 1x1 convolution (no normalization, no activation, with bias) producing one channel per label.
        self.classifier = MobileViTConvLayer(
            config,
            in_channels=config.aspp_out_channels,
            out_channels=config.num_labels,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
            bias=True,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Run ASPP, dropout and the classifier on `hidden_states[-1]`.

        Only the last entry along the first index of `hidden_states` is used.
        """
        features = self.aspp(hidden_states[-1])
        return self.classifier(self.dropout(features))
@add_start_docstrings(
    """
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    """,
    MOBILEVIT_START_DOCSTRING,
)
class MobileViTForSemanticSegmentation(MobileViTPreTrainedModel):
    def __init__(self, config: MobileViTConfig) -> None:
        super().__init__(config)

        # Number of segmentation classes.
        self.num_labels = config.num_labels
        # Backbone, created with `expand_output=False`.
        self.mobilevit = MobileViTModel(config, expand_output=False)
        # Segmentation head that turns the encoder hidden states into logits.
        self.segmentation_head = MobileViTDeepLabV3(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        labels (`torch.LongTensor`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. The logits are upsampled to
            `labels.shape[-2:]` and both are passed to `CrossEntropyLoss`, so the last two dimensions of `labels`
            are the target height and width. Positions equal to `config.semantic_loss_ignore_index` are ignored.

        Returns:
        """
        # Resolve the output flags, falling back to the model configuration when they are not given.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilevit(
            pixel_values,
            output_hidden_states=True,  # we need the intermediate hidden states
            return_dict=return_dict,
        )

        # With a dict-style output the hidden states are an attribute; in the tuple form they sit at index 1.
        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # The segmentation head produces the logits from the encoder hidden states.
        logits = self.segmentation_head(encoder_hidden_states)

        # When labels are provided, compute a cross-entropy loss that honours the configured ignore index.
        loss = None
        if labels is not None:
            if self.config.num_labels == 1:
                raise ValueError("The number of labels should be greater than one")
            else:
                # upsample logits to the images' original size
                upsampled_logits = nn.functional.interpolate(
                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
                )
                loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
                loss = loss_fct(upsampled_logits, labels)

        # Tuple output: the logits replace the backbone's first element; the hidden states (index 1) are kept only
        # when the caller asked for them. The loss, if any, is prepended.
        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # Structured output: loss, logits, hidden states (only if requested) and no attentions.
        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )

以自定义的输出对象形式返回结果，包括损失、logits、隐藏状态（如果需要）和注意力机制（目前为None）。
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilevit(
            pixel_values,
            output_hidden_states=True,  # we need the intermediate hidden states
            return_dict=return_dict,
        )

设置是否输出隐藏状态和返回类型的选择，如果未指定则使用模型配置中的默认设置。


        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

根据返回类型决定使用模型输出的隐藏状态或者第二个元素作为编码器的隐藏状态。


        logits = self.segmentation_head(encoder_hidden_states)

使用编码器隐藏状态作为输入，通过分割头部生成预测的logits。


        loss = None
        if labels is not None:
            if self.config.num_labels == 1:
                raise ValueError("The number of labels should be greater than one")
            else:
                # upsample logits to the images' original size
                upsampled_logits = nn.functional.interpolate(
                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
                )
                loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
                loss = loss_fct(upsampled_logits, labels)

如果提供了标签，根据标签的形状和配置中的忽略索引，使用交叉熵损失函数计算损失值。


        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

根据返回类型和是否输出隐藏状态，构建输出元组并返回。如果有损失值，则将其作为第一个元素返回。


        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )

以自定义的输出对象形式返回结果，包括损失、logits、隐藏状态（如果需要）和注意力机制（目前为None）。

`.\models\mobilevit\modeling_tf_mobilevit.py`

# coding=utf-8
# 版权 2022 年 Apple Inc. 和 HuggingFace Inc. 团队保留所有权利。
#
# 根据 Apache 许可证 2.0 版本（“许可证”）授权；
# 除非符合许可证的规定，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件根据“原样”分发，
# 不附带任何明示或暗示的担保或条件。
# 有关详细信息，请参阅许可证。
#
# 原始许可证：https://github.com/apple/ml-cvnets/blob/main/LICENSE
""" TensorFlow 2.0 MobileViT 模型。"""

from __future__ import annotations

from typing import Dict, Optional, Tuple, Union

import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...file_utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFBaseModelOutputWithPooling,
    TFImageClassifierOutputWithNoAttention,
    TFSemanticSegmenterOutputWithNoAttention,
)
from ...modeling_tf_utils import (
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import logging
from .configuration_mobilevit import MobileViTConfig

logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "MobileViTConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "apple/mobilevit-small"
_EXPECTED_OUTPUT_SHAPE = [1, 640, 8, 8]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "apple/mobilevit-small"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

# Archive list of pretrained TF MobileViT checkpoints
TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "apple/mobilevit-small",
    "apple/mobilevit-x-small",
    "apple/mobilevit-xx-small",
    "apple/deeplabv3-mobilevit-small",
    "apple/deeplabv3-mobilevit-x-small",
    "apple/deeplabv3-mobilevit-xx-small",
    # See all MobileViT models at https://huggingface.co/models?filter=mobilevit
]


def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int:
    """
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py

    Args:
        value: The number to round.
        divisor: The result is a multiple of this number.
        min_value: Lower bound for the result; falls back to `divisor` when `None`.

    Returns:
        `value` rounded to the nearest multiple of `divisor`, clamped to at least `min_value`, and bumped up by one
        `divisor` step if that result is below 90% of `value`.
    """
    lower_bound = divisor if min_value is None else min_value
    # Round to the nearest multiple of `divisor`.
    nearest_multiple = int(value + divisor / 2) // divisor * divisor
    candidate = max(lower_bound, nearest_multiple)
    # Make sure that rounding down does not go down by more than 10%.
    if candidate < 0.9 * value:
        candidate = candidate + divisor
    return int(candidate)


class TFMobileViTConvLayer(keras.layers.Layer):
    """
    Explicit zero padding followed by a `Conv2D` with "VALID" padding, then optional batch normalization and an
    optional activation.
    """

    def __init__(
        self,
        config: MobileViTConfig,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        groups: int = 1,
        bias: bool = False,
        dilation: int = 1,
        use_normalization: bool = True,
        use_activation: Union[bool, str] = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        logger.warning(
            f"\n{self.__class__.__name__} has backpropagation operations that are NOT supported on CPU. If you wish "
            "to train/fine-tune this model, you need a GPU or a TPU"
        )

        # Padding is applied by a separate layer because the convolution itself uses "VALID" padding.
        pad_amount = int((kernel_size - 1) / 2) * dilation
        self.padding = keras.layers.ZeroPadding2D(pad_amount)

        if out_channels % groups != 0:
            raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")

        self.convolution = keras.layers.Conv2D(
            filters=out_channels,
            kernel_size=kernel_size,
            strides=stride,
            padding="VALID",
            dilation_rate=dilation,
            groups=groups,
            use_bias=bias,
            name="convolution",
        )

        # NOTE(review): Keras' `momentum` is the decay of the moving averages -- confirm 0.1 is the intended value.
        self.normalization = (
            keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="normalization")
            if use_normalization
            else None
        )

        # Activation precedence: an explicit name in `use_activation`, then `config.hidden_act` (looked up by
        # name when it is a string, otherwise used as given). A falsy `use_activation` disables the activation.
        if not use_activation:
            self.activation = None
        elif isinstance(use_activation, str):
            self.activation = get_tf_activation(use_activation)
        elif isinstance(config.hidden_act, str):
            self.activation = get_tf_activation(config.hidden_act)
        else:
            self.activation = config.hidden_act

        # Channel counts are kept for `build`.
        self.in_channels = in_channels
        self.out_channels = out_channels

    def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Pad, convolve, then apply the normalization and activation when they are configured."""
        outputs = self.convolution(self.padding(features))
        if self.normalization is not None:
            outputs = self.normalization(outputs, training=training)
        if self.activation is not None:
            outputs = self.activation(outputs)
        return outputs

    def build(self, input_shape=None):
        """Build the convolution and normalization sublayers under their own name scopes (runs once)."""
        if self.built:
            return
        self.built = True

        conv = getattr(self, "convolution", None)
        if conv is not None:
            with tf.name_scope(conv.name):
                conv.build([None, None, None, self.in_channels])

        norm = getattr(self, "normalization", None)
        if norm is not None and hasattr(norm, "name"):
            with tf.name_scope(norm.name):
                norm.build([None, None, None, self.out_channels])
# Custom Keras layer implementing the MobileNetv2 inverted residual block
class TFMobileViTInvertedResidual(keras.layers.Layer):
    """
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381

    A 1x1 expansion, a 3x3 convolution with one group per expanded channel, and a 1x1 reduction without activation.
    The input is added back when `stride == 1` and `in_channels == out_channels`.
    """

    def __init__(
        self, config: MobileViTConfig, in_channels: int, out_channels: int, stride: int, dilation: int = 1, **kwargs
    ) -> None:
        super().__init__(**kwargs)

        # Width of the expanded representation, rounded by `make_divisible` with divisor 8.
        hidden_channels = make_divisible(int(round(in_channels * config.expand_ratio)), 8)

        if stride not in (1, 2):
            raise ValueError(f"Invalid stride {stride}.")

        # The skip connection is only used when input and output have the same channel count and stride is 1.
        self.use_residual = stride == 1 and in_channels == out_channels

        self.expand_1x1 = TFMobileViTConvLayer(
            config,
            in_channels=in_channels,
            out_channels=hidden_channels,
            kernel_size=1,
            name="expand_1x1",
        )

        # `groups` equals the channel count; stride and dilation are applied here.
        self.conv_3x3 = TFMobileViTConvLayer(
            config,
            in_channels=hidden_channels,
            out_channels=hidden_channels,
            kernel_size=3,
            stride=stride,
            groups=hidden_channels,
            dilation=dilation,
            name="conv_3x3",
        )

        # Reduction back to `out_channels`, without activation.
        self.reduce_1x1 = TFMobileViTConvLayer(
            config,
            in_channels=hidden_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_activation=False,
            name="reduce_1x1",
        )

    def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run expand -> 3x3 -> reduce and add the input back when the residual connection is enabled."""
        hidden = self.expand_1x1(features, training=training)
        hidden = self.conv_3x3(hidden, training=training)
        hidden = self.reduce_1x1(hidden, training=training)

        if self.use_residual:
            return features + hidden
        return hidden

    def build(self, input_shape=None):
        """Build the three convolution sublayers, each under its own name scope (runs once)."""
        if self.built:
            return
        self.built = True

        for attr_name in ("expand_1x1", "conv_3x3", "reduce_1x1"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)


# MobileNet layer: a sequence of TFMobileViTInvertedResidual blocks
class TFMobileViTMobileNetLayer(keras.layers.Layer):
    """
    Stack of `num_stages` inverted residual blocks. Only the first block receives `stride` and `in_channels`; the
    following blocks use stride 1 and `out_channels` as their input width.
    """

    def __init__(
        self,
        config: MobileViTConfig,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        num_stages: int = 1,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.layers = []
        block_in_channels = in_channels
        for index in range(num_stages):
            # The requested stride is applied by the first block only.
            block_stride = stride if index == 0 else 1
            self.layers.append(
                TFMobileViTInvertedResidual(
                    config,
                    in_channels=block_in_channels,
                    out_channels=out_channels,
                    stride=block_stride,
                    name=f"layer.{index}",
                )
            )
            # Every later block consumes the previous block's output.
            block_in_channels = out_channels

    def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Feed `features` through the blocks in order."""
        for block in self.layers:
            features = block(features, training=training)
        return features

    def build(self, input_shape=None):
        """Build every block under its own name scope (runs once)."""
        if self.built:
            return
        self.built = True

        blocks = getattr(self, "layers", None)
        if blocks is None:
            return
        for block in blocks:
            with tf.name_scope(block.name):
                block.build(None)
# TFMobileViTSelfAttention, a subclass of keras.layers.Layer
class TFMobileViTSelfAttention(keras.layers.Layer):
    """
    Multi-head self-attention: query/key/value projections, scores divided by the square root of the head size,
    softmax, dropout on the attention probabilities, and a weighted sum of the values.

    Raises:
        ValueError: If `hidden_size` is not a multiple of `config.num_attention_heads`.
    """

    def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None:
        super().__init__(**kwargs)

        if hidden_size % config.num_attention_heads != 0:
            # No trailing comma inside the braces: `{hidden_size,}` would format a 1-tuple such as `(144,)`.
            raise ValueError(
                f"The hidden size {hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        # Head layout: `all_head_size` is the total width of the projections.
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Attention scores are divided by sqrt(attention_head_size).
        scale = tf.cast(self.attention_head_size, dtype=tf.float32)
        self.scale = tf.math.sqrt(scale)

        # Query, key and value projections; the bias is controlled by `config.qkv_bias`.
        self.query = keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="query")
        self.key = keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="key")
        self.value = keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="value")

        # Dropout applied to the attention probabilities.
        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
        # Kept for `build`.
        self.hidden_size = hidden_size

    def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor:
        """Reshape `x` to (batch, -1, num_heads, head_size) and move the head axis in front of the sequence axis."""
        batch_size = tf.shape(x)[0]
        x = tf.reshape(x, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return the attention context, reshaped to (batch, -1, all_head_size)."""
        batch_size = tf.shape(hidden_states)[0]

        # Project the input and split it into heads.
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        # Scaled dot product between queries and keys.
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        attention_scores = attention_scores / self.scale

        # Normalize the scores into probabilities over the last axis.
        attention_probs = stable_softmax(attention_scores, axis=-1)

        # Dropout on the attention probabilities (active only when `training` is True).
        attention_probs = self.dropout(attention_probs, training=training)

        # Weighted sum of the values.
        context_layer = tf.matmul(attention_probs, value_layer)

        # Undo the head split: heads go back next to the feature axis and are merged.
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        context_layer = tf.reshape(context_layer, shape=(batch_size, -1, self.all_head_size))

        return context_layer

    def build(self, input_shape=None):
        """Build the query, key and value projections under their own name scopes (runs once)."""
        if self.built:
            return
        self.built = True
        # All three projections take inputs whose last dimension is `hidden_size`.
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.hidden_size])
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.hidden_size])
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                self.value.build([None, None, self.hidden_size])
class TFMobileViTSelfOutput(keras.layers.Layer):
    """Dense projection to `hidden_size` followed by dropout."""

    def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Linear projection whose output width is `hidden_size`.
        self.dense = keras.layers.Dense(hidden_size, name="dense")
        # Dropout with rate `config.hidden_dropout_prob`.
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        # Kept for `build`.
        self.hidden_size = hidden_size

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Project `hidden_states`, then apply dropout."""
        projected = self.dense(hidden_states)
        return self.dropout(projected, training=training)

    def build(self, input_shape=None):
        """Build the dense sublayer under its own name scope (runs once)."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            dense.build([None, None, self.hidden_size])


class TFMobileViTAttention(keras.layers.Layer):
    """Self-attention followed by its output projection."""

    def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Self-attention over the hidden states.
        self.attention = TFMobileViTSelfAttention(config, hidden_size, name="attention")
        # Output projection applied to the attention result (layer name is "output").
        self.dense_output = TFMobileViTSelfOutput(config, hidden_size, name="output")

    def prune_heads(self, heads):
        # Head pruning is not implemented for this layer.
        raise NotImplementedError

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply self-attention, then the output projection."""
        attended = self.attention(hidden_states, training=training)
        return self.dense_output(attended, training=training)

    def build(self, input_shape=None):
        """Build both sublayers under their own name scopes (runs once)."""
        if self.built:
            return
        self.built = True

        for attr_name in ("attention", "dense_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)


class TFMobileViTIntermediate(keras.layers.Layer):
    """Dense projection to `intermediate_size` followed by the configured activation."""

    def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Linear projection whose output width is `intermediate_size`.
        self.dense = keras.layers.Dense(intermediate_size, name="dense")
        # `config.hidden_act` is looked up by name when it is a string, otherwise used as given.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
        # Kept for `build`.
        self.hidden_size = hidden_size

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Project `hidden_states`, then apply the activation."""
        return self.intermediate_act_fn(self.dense(hidden_states))

    def build(self, input_shape=None):
        """Build the dense sublayer under its own name scope (runs once)."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            dense.build([None, None, self.hidden_size])


class TFMobileViTOutput(keras.layers.Layer):
    """Dense projection to `hidden_size`, dropout, then a residual addition with `input_tensor`."""

    def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Linear projection whose output width is `hidden_size`.
        self.dense = keras.layers.Dense(hidden_size, name="dense")
        # Dropout with rate `config.hidden_dropout_prob`.
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        # Input width of the dense layer, kept for `build`.
        self.intermediate_size = intermediate_size

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Project `hidden_states`, apply dropout and add `input_tensor` to the result."""
        projected = self.dropout(self.dense(hidden_states), training=training)
        return projected + input_tensor

    def build(self, input_shape=None):
        """Build the dense sublayer under its own name scope (runs once)."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            dense.build([None, None, self.intermediate_size])
# Custom Keras layer implementing one MobileViT transformer layer
class TFMobileViTTransformerLayer(keras.layers.Layer):
    """
    Transformer layer with layer normalization applied before the attention and before the MLP, and a residual
    connection around each of the two.
    """

    def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Attention sub-block.
        self.attention = TFMobileViTAttention(config, hidden_size, name="attention")
        # MLP: expansion to `intermediate_size`, then projection back plus the residual addition.
        self.intermediate = TFMobileViTIntermediate(config, hidden_size, intermediate_size, name="intermediate")
        self.mobilevit_output = TFMobileViTOutput(config, hidden_size, intermediate_size, name="output")
        # Layer norms; both take their epsilon from the configuration.
        self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
        self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
        # Kept for `build`.
        self.hidden_size = hidden_size

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply the attention and MLP sub-blocks, each with its residual connection."""
        # Attention on the normalized input, added back onto the input.
        normalized = self.layernorm_before(hidden_states)
        hidden_states = self.attention(normalized, training=training) + hidden_states

        # MLP on the normalized attention result; `mobilevit_output` adds `hidden_states` back.
        mlp_hidden = self.intermediate(self.layernorm_after(hidden_states))
        return self.mobilevit_output(mlp_hidden, hidden_states, training=training)

    def build(self, input_shape=None):
        """Build all sublayers under their own name scopes (runs once)."""
        if self.built:
            return
        self.built = True

        # Composite sublayers determine their own input shapes.
        for attr_name in ("attention", "intermediate", "mobilevit_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)

        # The layer norms are built for inputs whose last dimension is `hidden_size`.
        for attr_name in ("layernorm_before", "layernorm_after"):
            norm = getattr(self, attr_name, None)
            if norm is not None:
                with tf.name_scope(norm.name):
                    norm.build([None, None, self.hidden_size])


# 定义一个自定义的 Keras 层，实现多层 MobileViT 变压器的堆叠
class TFMobileViTTransformer(keras.layers.Layer):
    """Stack of `num_stages` `TFMobileViTTransformerLayer` blocks applied one after another."""

    def __init__(self, config: MobileViTConfig, hidden_size: int, num_stages: int, **kwargs) -> None:
        super().__init__(**kwargs)

        # One transformer layer per stage; the MLP width is `hidden_size * config.mlp_ratio`, truncated to int.
        self.layers = [
            TFMobileViTTransformerLayer(
                config,
                hidden_size=hidden_size,
                intermediate_size=int(hidden_size * config.mlp_ratio),
                name=f"layer.{idx}",
            )
            for idx in range(num_stages)
        ]

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Feed `hidden_states` through every layer in order and return the final result."""
        for block in self.layers:
            hidden_states = block(hidden_states, training=training)
        return hidden_states

    def build(self, input_shape=None):
        """Build each stacked layer once, inside a `tf.name_scope` named after that layer."""
        if self.built:
            return
        self.built = True

        blocks = getattr(self, "layers", None)
        if blocks is None:
            return
        for block in blocks:
            with tf.name_scope(block.name):
                block.build(None)
    """
    MobileViT block: https://arxiv.org/abs/2110.02178
    """

    # 初始化函数，定义了 TFMobileViTLayer 类的构造方法
    def __init__(
        self,
        config: MobileViTConfig,  # MobileViTConfig 类型的配置参数对象
        in_channels: int,         # 输入通道数
        out_channels: int,        # 输出通道数
        stride: int,              # 步长
        hidden_size: int,         # 隐藏层大小
        num_stages: int,          # 阶段数
        dilation: int = 1,        # 膨胀率，默认为1
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)  # 调用父类的构造方法

        self.patch_width = config.patch_size   # 设置补丁宽度
        self.patch_height = config.patch_size  # 设置补丁高度

        # 根据步长选择是否创建下采样层
        if stride == 2:
            self.downsampling_layer = TFMobileViTInvertedResidual(
                config,
                in_channels=in_channels,
                out_channels=out_channels,
                stride=stride if dilation == 1 else 1,  # 如果膨胀率为1，则使用给定的步长，否则步长为1
                dilation=dilation // 2 if dilation > 1 else 1,  # 计算膨胀率的一半，如果膨胀率大于1，否则为1
                name="downsampling_layer",
            )
            in_channels = out_channels  # 更新输入通道数为输出通道数
        else:
            self.downsampling_layer = None  # 否则不创建下采样层

        # 创建 kxk 卷积层
        self.conv_kxk = TFMobileViTConvLayer(
            config,
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=config.conv_kernel_size,
            name="conv_kxk",
        )

        # 创建 1x1 卷积层，用于调整隐藏层大小
        self.conv_1x1 = TFMobileViTConvLayer(
            config,
            in_channels=in_channels,
            out_channels=hidden_size,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
            name="conv_1x1",
        )

        # 创建 MobileViTTransformer 实例，用于执行转换操作
        self.transformer = TFMobileViTTransformer(
            config, hidden_size=hidden_size, num_stages=num_stages, name="transformer"
        )

        # 创建层归一化层，使用给定的 epsilon 值
        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")

        # 创建投影卷积层，将隐藏层特征映射回原始输入通道数
        self.conv_projection = TFMobileViTConvLayer(
            config, in_channels=hidden_size, out_channels=in_channels, kernel_size=1, name="conv_projection"
        )

        # 创建融合卷积层，用于融合两个输入特征
        self.fusion = TFMobileViTConvLayer(
            config,
            in_channels=2 * in_channels,
            out_channels=in_channels,
            kernel_size=config.conv_kernel_size,
            name="fusion",
        )
        self.hidden_size = hidden_size  # 设置隐藏层大小
    def unfolding(self, features: tf.Tensor) -> Tuple[tf.Tensor, Dict]:
        """
        Rearrange a feature map into sequences of patches.

        The input is indexed as (batch, height, width, channels). If either spatial size is not a multiple of
        the patch size, the map is first resized bilinearly to the next multiple.

        Returns:
            A tuple `(patches, info_dict)`. `patches` has shape
            `(batch_size * patch_area, num_patches, channels)`; `info_dict` holds the values `folding` reads to
            reverse the rearrangement.
        """
        patch_width, patch_height = self.patch_width, self.patch_height
        # Number of pixels in one patch, as an int32 tensor.
        patch_area = tf.cast(patch_width * patch_height, "int32")

        # Dynamic sizes of the input, read as (batch, height, width, channels).
        batch_size = tf.shape(features)[0]
        orig_height = tf.shape(features)[1]
        orig_width = tf.shape(features)[2]
        channels = tf.shape(features)[3]

        # Round each spatial size up to the nearest multiple of the patch size.
        new_height = tf.cast(tf.math.ceil(orig_height / patch_height) * patch_height, "int32")
        new_width = tf.cast(tf.math.ceil(orig_width / patch_width) * patch_width, "int32")

        # Resize only when rounding changed one of the spatial sizes.
        # NOTE(review): these are tensor comparisons combined with Python `or`/`if`; confirm this behaves as
        # intended when the method is traced in graph mode.
        interpolate = new_width != orig_width or new_height != orig_height
        if interpolate:
            features = tf.image.resize(features, size=(new_height, new_width), method="bilinear")

        # Number of patches along the width, along the height, and in total.
        num_patch_width = new_width // patch_width
        num_patch_height = new_height // patch_height
        num_patches = num_patch_height * num_patch_width

        # Move channels in front of the spatial axes: (batch_size, channels, new_height, new_width).
        features = tf.transpose(features, [0, 3, 1, 2])
        # Split height and width into patch-sized pieces:
        # (batch_size * channels * num_patch_height, patch_height, num_patch_width, patch_width)
        patches = tf.reshape(
            features, (batch_size * channels * num_patch_height, patch_height, num_patch_width, patch_width)
        )
        # -> (batch_size * channels * num_patch_height, num_patch_width, patch_height, patch_width)
        patches = tf.transpose(patches, [0, 2, 1, 3])
        # -> (batch_size, channels, num_patches, patch_area)
        patches = tf.reshape(patches, (batch_size, channels, num_patches, patch_area))
        # -> (batch_size, patch_area, num_patches, channels)
        patches = tf.transpose(patches, [0, 3, 2, 1])
        # -> (batch_size * patch_area, num_patches, channels)
        patches = tf.reshape(patches, (batch_size * patch_area, num_patches, channels))

        # Everything `folding` needs to reverse the rearrangement.
        info_dict = {
            "orig_size": (orig_height, orig_width),
            "batch_size": batch_size,
            "channels": channels,
            "interpolate": interpolate,
            "num_patches": num_patches,
            "num_patches_width": num_patch_width,
            "num_patches_height": num_patch_height,
        }
        return patches, info_dict
    def folding(self, patches: tf.Tensor, info_dict: Dict) -> tf.Tensor:
        """
        Reverse `unfolding`: turn sequences of patches back into a feature map.

        Args:
            patches: Tensor that is reshaped to `(batch_size, patch_area, num_patches, -1)` as the first step.
            info_dict: The dictionary returned by `unfolding` for the same feature map.

        Returns:
            A tensor laid out as (batch, height, width, channels), resized back to `info_dict["orig_size"]`
            when `unfolding` had resized the input.
        """
        patch_width, patch_height = self.patch_width, self.patch_height
        # Number of pixels in one patch, as a Python int.
        patch_area = int(patch_width * patch_height)

        # Sizes recorded by `unfolding`.
        batch_size = info_dict["batch_size"]
        channels = info_dict["channels"]
        num_patches = info_dict["num_patches"]
        num_patch_height = info_dict["num_patches_height"]
        num_patch_width = info_dict["num_patches_width"]

        # -> (batch_size, patch_area, num_patches, channels); the last axis is inferred
        features = tf.reshape(patches, (batch_size, patch_area, num_patches, -1))
        # -> (batch_size, channels, num_patches, patch_area)
        features = tf.transpose(features, perm=(0, 3, 2, 1))
        # -> (batch_size * channels * num_patch_height, num_patch_width, patch_height, patch_width)
        features = tf.reshape(
            features, (batch_size * channels * num_patch_height, num_patch_width, patch_height, patch_width)
        )
        # -> (batch_size * channels * num_patch_height, patch_height, num_patch_width, patch_width)
        features = tf.transpose(features, perm=(0, 2, 1, 3))
        # -> (batch_size, channels, num_patch_height * patch_height, num_patch_width * patch_width)
        features = tf.reshape(
            features, (batch_size, channels, num_patch_height * patch_height, num_patch_width * patch_width)
        )
        # -> (batch_size, num_patch_height * patch_height, num_patch_width * patch_width, channels)
        features = tf.transpose(features, perm=(0, 2, 3, 1))

        # Undo the bilinear resize applied by `unfolding`, if there was one.
        if info_dict["interpolate"]:
            features = tf.image.resize(features, size=info_dict["orig_size"], method="bilinear")

        return features

    def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor:
        """
        Apply the block: optional downsampling, convolutions, transformer over patches, projection and fusion.

        Args:
            features: Input feature map.
            training: Forwarded to the convolution and transformer sub-layers.

        Returns:
            The output of the fusion convolution.
        """
        # Optional spatial reduction in front of the block.
        if self.downsampling_layer:
            features = self.downsampling_layer(features, training=training)

        # Kept aside to be concatenated with the transformer branch at the end.
        skip = features

        # Local representation: k x k convolution followed by the 1x1 convolution.
        local_repr = self.conv_1x1(self.conv_kxk(features, training=training), training=training)

        # Global representation: unfold into patches, run the transformer, normalize.
        tokens, fold_info = self.unfolding(local_repr)
        tokens = self.layernorm(self.transformer(tokens, training=training))

        # Back to a feature map, then project with the 1x1 convolution.
        global_repr = self.conv_projection(self.folding(tokens, fold_info), training=training)

        # Concatenate both branches on the last axis and fuse them.
        fused_input = tf.concat([skip, global_repr], axis=-1)
        return self.fusion(fused_input, training=training)
    def build(self, input_shape=None):
        """
        Build every sub-layer once, each inside a `tf.name_scope` named after that sub-layer.

        `input_shape` is accepted for interface compatibility and is not used.
        """
        # Nothing to do when the layer has already been built.
        if self.built:
            return
        self.built = True

        if getattr(self, "conv_kxk", None) is not None:
            with tf.name_scope(self.conv_kxk.name):
                self.conv_kxk.build(None)

        if getattr(self, "conv_1x1", None) is not None:
            with tf.name_scope(self.conv_1x1.name):
                self.conv_1x1.build(None)

        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)

        # The layer norm is built for a last dimension of `hidden_size`.
        if getattr(self, "layernorm", None) is not None:
            with tf.name_scope(self.layernorm.name):
                self.layernorm.build([None, None, self.hidden_size])

        if getattr(self, "conv_projection", None) is not None:
            with tf.name_scope(self.conv_projection.name):
                self.conv_projection.build(None)

        if getattr(self, "fusion", None) is not None:
            with tf.name_scope(self.fusion.name):
                self.fusion.build(None)

        # `downsampling_layer` is None when the block was created without downsampling.
        if getattr(self, "downsampling_layer", None) is not None:
            with tf.name_scope(self.downsampling_layer.name):
                self.downsampling_layer.build(None)
# 定义 TFMobileViTEncoder 类，继承自 keras.layers.Layer
class TFMobileViTEncoder(keras.layers.Layer):
    """Encoder made of two `TFMobileViTMobileNetLayer` stages followed by three `TFMobileViTLayer` stages."""

    def __init__(self, config: MobileViTConfig, **kwargs) -> None:
        super().__init__(**kwargs)
        self.config = config

        self.layers = []

        # `output_stride` decides which of the last two stages are dilated:
        # 8 dilates stages 4 and 5, 16 dilates stage 5 only, any other value dilates none.
        dilate_layer_4 = dilate_layer_5 = False
        if config.output_stride == 8:
            dilate_layer_4 = True
            dilate_layer_5 = True
        elif config.output_stride == 16:
            dilate_layer_5 = True

        # Running dilation; doubled before each dilated stage.
        dilation = 1

        layer_1 = TFMobileViTMobileNetLayer(
            config,
            in_channels=config.neck_hidden_sizes[0],
            out_channels=config.neck_hidden_sizes[1],
            stride=1,
            num_stages=1,
            name="layer.0",
        )
        self.layers.append(layer_1)

        layer_2 = TFMobileViTMobileNetLayer(
            config,
            in_channels=config.neck_hidden_sizes[1],
            out_channels=config.neck_hidden_sizes[2],
            stride=2,
            num_stages=3,
            name="layer.1",
        )
        self.layers.append(layer_2)

        layer_3 = TFMobileViTLayer(
            config,
            in_channels=config.neck_hidden_sizes[2],
            out_channels=config.neck_hidden_sizes[3],
            stride=2,
            hidden_size=config.hidden_sizes[0],
            num_stages=2,
            name="layer.2",
        )
        self.layers.append(layer_3)

        if dilate_layer_4:
            dilation *= 2

        layer_4 = TFMobileViTLayer(
            config,
            in_channels=config.neck_hidden_sizes[3],
            out_channels=config.neck_hidden_sizes[4],
            stride=2,
            hidden_size=config.hidden_sizes[1],
            num_stages=4,
            dilation=dilation,
            name="layer.3",
        )
        self.layers.append(layer_4)

        if dilate_layer_5:
            dilation *= 2

        layer_5 = TFMobileViTLayer(
            config,
            in_channels=config.neck_hidden_sizes[4],
            out_channels=config.neck_hidden_sizes[5],
            stride=2,
            hidden_size=config.hidden_sizes[2],
            num_stages=3,
            dilation=dilation,
            name="layer.4",
        )
        self.layers.append(layer_5)

    def call(
        self,
        hidden_states: tf.Tensor,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        training: bool = False,
    ) -> Union[tuple, TFBaseModelOutput]:
        """
        Run the input through all encoder stages in order.

        Args:
            hidden_states: Input tensor of the first stage.
            output_hidden_states: When true, the output of every stage is collected and returned.
            return_dict: When false, a plain tuple is returned instead of a `TFBaseModelOutput`.
            training: Forwarded to every stage.

        Returns:
            A `TFBaseModelOutput`, or a tuple `(last_hidden_state[, all_hidden_states])` in which the second
            entry is present only when hidden states were requested.
        """
        # Empty tuple when hidden states are requested, None otherwise.
        all_hidden_states = () if output_hidden_states else None

        for layer_module in self.layers:
            hidden_states = layer_module(hidden_states, training=training)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            # Drop entries that are None so the tuple only holds actual outputs.
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return TFBaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states)

    def build(self, input_shape=None):
        """Build each stage once, inside a `tf.name_scope` named after that stage."""
        if self.built:
            return
        self.built = True

        if getattr(self, "layers", None) is not None:
            for layer_module in self.layers:
                with tf.name_scope(layer_module.name):
                    layer_module.build(None)
# 使用装饰器标记该类可序列化为 Keras 模型
@keras_serializable
class TFMobileViTMainLayer(keras.layers.Layer):
    """
    Core MobileViT network: a convolutional stem and the encoder, followed (when `expand_output` is set) by a 1x1
    expansion convolution and global average pooling.
    """

    config_class = MobileViTConfig

    def __init__(self, config: MobileViTConfig, expand_output: bool = True, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.expand_output = expand_output

        self.conv_stem = TFMobileViTConvLayer(
            config,
            in_channels=config.num_channels,
            out_channels=config.neck_hidden_sizes[0],
            kernel_size=3,
            stride=2,
            name="conv_stem",
        )

        self.encoder = TFMobileViTEncoder(config, name="encoder")

        # The expansion convolution only exists when the expanded output is requested.
        if self.expand_output:
            self.conv_1x1_exp = TFMobileViTConvLayer(
                config,
                in_channels=config.neck_hidden_sizes[5],
                out_channels=config.neck_hidden_sizes[6],
                kernel_size=1,
                name="conv_1x1_exp",
            )

        # Pools over the spatial axes of a channels-first tensor.
        self.pooler = keras.layers.GlobalAveragePooling2D(data_format="channels_first", name="pooler")

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPooling]:
        # Arguments left as None fall back to the values stored in the config.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
        # So change the input format from `NCHW` to `NHWC`.
        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        embedding_output = self.conv_stem(pixel_values, training=training)

        encoder_outputs = self.encoder(
            embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict, training=training
        )

        if self.expand_output:
            # `training` is forwarded explicitly, like for every other sub-layer call in this method.
            last_hidden_state = self.conv_1x1_exp(encoder_outputs[0], training=training)

            # Change to NCHW output format to have uniform design
            last_hidden_state = tf.transpose(last_hidden_state, perm=[0, 3, 1, 2])

            # Global average pooling: (batch_size, channels, height, width) -> (batch_size, channels)
            pooled_output = self.pooler(last_hidden_state)
        else:
            last_hidden_state = encoder_outputs[0]
            # Change to NCHW output format to have uniform design
            last_hidden_state = tf.transpose(last_hidden_state, perm=[0, 3, 1, 2])
            pooled_output = None

        if not return_dict:
            output = (last_hidden_state, pooled_output) if pooled_output is not None else (last_hidden_state,)

            if not self.expand_output:
                remaining_encoder_outputs = encoder_outputs[1:]
                # The encoder tuple carries hidden states only when they were requested, so it may be empty
                # here; indexing it unconditionally would raise IndexError.
                if remaining_encoder_outputs:
                    # Change to NCHW output format to have uniform outputs
                    transposed_hidden_states = tuple(
                        tf.transpose(h, perm=(0, 3, 1, 2)) for h in remaining_encoder_outputs[0]
                    )
                    remaining_encoder_outputs = (transposed_hidden_states,)
                return output + remaining_encoder_outputs
            else:
                # NOTE(review): in this branch the hidden states are returned as produced by the encoder,
                # without the transpose applied in the other branches.
                return output + encoder_outputs[1:]

        # Change the hidden states to NCHW output format as well
        if output_hidden_states:
            hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]])

        return TFBaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
        )

    def build(self, input_shape=None):
        """
        Build every sub-layer once, each inside a `tf.name_scope` named after that sub-layer.

        `input_shape` is accepted for interface compatibility and is not used.
        """
        if self.built:
            return
        self.built = True

        if getattr(self, "conv_stem", None) is not None:
            with tf.name_scope(self.conv_stem.name):
                self.conv_stem.build(None)

        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)

        if getattr(self, "pooler", None) is not None:
            with tf.name_scope(self.pooler.name):
                self.pooler.build([None, None, None, None])

        # `conv_1x1_exp` is absent when the layer was created with `expand_output=False`.
        if getattr(self, "conv_1x1_exp", None) is not None:
            with tf.name_scope(self.conv_1x1_exp.name):
                self.conv_1x1_exp.build(None)
    """
    Documentation string defining the format of inputs accepted by models and layers in the MobileViT architecture.
    It explains the two supported input formats: keyword arguments and positional list/tuple/dict for input tensors.

    When using TensorFlow 2.0 Keras methods like `model.fit()`, the second format (list, tuple, dict) is preferred.
    This enables flexibility in passing inputs such as `pixel_values`, `attention_mask`, and `token_type_ids`.

    For Keras Functional API or subclassing, inputs can be:
    - A single tensor: `model(pixel_values)`
    - A list of tensors: `model([pixel_values, attention_mask])`
    - A dictionary of tensors: `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`

    This documentation guides users on how to interface with MobileViT models and layers effectively.

    Parameters:
        config ([`MobileViTConfig`]): Configuration class containing all model parameters.
            Loading weights requires using [`~TFPreTrainedModel.from_pretrained`], which initializes the model with weights.

    """
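    # Args: the arguments accepted by the model and their types
    # pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
    #     Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`MobileViTImageProcessor.__call__`] for details.
    # output_hidden_states (`bool`, *optional*):
    #     Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. This argument can be used only in eager mode; in graph mode the value in the config is used instead.
    # return_dict (`bool`, *optional*):
    #     Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in eager mode; in graph mode the value is always set to True.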
"""
MobileViT model outputting raw hidden-states without any specific head on top.

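This class defines a MobileViT model without any task-specific output head.

MOBILEVIT_START_DOCSTRING: placeholder for the shared model docstring; its actual content is not provided here.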

"""
class TFMobileViTModel(TFMobileViTPreTrainedModel):
    """Thin model wrapper that delegates everything to a `TFMobileViTMainLayer` named "mobilevit"."""

    def __init__(self, config: MobileViTConfig, expand_output: bool = True, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Keep the constructor arguments on the instance.
        self.config = config
        self.expand_output = expand_output

        # The main layer holds the whole network.
        self.mobilevit = TFMobileViTMainLayer(config, expand_output=expand_output, name="mobilevit")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPooling]:
        # Forward pass: hand the arguments to the main layer and return its result unchanged.
        return self.mobilevit(pixel_values, output_hidden_states, return_dict, training=training)

    def build(self, input_shape=None):
        # Build the main layer once, inside a name scope named after it.
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "mobilevit", None)
        if main_layer is None:
            return
        with tf.name_scope(main_layer.name):
            main_layer.build(None)



"""
MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.

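This class defines a MobileViT model with an image classification head, e.g. for ImageNet.

MOBILEVIT_START_DOCSTRING: placeholder for the shared model docstring; its actual content is not provided here.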

"""
class TFMobileViTForImageClassification(TFMobileViTPreTrainedModel, TFSequenceClassificationLoss):
    """MobileViT main layer with a dropout + classifier head applied to the pooled output."""

    def __init__(self, config: MobileViTConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels
        self.mobilevit = TFMobileViTMainLayer(config, name="mobilevit")

        # Classifier head: a dense layer when there are labels, otherwise `tf.identity` (logits == pooled output).
        self.dropout = keras.layers.Dropout(config.classifier_dropout_prob)
        self.classifier = (
            keras.layers.Dense(config.num_labels, name="classifier") if config.num_labels > 0 else tf.identity
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=TFImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        output_hidden_states: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[tuple, TFImageClassifierOutputWithNoAttention]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Fall back to the config when `return_dict` is not given.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilevit(
            pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict, training=training
        )

        # In tuple form the pooled output is the second entry of the main layer's output.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(self.dropout(pooled_output, training=training))
        # The loss is only computed when labels are provided.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            # Tuple form: (loss,) if any, then logits, then whatever follows the pooled output.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)

    def build(self, input_shape=None):
        """Build the main layer and the classifier once, each inside its own `tf.name_scope`."""
        if self.built:
            return
        self.built = True

        if getattr(self, "mobilevit", None) is not None:
            with tf.name_scope(self.mobilevit.name):
                self.mobilevit.build(None)

        if getattr(self, "classifier", None) is not None:
            # `tf.identity` has no `name`; only an actual layer is built.
            if hasattr(self.classifier, "name"):
                with tf.name_scope(self.classifier.name):
                    self.classifier.build([None, None, self.config.neck_hidden_sizes[-1]])
class TFMobileViTASPPPooling(keras.layers.Layer):
    """Global average pooling, a 1x1 convolution, then a bilinear resize back to the input's spatial size."""

    def __init__(self, config: MobileViTConfig, in_channels: int, out_channels: int, **kwargs) -> None:
        super().__init__(**kwargs)

        # `keepdims=True` keeps the pooled axes so the result can still go through a convolution.
        self.global_pool = keras.layers.GlobalAveragePooling2D(keepdims=True, name="global_pool")

        # 1x1 convolution with normalization and a ReLU activation.
        self.conv_1x1 = TFMobileViTConvLayer(
            config,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_normalization=True,
            use_activation="relu",
            name="conv_1x1",
        )

    def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Pool `features` globally, convolve, and resize the result to the original spatial size."""
        # All axes except the first and the last, captured before pooling.
        target_size = shape_list(features)[1:-1]
        pooled = self.conv_1x1(self.global_pool(features), training=training)
        return tf.image.resize(pooled, size=target_size, method="bilinear")

    def build(self, input_shape=None):
        """Build both sub-layers once, each inside a `tf.name_scope` named after it."""
        if self.built:
            return
        self.built = True

        # (attribute name, shape passed to its `build`), in build order.
        build_plan = (
            ("global_pool", [None, None, None, None]),
            ("conv_1x1", None),
        )
        for attr_name, build_shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)


class TFMobileViTASPP(keras.layers.Layer):
    """
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587

    Runs five parallel branches over the same input (a 1x1 convolution, three dilated 3x3 convolutions and a
    pooling branch), concatenates their outputs along the last axis, and projects the result with a 1x1
    convolution followed by dropout.
    """

    def __init__(self, config: MobileViTConfig, **kwargs) -> None:
        super().__init__(**kwargs)

        # Input width comes from the second-to-last neck size, output width from the ASPP setting.
        in_channels = config.neck_hidden_sizes[-2]
        out_channels = config.aspp_out_channels

        # The projection below is sized for exactly five branches, hence exactly three dilation rates.
        if len(config.atrous_rates) != 3:
            raise ValueError("Expected 3 values for atrous_rates")

        # Branch order (and the `convs.N` names) must stay: 1x1 conv, dilated convs, pooling branch.
        self.convs = [
            TFMobileViTConvLayer(
                config,
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                use_activation="relu",
                name="convs.0",
            ),
            *(
                TFMobileViTConvLayer(
                    config,
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    dilation=rate,
                    use_activation="relu",
                    name=f"convs.{index}",
                )
                for index, rate in enumerate(config.atrous_rates, start=1)
            ),
            TFMobileViTASPPPooling(
                config, in_channels, out_channels, name=f"convs.{len(config.atrous_rates) + 1}"
            ),
        ]

        # 1x1 projection over the concatenation of the five branch outputs.
        self.project = TFMobileViTConvLayer(
            config,
            in_channels=5 * out_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_activation="relu",
            name="project",
        )

        self.dropout = keras.layers.Dropout(config.aspp_dropout_prob)

    def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply every branch to `features`, concatenate on the last axis, then project and apply dropout."""
        # Move axis 1 to the end: `(batch, channels, height, width)` -> `(batch, height, width, channels)`
        # (assuming a channels-first input -- TODO confirm against the caller).
        features = tf.transpose(features, perm=[0, 2, 3, 1])

        branch_outputs = [branch(features, training=training) for branch in self.convs]
        stacked = tf.concat(branch_outputs, axis=-1)

        projected = self.project(stacked, training=training)
        return self.dropout(projected, training=training)

    def build(self, input_shape=None):
        """Build the projection layer and every branch once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        projection = getattr(self, "project", None)
        if projection is not None:
            with tf.name_scope(projection.name):
                projection.build(None)

        for branch in getattr(self, "convs", None) or []:
            with tf.name_scope(branch.name):
                branch.build(None)
class TFMobileViTDeepLabV3(keras.layers.Layer):
    """
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587

    Chains an ASPP module, dropout, and a 1x1 convolution that emits `config.num_labels` channels.
    """

    def __init__(self, config: MobileViTConfig, **kwargs) -> None:
        super().__init__(**kwargs)

        self.aspp = TFMobileViTASPP(config, name="aspp")
        self.dropout = keras.layers.Dropout(config.classifier_dropout_prob)

        # Plain 1x1 convolution with bias: no normalization and no activation on the classifier output.
        self.classifier = TFMobileViTConvLayer(
            config,
            in_channels=config.aspp_out_channels,
            out_channels=config.num_labels,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
            bias=True,
            name="classifier",
        )

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run ASPP, dropout and the classifier on the last entry of `hidden_states`."""
        # Only the final element is consumed; `hidden_states` is indexed like a sequence.
        last_hidden_state = hidden_states[-1]
        pooled = self.aspp(last_hidden_state, training=training)
        regularized = self.dropout(pooled, training=training)
        return self.classifier(regularized, training=training)

    def build(self, input_shape=None):
        """Build the ASPP module and the classifier once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        for attr_name in ("aspp", "classifier"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)


@add_start_docstrings(
    """
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    """,
    MOBILEVIT_START_DOCSTRING,
)
class TFMobileViTForSemanticSegmentation(TFMobileViTPreTrainedModel):
    def __init__(self, config: MobileViTConfig, **kwargs) -> None:
        super().__init__(config, **kwargs)

        # Number of segmentation classes.
        self.num_labels = config.num_labels
        # Backbone main layer, created with `expand_output=False` and named "mobilevit".
        self.mobilevit = TFMobileViTMainLayer(config, expand_output=False, name="mobilevit")
        # Segmentation head built on TFMobileViTDeepLabV3.
        self.segmentation_head = TFMobileViTDeepLabV3(config, name="segmentation_head")

    def hf_compute_loss(self, logits, labels):
        """
        Compute the masked sparse categorical cross-entropy between `logits` and `labels`.

        `logits` are first resized (bilinear) to the spatial size of `labels`. Positions whose label equals
        `config.semantic_loss_ignore_index` are excluded. The result is a tensor of shape `(1,)` holding the
        summed loss divided by the number of non-ignored positions.
        """
        # Upsample logits to the label resolution.
        # `labels` is of shape (batch_size, height, width)
        label_interp_shape = shape_list(labels)[1:]

        upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear")
        # Per-position loss (no reduction) so that the ignore mask can be applied afterwards.
        loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

        def masked_loss(real, pred):
            unmasked_loss = loss_fct(real, pred)
            mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype)
            masked_loss = unmasked_loss * mask
            # Reduction strategy in the similar spirit with
            # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210
            reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask)
            return tf.reshape(reduced_masked_loss, (1,))

        return masked_loss(labels, upsampled_logits)

    # NOTE: the docstring of `call` must keep a bare `Returns:` line; `replace_return_docstrings`
    # substitutes the generated return documentation at that position.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(MOBILEVIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFSemanticSegmenterOutputWithNoAttention, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        labels: tf.Tensor | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> tuple | TFSemanticSegmenterOutputWithNoAttention:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Positions equal to
            `config.semantic_loss_ignore_index` are ignored. Passing `labels` requires `config.num_labels > 1`.

        Returns:
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilevit(
            pixel_values,
            output_hidden_states=True,  # we need the intermediate hidden states
            return_dict=return_dict,
            training=training,
        )

        # Hidden states of the encoder, consumed by the segmentation head.
        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        logits = self.segmentation_head(encoder_hidden_states, training=training)

        loss = None
        if labels is not None:
            if not self.config.num_labels > 1:
                raise ValueError("The number of labels should be greater than one")
            else:
                loss = self.hf_compute_loss(logits=logits, labels=labels)

        # Make logits of shape (batch_size, num_labels, height, width) to
        # keep them consistent across APIs
        logits = tf.transpose(logits, perm=[0, 3, 1, 2])

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFSemanticSegmenterOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
        )

    def build(self, input_shape=None):
        """Build the backbone and the segmentation head once, each inside its own name scope."""
        # Nothing to do if the model was already built.
        if self.built:
            return
        self.built = True

        if getattr(self, "mobilevit", None) is not None:
            with tf.name_scope(self.mobilevit.name):
                self.mobilevit.build(None)

        if getattr(self, "segmentation_head", None) is not None:
            with tf.name_scope(self.segmentation_head.name):
                self.segmentation_head.build(None)

Transformers-源码解析-七十七-

Transformers 源码解析（七十七）

.\models\mobilenet_v2\modeling_mobilenet_v2.py

.\models\mobilenet_v2\__init__.py

.\models\mobilevit\configuration_mobilevit.py

.\models\mobilevit\convert_mlcvnets_to_pytorch.py

.\models\mobilevit\feature_extraction_mobilevit.py

.\models\mobilevit\image_processing_mobilevit.py

.\models\mobilevit\modeling_mobilevit.py

.\models\mobilevit\modeling_tf_mobilevit.py

`.\models\mobilenet_v2\modeling_mobilenet_v2.py`

`.\models\mobilenet_v2\__init__.py`

`.\models\mobilevit\configuration_mobilevit.py`

`.\models\mobilevit\convert_mlcvnets_to_pytorch.py`

`.\models\mobilevit\feature_extraction_mobilevit.py`

`.\models\mobilevit\image_processing_mobilevit.py`

`.\models\mobilevit\modeling_mobilevit.py`

`.\models\mobilevit\modeling_tf_mobilevit.py`