Transformers 源码解析（一百一十五）

`.\models\unispeech_sat\__init__.py`

# 导入必要的模块和函数，包括依赖检查和异常处理
from typing import TYPE_CHECKING

# 导入自定义的异常类和延迟加载模块函数
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_torch_available,
)

# Lazy-import table: each key is a submodule name and each value lists the public names it provides.
# The configuration entries need no optional dependency, so they are always registered.
_import_structure = {
    "configuration_unispeech_sat": ["UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "UniSpeechSatConfig"],
}

# Register the modeling names only when torch is available. A missing torch raises
# OptionalDependencyNotAvailable, which is caught immediately and silently skipped.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: add the model classes to the lazy-import table.
    _import_structure["modeling_unispeech_sat"] = [
        "UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "UniSpeechSatForAudioFrameClassification",
        "UniSpeechSatForCTC",
        "UniSpeechSatForPreTraining",
        "UniSpeechSatForSequenceClassification",
        "UniSpeechSatForXVector",
        "UniSpeechSatModel",
        "UniSpeechSatPreTrainedModel",
    ]

# Under static type checking, import the real objects eagerly so that type checkers can see them.
if TYPE_CHECKING:
    from .configuration_unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig

    # Same torch-availability guard as used when filling `_import_structure`.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_unispeech_sat import (
            UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST,
            UniSpeechSatForAudioFrameClassification,
            UniSpeechSatForCTC,
            UniSpeechSatForPreTraining,
            UniSpeechSatForSequenceClassification,
            UniSpeechSatForXVector,
            UniSpeechSatModel,
            UniSpeechSatPreTrainedModel,
        )

# At runtime, do not import the submodules up front; register a lazy module instead.
else:
    import sys

    # Replace this module's entry in sys.modules with a _LazyModule built from `_import_structure`
    # (presumably it resolves each listed name on first access -- confirm in `_LazyModule`).
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\univnet\configuration_univnet.py`

# 版权声明和信息，指出此代码版权归HuggingFace团队所有，并使用Apache许可证2.0授权
#
# 在遵守许可证的前提下，您可以使用此文件。您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 本软件是基于"原样"提供的，没有任何明示或暗示的担保或条件。详情请查看许可证。
""" UnivNetModel 模型配置"""

# 导入必要的模块和函数
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Maps each pretrained checkpoint name to the URL of its `config.json`.
UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "dg845/univnet-dev": "https://huggingface.co/dg845/univnet-dev/resolve/main/config.json",
}

# Configuration class for UnivNet models; inherits from PretrainedConfig.
class UnivNetConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`UnivNetModel`]. It is used to instantiate a
    UnivNet speech synthesis model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults yields a configuration similar to that of the UnivNet
    [dg845/univnet-dev](https://huggingface.co/dg845/univnet-dev) architecture, which corresponds to the 'c32'
    architecture in [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/master/config/default_c32.yaml).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        model_in_channels (`int`, *optional*, defaults to 64):
            Number of input channels of the model.
        model_hidden_channels (`int`, *optional*, defaults to 32):
            Number of hidden channels of the model.
        num_mel_bins (`int`, *optional*, defaults to 100):
            Number of frequency bins of the mel spectrogram.
        resblock_kernel_sizes (`List[int]`, *optional*, defaults to `[3, 3, 3]`):
            Convolution kernel size of each resnet block. Its length is the number of resnet blocks in the model
            and must match the lengths of `resblock_stride_sizes` and `resblock_dilation_sizes`.
        resblock_stride_sizes (`List[int]`, *optional*, defaults to `[8, 8, 4]`):
            Stride size of each resnet block. Must have the same length as `resblock_kernel_sizes`.
        resblock_dilation_sizes (`List[List[int]]`, *optional*, defaults to `[[1, 3, 9, 27], [1, 3, 9, 27], [1, 3, 9, 27]]`):
            Dilation sizes used inside each resnet block (one inner list per block). Must have the same length as
            `resblock_kernel_sizes`.
        kernel_predictor_num_blocks (`int`, *optional*, defaults to 3):
            Number of blocks in the kernel predictor.
        kernel_predictor_hidden_channels (`int`, *optional*, defaults to 64):
            Number of hidden channels in the kernel predictor.
        kernel_predictor_conv_size (`int`, *optional*, defaults to 3):
            Convolution size used in the kernel predictor.
        kernel_predictor_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability used in the kernel predictor.
        initializer_range (`float`, *optional*, defaults to 0.01):
            Initialization range for the model parameters.
        leaky_relu_slope (`float`, *optional*, defaults to 0.2):
            Slope of the leaky ReLU activation.
        kwargs:
            Additional keyword arguments forwarded to [`PretrainedConfig`].

    Raises:
        ValueError: If `resblock_kernel_sizes`, `resblock_stride_sizes` and `resblock_dilation_sizes` do not all
            have the same length.

    Example:

    ```python
    >>> from transformers import UnivNetModel, UnivNetConfig

    >>> # Initializing a Tortoise TTS style configuration
    >>> configuration = UnivNetConfig()

    >>> # Initializing a model (with random weights) from the Tortoise TTS style configuration
    >>> model = UnivNetModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # Model type identifier.
    model_type = "univnet"

    def __init__(
        self,
        model_in_channels=64,
        model_hidden_channels=32,
        num_mel_bins=100,
        resblock_kernel_sizes=[3, 3, 3],
        resblock_stride_sizes=[8, 8, 4],
        resblock_dilation_sizes=[[1, 3, 9, 27], [1, 3, 9, 27], [1, 3, 9, 27]],
        kernel_predictor_num_blocks=3,
        kernel_predictor_hidden_channels=64,
        kernel_predictor_conv_size=3,
        kernel_predictor_dropout=0.0,
        initializer_range=0.01,
        leaky_relu_slope=0.2,
        **kwargs,
    ):
        # The three per-block lists describe the same stack of resnet blocks, so they must agree in length;
        # that shared length is the number of resnet blocks in the model.
        if not (len(resblock_kernel_sizes) == len(resblock_stride_sizes) == len(resblock_dilation_sizes)):
            raise ValueError(
                "`resblock_kernel_sizes`, `resblock_stride_sizes`, and `resblock_dilation_sizes` must all have the"
                " same length (which will be the number of resnet blocks in the model)."
            )

        # Overall model widths and spectrogram size.
        self.model_in_channels = model_in_channels
        self.model_hidden_channels = model_hidden_channels
        self.num_mel_bins = num_mel_bins

        # Per-resnet-block settings. The list defaults in the signature are only read here, never mutated.
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_stride_sizes = resblock_stride_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes

        # Kernel predictor settings.
        self.kernel_predictor_num_blocks = kernel_predictor_num_blocks
        self.kernel_predictor_hidden_channels = kernel_predictor_hidden_channels
        self.kernel_predictor_conv_size = kernel_predictor_conv_size
        self.kernel_predictor_dropout = kernel_predictor_dropout

        # Initialization and activation settings.
        self.initializer_range = initializer_range
        self.leaky_relu_slope = leaky_relu_slope

        # Hand any remaining keyword arguments to the parent constructor.
        super().__init__(**kwargs)

`.\models\univnet\convert_univnet.py`

# 引入命令行参数解析模块
import argparse

# 引入 PyTorch 模块
import torch

# 从 transformers 库中引入 UnivNetConfig、UnivNetModel 和 logging 模块
from transformers import UnivNetConfig, UnivNetModel, logging

# 设置 logging 模块的详细信息级别
logging.set_verbosity_info()

# 获取名为 "transformers.models.univnet" 的日志记录器
logger = logging.get_logger("transformers.models.univnet")


# Build the key mapping for one kernel predictor sub-network.
def get_kernel_predictor_key_mapping(config: UnivNetConfig, old_prefix: str = "", new_prefix: str = ""):
    """
    Return a dict that maps original kernel-predictor parameter names to their converted names.

    Args:
        config: Configuration object; only `kernel_predictor_num_blocks` is read.
        old_prefix: Prefix placed (followed by a dot) in front of every original key.
        new_prefix: Prefix placed (followed by a dot) in front of every converted key.

    Returns:
        `dict` from original key to converted key, covering the `weight_g`, `weight_v` and `bias` parameters of
        the input conv, both convs of every residual block, the kernel conv and the bias conv.
    """
    # Every listed layer contributes these three parameters, in this order.
    param_names = ("weight_g", "weight_v", "bias")

    # (original layer path, converted layer path) pairs, in the order they are emitted.
    layer_pairs = [("input_conv.0", "input_conv")]
    for block_idx in range(config.kernel_predictor_num_blocks):
        layer_pairs.append((f"residual_convs.{block_idx}.1", f"resblocks.{block_idx}.conv1"))
        layer_pairs.append((f"residual_convs.{block_idx}.3", f"resblocks.{block_idx}.conv2"))
    layer_pairs.append(("kernel_conv", "kernel_conv"))
    layer_pairs.append(("bias_conv", "bias_conv"))

    return {
        f"{old_prefix}.{old_layer}.{param}": f"{new_prefix}.{new_layer}.{param}"
        for old_layer, new_layer in layer_pairs
        for param in param_names
    }


# Build the key mapping for the whole generator.
def get_key_mapping(config: UnivNetConfig):
    """
    Return a dict that maps original generator parameter names to their converted names.

    The number of outer blocks is `len(config.resblock_stride_sizes)`; the number of inner conv blocks of block
    `i` is `len(config.resblock_dilation_sizes[i])`. Keys that do not appear in the returned dict (for example
    those of the initial conv layer) keep their original name.

    Args:
        config: Configuration object describing the residual stack.

    Returns:
        `dict` from original key to converted key.
    """
    param_names = ("weight_g", "weight_v", "bias")
    key_map = {}

    # LVC residual blocks
    for block_idx in range(len(config.resblock_stride_sizes)):
        old_block = f"res_stack.{block_idx}"
        new_block = f"resblocks.{block_idx}"

        # Initial (transposed) conv of the LVC block.
        for param in param_names:
            key_map[f"{old_block}.convt_pre.1.{param}"] = f"{new_block}.convt_pre.{param}"

        # Kernel predictor of this block.
        key_map.update(
            get_kernel_predictor_key_mapping(
                config, old_prefix=f"{old_block}.kernel_predictor", new_prefix=f"{new_block}.kernel_predictor"
            )
        )

        # Inner conv layers of this block, one per dilation entry.
        for layer_idx in range(len(config.resblock_dilation_sizes[block_idx])):
            for param in param_names:
                key_map[f"{old_block}.conv_blocks.{layer_idx}.1.{param}"] = (
                    f"{new_block}.resblocks.{layer_idx}.conv.{param}"
                )

    # Output conv layer.
    for param in param_names:
        key_map[f"conv_post.1.{param}"] = f"conv_post.{param}"

    return key_map
# Rename (and optionally drop) entries of a state dict.
def rename_state_dict(state_dict, keys_to_modify, keys_to_remove):
    """
    Return a new dict built from `state_dict` with some keys renamed and some dropped.

    Args:
        state_dict: Mapping from parameter name to value.
        keys_to_modify: Mapping from original key to replacement key.
        keys_to_remove: Container of keys to leave out; checked before any renaming.

    Returns:
        A new `dict`; `state_dict` itself is not modified. Keys listed in neither argument are copied unchanged.
    """
    return {
        keys_to_modify.get(name, name): tensor
        for name, tensor in state_dict.items()
        if name not in keys_to_remove
    }


def convert_univnet_checkpoint(
    checkpoint_path,
    pytorch_dump_folder_path,
    config_path=None,
    repo_id=None,
    safe_serialization=False,
):
    """
    Convert an original UnivNet checkpoint into a `UnivNetModel` and save it.

    Args:
        checkpoint_path: Path of the original checkpoint; its `"model_g"` entry is used as the state dict.
        pytorch_dump_folder_path: Folder passed to `save_pretrained`.
        config_path: Optional path passed to `UnivNetConfig.from_pretrained`; the default config is used if `None`.
        repo_id: If truthy, the converted model is also pushed to this hub repository.
        safe_serialization: Forwarded to `save_pretrained`.
    """
    # NOTE(review): torch.load unpickles the file -- only run this on checkpoints you trust.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # Generator weights of the original checkpoint.
    generator_weights = checkpoint["model_g"]

    config = UnivNetConfig() if config_path is None else UnivNetConfig.from_pretrained(config_path)

    # Rename the original keys; nothing is removed.
    hf_state_dict = rename_state_dict(generator_weights, get_key_mapping(config), set())

    model = UnivNetModel(config)
    # The renamed keys are weight-norm parameters (`weight_g` / `weight_v`), so weight norm has to be applied
    # before loading and is removed again afterwards.
    model.apply_weight_norm()
    model.load_state_dict(hf_state_dict)
    model.remove_weight_norm()

    model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization)

    # Uploading is optional.
    if not repo_id:
        return
    print("Pushing to the hub...")
    model.push_to_hub(repo_id)


def main():
    """Parse the command-line options and run the checkpoint conversion."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
    cli.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    cli.add_argument(
        "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
    )
    cli.add_argument(
        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
    )
    cli.add_argument(
        "--safe_serialization", action="store_true", help="Whether to save the model using `safetensors`."
    )
    options = cli.parse_args()

    # `--push_to_hub` carries the target repository id.
    convert_univnet_checkpoint(
        checkpoint_path=options.checkpoint_path,
        pytorch_dump_folder_path=options.pytorch_dump_folder_path,
        config_path=options.config_path,
        repo_id=options.push_to_hub,
        safe_serialization=options.safe_serialization,
    )


if __name__ == "__main__":
    main()

`.\models\univnet\feature_extraction_univnet.py`

# 版权声明和许可信息，指明代码版权和使用许可条件
# 详细描述使用 Apache 许可证 2.0 版本，允许在遵守许可的前提下使用此代码
#
# 导入必要的库和模块
from typing import Any, Dict, List, Optional, Union

import numpy as np  # 导入 NumPy 库，用于数值计算

# 导入音频处理相关工具函数
from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
# 导入特征提取序列工具类
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
# 导入特征提取批处理类
from ...feature_extraction_utils import BatchFeature
# 导入通用工具类
from ...utils import PaddingStrategy, TensorType, logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# Feature extractor for UnivNet; inherits from SequenceFeatureExtractor.
class UnivNetFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a UnivNet feature extractor.

    This class extracts log-mel-filter bank features from raw speech using the short time Fourier Transform (STFT).
    The STFT implementation follows that of TacoTron 2 and Hifi-GAN.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which
    contains most of the main methods. Users should refer to this superclass for more information regarding those
    methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate of the audio, in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value used for padding.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether to normalize the log-mel spectrograms with [`~UnivNetFeatureExtractor.normalize`].
        num_mel_bins (`int`, *optional*, defaults to 100):
            The number of mel filters.
        hop_length (`int`, *optional*, defaults to 256):
            The hop length of the STFT, in samples.
        win_length (`int`, *optional*, defaults to 1024):
            The length of the analysis window, in samples.
        win_function (`str`, *optional*, defaults to `"hann_window"`):
            Name of the window function, passed to `window_function`.
        filter_length (`int`, *optional*, defaults to 1024):
            The FFT length. If `None`, `optimal_fft_length(win_length)` is used instead.
        max_length_s (`int`, *optional*, defaults to 10):
            The maximum input length in seconds; `num_max_samples` is `max_length_s * sampling_rate`.
        fmin (`float`, *optional*, defaults to 0.0):
            Minimum mel frequency in Hz.
        fmax (`float`, *optional*):
            Maximum mel frequency in Hz. If `None`, `sampling_rate / 2` is used.
        mel_floor (`float`, *optional*, defaults to 1e-09):
            Value added under the square root when computing the amplitude spectrogram.
        center (`bool`, *optional*, defaults to `False`):
            Forwarded to `spectrogram` as its `center` argument.
        compression_factor (`float`, *optional*, defaults to 1.0):
            Multiplicative factor applied before taking the logarithm.
        compression_clip_val (`float`, *optional*, defaults to 1e-05):
            Lower clipping bound applied to the mel spectrogram before taking the logarithm.
        normalize_min (`float`, *optional*, defaults to -11.512925148010254):
            Minimum value used by `normalize` / `denormalize`.
        normalize_max (`float`, *optional*, defaults to 2.3143386840820312):
            Maximum value used by `normalize` / `denormalize`.
        model_in_channels (`int`, *optional*, defaults to 64):
            Size of the second dimension of the generated noise.
        pad_end_length (`int`, *optional*, defaults to 10):
            Default number of spectrogram frames (each `hop_length` samples) appended to every waveform when
            `pad_end=True` is passed to `__call__`.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Forwarded to the superclass constructor.
    """

    # Names of the inputs produced for the model.
    model_input_names = ["input_features", "noise_sequence", "padding_mask"]

    def __init__(
        self,
        feature_size: int = 1,
        sampling_rate: int = 24000,
        padding_value: float = 0.0,
        do_normalize: bool = False,
        num_mel_bins: int = 100,
        hop_length: int = 256,
        win_length: int = 1024,
        win_function: str = "hann_window",
        filter_length: Optional[int] = 1024,
        max_length_s: int = 10,
        fmin: float = 0.0,
        fmax: Optional[float] = None,
        mel_floor: float = 1e-9,
        center: bool = False,
        compression_factor: float = 1.0,
        compression_clip_val: float = 1e-5,
        normalize_min: float = -11.512925148010254,
        normalize_max: float = 2.3143386840820312,
        model_in_channels: int = 64,
        pad_end_length: int = 10,
        return_attention_mask=True,
        **kwargs,
    ):
        # Initialize the base class exactly once, with the sequence-related settings.
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            return_attention_mask=return_attention_mask,
            **kwargs,
        )

        self.do_normalize = do_normalize

        # STFT / mel filter settings.
        self.num_mel_bins = num_mel_bins
        self.hop_length = hop_length
        self.win_length = win_length
        self.win_function = win_function
        self.filter_length = filter_length
        self.fmin = fmin
        if fmax is None:
            # Follows the librosa.filters.mel implementation.
            fmax = float(sampling_rate) / 2
        self.fmax = fmax
        self.mel_floor = mel_floor

        # Maximum input length, in seconds and in samples.
        self.max_length_s = max_length_s
        self.num_max_samples = max_length_s * sampling_rate

        # FFT length: explicit `filter_length` if given, otherwise derived from the window length.
        if self.filter_length is None:
            self.n_fft = optimal_fft_length(self.win_length)
        else:
            self.n_fft = self.filter_length
        self.n_freqs = (self.n_fft // 2) + 1

        self.window = window_function(window_length=self.win_length, name=self.win_function, periodic=True)

        self.mel_filters = mel_filter_bank(
            num_frequency_bins=self.n_freqs,
            num_mel_filters=self.num_mel_bins,
            min_frequency=self.fmin,
            max_frequency=self.fmax,
            sampling_rate=self.sampling_rate,
            norm="slaney",
            mel_scale="slaney",
        )

        # Remaining spectrogram / normalization / noise settings.
        self.center = center
        self.compression_factor = compression_factor
        self.compression_clip_val = compression_clip_val
        self.normalize_min = normalize_min
        self.normalize_max = normalize_max
        self.model_in_channels = model_in_channels
        self.pad_end_length = pad_end_length

    def normalize(self, spectrogram):
        """Linearly map `spectrogram` from `[normalize_min, normalize_max]` to `[-1, 1]`."""
        return 2 * ((spectrogram - self.normalize_min) / (self.normalize_max - self.normalize_min)) - 1

    def denormalize(self, spectrogram):
        """Inverse of `normalize`: map `spectrogram` from `[-1, 1]` back to `[normalize_min, normalize_max]`."""
        return self.normalize_min + (self.normalize_max - self.normalize_min) * ((spectrogram + 1) / 2)

    def mel_spectrogram(self, waveform: np.ndarray) -> np.ndarray:
        """
        Calculates the log MEL spectrogram of a waveform. Note that the input waveform will be padded by
        `int((self.n_fft - self.hop_length) / 2)` on both sides using the `reflect` padding mode.

        Args:
            waveform (`np.ndarray` of shape `(length,)`):
                The input waveform. This must be a single real-valued, mono waveform.

        Returns:
            `numpy.ndarray`: Array containing a log-mel spectrogram of shape `(num_frames, num_mel_bins)`.
        """
        # Do custom padding based on the official MelGAN and Hifi-GAN implementations.
        waveform = np.pad(
            waveform,
            (int((self.n_fft - self.hop_length) / 2), int((self.n_fft - self.hop_length) / 2)),
            mode="reflect",
        )

        # Get the complex spectrogram.
        # Note: waveforms are currently processed one at a time because of how spectrogram(...) is implemented.
        complex_spectrogram = spectrogram(
            waveform,
            window=self.window,
            frame_length=self.n_fft,
            hop_length=self.hop_length,
            fft_length=self.n_fft,
            power=None,
            center=self.center,
            mel_filters=None,
            mel_floor=None,
        )

        # Apply the MEL filter bank and MEL floor manually since UnivNet uses a slightly different implementation.
        amplitude_spectrogram = np.sqrt(
            np.real(complex_spectrogram) ** 2 + np.imag(complex_spectrogram) ** 2 + self.mel_floor
        )
        mel_spectrogram = np.matmul(self.mel_filters.T, amplitude_spectrogram)

        # Perform spectral normalization to get the log mel spectrogram.
        log_mel_spectrogram = np.log(
            np.clip(mel_spectrogram, a_min=self.compression_clip_val, a_max=None) * self.compression_factor
        )

        # Return spectrogram with num_mel_bins last.
        return log_mel_spectrogram.T

    def generate_noise(
        self,
        noise_length: int,
        generator: Optional[np.random.Generator] = None,
    ) -> np.ndarray:
        """
        Generates a random noise sequence of standard Gaussian noise for use in the `noise_sequence` argument of
        [`UnivNetModel.forward`].

        Args:
            noise_length (`int`):
                The length of the generated noise sequence.
            generator (`numpy.random.Generator`, *optional*, defaults to `None`):
                An optional random number generator to control noise generation. If not provided, a new generator
                instance will be created using `np.random.default_rng()`.

        Returns:
            `numpy.ndarray`: Array containing random standard Gaussian noise of shape `(noise_length,
            self.model_in_channels)`.
        """
        if generator is None:
            generator = np.random.default_rng()

        noise_shape = (noise_length, self.model_in_channels)
        noise = generator.standard_normal(noise_shape, dtype=np.float32)

        return noise

    def batch_decode(self, waveforms, waveform_lengths=None) -> List[np.ndarray]:
        r"""
        Removes padding from generated audio after running [`UnivNetModel.forward`]. This returns a ragged list of 1D
        audio waveform arrays and not a single tensor/array because in general the waveforms will have different
        lengths after removing padding.

        Args:
            waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                The batched output waveforms from the [`UnivNetModel`].
            waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
                The batched lengths of each waveform before padding.

        Returns:
            `List[np.ndarray]`: A ragged list of 1D waveform arrays with padding removed.
        """
        # Collapse the batched waveform tensor to a list of 1D audio waveforms.
        waveforms = [waveform.detach().clone().cpu().numpy() for waveform in waveforms]

        if waveform_lengths is not None:
            waveforms = [waveform[: waveform_lengths[i]] for i, waveform in enumerate(waveforms)]

        return waveforms

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        sampling_rate: Optional[int] = None,
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_noise: bool = True,
        generator: Optional[np.random.Generator] = None,
        pad_end: bool = False,
        pad_length: Optional[int] = None,
        do_normalize: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        NOTE(review): in this listing the body of `__call__` had been replaced by the body of `to_dict`. The
        implementation below is reconstructed to follow the upstream Transformers implementation -- confirm it
        against the installed library version.

        Args:
            raw_speech: A single mono waveform or a batch of mono waveforms (arrays or lists of floats).
            sampling_rate: Sampling rate of `raw_speech`; must equal `self.sampling_rate` when given.
            padding: Padding strategy forwarded to `self.pad`.
            max_length: Maximum length forwarded to `self.pad`; defaults to `self.num_max_samples`.
            truncation: Whether `self.pad` truncates inputs longer than `max_length`.
            pad_to_multiple_of: Forwarded to `self.pad`.
            return_noise: Whether to add a `noise_sequence` entry generated with `generate_noise`.
            generator: Optional random generator used for the noise.
            pad_end: Whether to append `pad_length * hop_length` padding samples to every waveform first.
            pad_length: Number of frames appended when `pad_end` is set; defaults to `self.pad_end_length`.
            do_normalize: Whether to normalize the spectrograms; defaults to `self.do_normalize`.
            return_attention_mask: Forwarded to `self.pad`.
            return_tensors: If set, the result is converted with `BatchFeature.convert_to_tensors`.

        Returns:
            [`BatchFeature`] with `input_features` (log-mel spectrograms) and, when available / requested,
            `padding_mask` and `noise_sequence`.

        Raises:
            ValueError: If `sampling_rate` differs from `self.sampling_rate`, or if a batched array input has
                more than two dimensions (only mono audio is supported).
        """
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
                    f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
                    f" was sampled with {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
        if is_batched_numpy and len(raw_speech.shape) > 2:
            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
        is_batched = is_batched_numpy or (
            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
        )

        if is_batched:
            raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
        elif not isinstance(raw_speech, np.ndarray):
            raw_speech = np.asarray(raw_speech, dtype=np.float32)
        elif raw_speech.dtype is np.dtype(np.float64):
            raw_speech = raw_speech.astype(np.float32)

        # Always work on a batch.
        if not is_batched:
            raw_speech = [np.asarray(raw_speech, dtype=np.float32)]

        # Optionally pad the end of each waveform (in whole frames) to reduce artifacts.
        if pad_end:
            pad_length = pad_length if pad_length is not None else self.pad_end_length
            raw_speech = [
                np.pad(waveform, (0, pad_length * self.hop_length), constant_values=self.padding_value)
                for waveform in raw_speech
            ]

        batched_speech = BatchFeature({"input_features": raw_speech})

        padded_inputs = self.pad(
            batched_speech,
            padding=padding,
            max_length=max_length if max_length is not None else self.num_max_samples,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        input_features = padded_inputs.get("input_features")

        # One log-mel spectrogram per (padded) waveform.
        mel_spectrograms = [self.mel_spectrogram(waveform) for waveform in input_features]

        if isinstance(input_features[0], list):
            batched_speech["input_features"] = [np.asarray(mel, dtype=np.float32) for mel in mel_spectrograms]
        else:
            batched_speech["input_features"] = [mel.astype(np.float32) for mel in mel_spectrograms]

        # Expose the attention mask under the name listed in `model_input_names`.
        attention_mask = padded_inputs.get("attention_mask")
        if attention_mask is not None:
            batched_speech["padding_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]

        if return_noise:
            # One noise row per spectrogram frame.
            batched_speech["noise_sequence"] = [
                self.generate_noise(mel.shape[0], generator) for mel in batched_speech["input_features"]
            ]

        if do_normalize:
            batched_speech["input_features"] = [self.normalize(mel) for mel in batched_speech["input_features"]]

        if return_tensors is not None:
            batched_speech = batched_speech.convert_to_tensors(return_tensors)

        return batched_speech

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this instance to a dict, leaving out attributes that are derived in `__init__`."""
        output = super().to_dict()

        # These are computed from the other settings in `__init__`, so they are not serialized.
        names = ["window", "mel_filters", "n_fft", "n_freqs", "num_max_samples"]
        for name in names:
            if name in output:
                del output[name]

        return output

`.\models\univnet\modeling_univnet.py`

# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch UnivNetModel model."""

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...modeling_utils import ModelOutput, PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_univnet import UnivNetConfig


# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# General docstring
# (presumably substituted into generated docstrings by the docstring decorators imported above -- confirm)
_CONFIG_FOR_DOC = "UnivNetConfig"

_CHECKPOINT_FOR_DOC = "dg845/univnet-dev"

# Known pretrained UnivNet checkpoint names.
UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "dg845/univnet-dev",
    # See all UnivNet models at https://huggingface.co/models?filter=univnet
]


@dataclass
class UnivNetModelOutput(ModelOutput):
    """
    Output class for the [`UnivNetModel`], which includes the generated audio waveforms and the original unpadded
    lengths of those waveforms, so that the padding can be removed afterwards (e.g. with
    [`UnivNetFeatureExtractor.batch_decode`]).

    Args:
        waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Batched 1D (mono-channel) output audio waveforms.
        waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
            The batched length in samples of each unpadded waveform in `waveforms`.
    """

    # Both fields default to None.
    waveforms: torch.FloatTensor = None
    waveform_lengths: torch.FloatTensor = None


class UnivNetKernelPredictorResidualBlock(nn.Module):
    """
    Implementation of the residual block for the kernel predictor network inside each location variable convolution
    block (LVCBlock).

    The block applies dropout, then two same-width 1D convolutions each followed by a leaky ReLU, and adds the
    input back to the result.

    Parameters:
        config: (`UnivNetConfig`):
            Config for the `UnivNetModel` model. The attributes read are `kernel_predictor_hidden_channels`,
            `kernel_predictor_conv_size`, `kernel_predictor_dropout` and `leaky_relu_slope`.
    """

    def __init__(
        self,
        config: UnivNetConfig,
    ):
        super().__init__()
        # The block runs on the kernel predictor's hidden states, which have
        # `kernel_predictor_hidden_channels` channels (the output width of the predictor's input conv).
        # Sizing it with `model_in_channels` only worked when the two settings happened to be equal.
        self.channels = config.kernel_predictor_hidden_channels
        self.kernel_size = config.kernel_predictor_conv_size
        self.dropout_prob = config.kernel_predictor_dropout
        self.leaky_relu_slope = config.leaky_relu_slope

        # "Same" padding for an odd kernel size, so the sequence length is preserved.
        padding = (self.kernel_size - 1) // 2

        self.dropout = nn.Dropout(self.dropout_prob)
        self.conv1 = nn.Conv1d(self.channels, self.channels, self.kernel_size, padding=padding, bias=True)
        self.conv2 = nn.Conv1d(self.channels, self.channels, self.kernel_size, padding=padding, bias=True)

    def forward(self, hidden_states: torch.FloatTensor):
        """
        Apply the residual block.

        Args:
            hidden_states (`torch.FloatTensor`):
                Input with `self.channels` channels in the dimension expected by `nn.Conv1d`.

        Returns:
            `torch.FloatTensor`: The transformed hidden states plus the input (residual connection).
        """
        # Keep the untouched input for the skip connection.
        residual = hidden_states
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.conv1(hidden_states)
        hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
        hidden_states = self.conv2(hidden_states)
        hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
        return hidden_states + residual

    def apply_weight_norm(self):
        """Apply weight normalization to both convolutions."""
        nn.utils.weight_norm(self.conv1)
        nn.utils.weight_norm(self.conv2)

    def remove_weight_norm(self):
        """Remove weight normalization from both convolutions."""
        nn.utils.remove_weight_norm(self.conv1)
        nn.utils.remove_weight_norm(self.conv2)
class UnivNetKernelPredictor(nn.Module):
    """
    Implementation of the kernel predictor network which supplies the kernel and bias for the location variable
    convolutional layers (LVCs) in each UnivNet LVCBlock.

    Based on the KernelPredictor implementation in
    [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/model/lvcnet.py#L7).

    Parameters:
        config: (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
        conv_kernel_size (`int`, *optional*, defaults to 3):
            The kernel size for the location variable convolutional layer kernels (convolutional weight tensor).
        conv_layers (`int`, *optional*, defaults to 4):
            The number of location variable convolutional layers to output kernels and biases for.
    """

    def __init__(
        self,
        config: UnivNetConfig,
        conv_kernel_size: int = 3,
        conv_layers: int = 4,
    ):
        super().__init__()

        # Geometry of the location variable convolutions whose parameters are predicted here:
        # the output width is twice the input (hidden) width.
        self.conv_in_channels = config.model_hidden_channels
        self.conv_out_channels = 2 * config.model_hidden_channels
        self.conv_kernel_size = conv_kernel_size
        self.conv_layers = conv_layers

        # Total number of predicted kernel values / bias values per position, over all LVC layers.
        self.kernel_channels = (
            self.conv_in_channels * self.conv_out_channels * self.conv_kernel_size * self.conv_layers
        )
        self.bias_channels = self.conv_out_channels * self.conv_layers

        # Settings of the predictor's own residual network.
        self.resnet_in_channels = config.num_mel_bins
        self.resnet_hidden_channels = config.kernel_predictor_hidden_channels
        self.resnet_kernel_size = config.kernel_predictor_conv_size
        self.num_blocks = config.kernel_predictor_num_blocks

        self.leaky_relu_slope = config.leaky_relu_slope

        same_padding = (self.resnet_kernel_size - 1) // 2

        # Input conv: mel bins -> hidden channels (fixed kernel size 5 with padding 2).
        self.input_conv = nn.Conv1d(self.resnet_in_channels, self.resnet_hidden_channels, 5, padding=2, bias=True)

        # Stack of residual blocks.
        self.resblocks = nn.ModuleList([UnivNetKernelPredictorResidualBlock(config) for _ in range(self.num_blocks)])

        # Output heads: one predicts the LVC kernels, the other the LVC biases.
        self.kernel_conv = nn.Conv1d(
            self.resnet_hidden_channels, self.kernel_channels, self.resnet_kernel_size, padding=same_padding, bias=True
        )
        self.bias_conv = nn.Conv1d(
            self.resnet_hidden_channels, self.bias_channels, self.resnet_kernel_size, padding=same_padding, bias=True
        )

    def forward(self, spectrogram: torch.FloatTensor):
        """
        Maps a conditioning log-mel spectrogram to a tensor of convolutional kernels and biases, for use in location
        variable convolutional layers. Note that the input spectrogram should have shape (batch_size, input_channels,
        seq_length).

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, input_channels, seq_length)`):
                Tensor containing the log-mel spectrograms.

        Returns:
            Tuple[`torch.FloatTensor`, `torch.FloatTensor`]: tuple of tensors where the first element is the tensor
            of location variable convolution kernels of shape `(batch_size, self.conv_layers, self.conv_in_channels,
            self.conv_out_channels, self.conv_kernel_size, seq_length)` and the second element is the tensor of
            location variable convolution biases of shape `(batch_size, self.conv_layers, self.conv_out_channels,
            seq_length)`.
        """
        batch_size, _, seq_length = spectrogram.shape

        # Input conv + activation, then the residual stack.
        features = nn.functional.leaky_relu(self.input_conv(spectrogram), self.leaky_relu_slope)
        for block in self.resblocks:
            features = block(features)

        # Target shapes: split the flat channel dimension of each head into per-layer kernel / bias dimensions.
        kernel_shape = (
            batch_size,
            self.conv_layers,
            self.conv_in_channels,
            self.conv_out_channels,
            self.conv_kernel_size,
            seq_length,
        )
        bias_shape = (batch_size, self.conv_layers, self.conv_out_channels, seq_length)

        kernel_features = self.kernel_conv(features)
        bias_features = self.bias_conv(features)

        kernels = kernel_features.view(*kernel_shape).contiguous()
        biases = bias_features.view(*bias_shape).contiguous()

        return kernels, biases

    def apply_weight_norm(self):
        """Apply weight normalization to the input conv, every residual block and both output heads."""
        nn.utils.weight_norm(self.input_conv)
        for block in self.resblocks:
            block.apply_weight_norm()
        for head in (self.kernel_conv, self.bias_conv):
            nn.utils.weight_norm(head)

    def remove_weight_norm(self):
        """Remove weight normalization from the input conv, every residual block and both output heads."""
        nn.utils.remove_weight_norm(self.input_conv)
        for block in self.resblocks:
            block.remove_weight_norm()
        for head in (self.kernel_conv, self.bias_conv):
            nn.utils.remove_weight_norm(head)
class UnivNetLvcResidualBlock(nn.Module):
    """
    Residual block built around a location variable convolution (LVC), used by the UnivNet residual network.

    Parameters:
        config: (`UnivNetConfig`):
            Config for the `UnivNetModel` model. `model_hidden_channels` and `leaky_relu_slope` are read from it.
        kernel_size (`int`):
            The kernel size for the dilated 1D convolutional layer.
        dilation (`int`):
            The dilation for the dilated 1D convolutional layer.
    """

    def __init__(
        self,
        config: UnivNetConfig,
        kernel_size: int,
        dilation: int,
    ):
        super().__init__()
        self.hidden_channels = config.model_hidden_channels
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.leaky_relu_slope = config.leaky_relu_slope

        # Dilated 1D convolution that keeps the channel count unchanged.
        channels = self.hidden_channels
        self.conv = nn.Conv1d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=self.kernel_size,
            # (dilation * (kernel_size - 1)) floor-divided by two
            padding=self.dilation * (self.kernel_size - 1) // 2,
            dilation=self.dilation,
        )

    def forward(self, hidden_states, kernel, bias, hop_size=256):
        """
        Run the residual block.

        The input goes through leaky ReLU -> dilated convolution -> leaky ReLU -> location variable convolution,
        the result is passed through a gated activation, and the input is added back as a skip connection.

        Args:
            hidden_states: input tensor; channels are on dimension 1 (the gating below slices that dimension).
            kernel: kernel tensor forwarded unchanged to `location_variable_convolution`.
            bias: bias tensor forwarded unchanged to `location_variable_convolution`.
            hop_size: hop size forwarded to `location_variable_convolution` (defaults to 256).

        Returns:
            The sum of the input and the gated LVC output.
        """
        slope = self.leaky_relu_slope

        activated = nn.functional.leaky_relu(hidden_states, slope)
        convolved = nn.functional.leaky_relu(self.conv(activated), slope)

        # NOTE(review): `location_variable_convolution` is not defined in the visible part of this class.
        lvc_output = self.location_variable_convolution(convolved, kernel, bias, hop_size=hop_size)

        # Gated activation unit: the first `hidden_channels` channels act as a sigmoid gate on the tanh of the rest.
        gate = torch.sigmoid(lvc_output[:, : self.hidden_channels, :])
        candidate = torch.tanh(lvc_output[:, self.hidden_channels :, :])

        # Skip connection.
        return hidden_states + gate * candidate

    def apply_weight_norm(self):
        """Apply weight normalization to the dilated convolution."""
        nn.utils.weight_norm(self.conv)

    def remove_weight_norm(self):
        """Remove weight normalization from the dilated convolution."""
        nn.utils.remove_weight_norm(self.conv)
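

class UnivNetLvcBlock(nn.Module):
    """
    Location variable convolution (LVC) block. It applies a transposed convolution to the hidden states, predicts
    per-layer kernels and biases from the spectrogram with a `UnivNetKernelPredictor`, and runs a stack of
    `UnivNetLvcResidualBlock` layers with them.

    Parameters:
        config (`UnivNetConfig`):
            Config for the `UnivNetModel` model.
        layer_id (`int`):
            Index of the current LVC ResNet block layer. It should be between 0 and
            `len(config.resblock_stride_sizes) - 1` inclusive.
        lvc_hop_size (`int`, *optional*, defaults to 256):
            The hop size for the location variable convolutional layers.
    """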
    
    def __init__(
        self,
        config: UnivNetConfig,
        layer_id: int,
        lvc_hop_size: int = 256,
    ):
        """
        Build the layers of one LVC block.

        Args:
            config: model config; the kernel size, stride and dilations of this block are looked up with `layer_id`.
            layer_id: index into `config.resblock_kernel_sizes`, `config.resblock_stride_sizes` and
                `config.resblock_dilation_sizes`.
            lvc_hop_size: hop size later passed to every residual block as `hop_size`.
        """
        super().__init__()

        # Per-layer hyperparameters selected by `layer_id`.
        self.hidden_channels = config.model_hidden_channels
        self.kernel_size = config.resblock_kernel_sizes[layer_id]
        self.stride = config.resblock_stride_sizes[layer_id]
        self.dilations = config.resblock_dilation_sizes[layer_id]
        self.cond_hop_length = lvc_hop_size
        self.leaky_relu_slope = config.leaky_relu_slope
        self.num_blocks = len(self.dilations)

        # Transposed convolution applied before the residual blocks; odd strides need one extra unit of
        # padding and of output padding.
        stride = self.stride
        stride_is_odd = stride % 2
        self.convt_pre = nn.ConvTranspose1d(
            in_channels=self.hidden_channels,
            out_channels=self.hidden_channels,
            kernel_size=2 * stride,
            stride=stride,
            padding=stride // 2 + stride_is_odd,
            output_padding=stride_is_odd,
        )

        # Predicts the kernels and biases consumed by the residual blocks.
        self.kernel_predictor = UnivNetKernelPredictor(config, self.kernel_size, self.num_blocks)

        # One residual block per dilation value.
        self.resblocks = nn.ModuleList(
            [UnivNetLvcResidualBlock(config, self.kernel_size, dilation) for dilation in self.dilations]
        )

    def forward(self, hidden_states: torch.FloatTensor, spectrogram: torch.FloatTensor):
        """
        Run the LVC block.

        The hidden states go through a leaky ReLU and `convt_pre`; kernels and biases are predicted from the
        spectrogram; then every residual block is applied in order with its own kernel/bias slice.

        Args:
            hidden_states: `(batch_size, hidden_channels, seq_length)`
            spectrogram: `(batch_size, cond_channels, cond_length)`

        Returns:
            The hidden states after the last residual block.
        """
        output = self.convt_pre(nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope))

        # Dimension 1 of both predicted tensors indexes the residual block they belong to.
        kernels, biases = self.kernel_predictor(spectrogram)

        for block_idx, lvc_resblock in enumerate(self.resblocks):
            output = lvc_resblock(
                output,
                kernels[:, block_idx, :, :, :, :],
                biases[:, block_idx, :, :],
                hop_size=self.cond_hop_length,
            )

        return output

    def apply_weight_norm(self):
        """
        Apply weight normalization to `convt_pre`, then delegate to the kernel predictor and to every residual
        block (in that order).
        """
        nn.utils.weight_norm(self.convt_pre)
        for submodule in (self.kernel_predictor, *self.resblocks):
            submodule.apply_weight_norm()

    def remove_weight_norm(self):
        """
        Remove weight normalization from `convt_pre`, then delegate to the kernel predictor and to every residual
        block (in that order).
        """
        nn.utils.remove_weight_norm(self.convt_pre)
        for submodule in (self.kernel_predictor, *self.resblocks):
            submodule.remove_weight_norm()
# Opening docstring for `UnivNetModel`: describes what the class inherits from and documents its `config` parameter.
# It is passed to `add_start_docstrings` where the class is declared.
UNIVNET_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`UnivNetConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Inputs docstring for `UnivNetModel.forward`: documents every forward argument and its accepted shapes.
# It is passed to `add_start_docstrings_to_model_forward` on the `forward` method.
UNIVNET_INPUTS_DOCSTRING = r"""
    Converts a noise waveform and a conditioning spectrogram to a speech waveform. Passing a batch of log-mel
    spectrograms returns a batch of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a
    single, un-batched speech waveform.

    Args:
        input_features (`torch.FloatTensor`):
            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
            config.num_mel_channels)`, or un-batched and of shape `(sequence_length, config.num_mel_channels)`.
        noise_sequence (`torch.FloatTensor`, *optional*):
            Tensor containing a noise sequence of standard Gaussian noise. Can be batched and of shape `(batch_size,
            sequence_length, config.model_in_channels)`, or un-batched and of shape (sequence_length,
            config.model_in_channels)`. If not supplied, will be randomly generated.
        padding_mask (`torch.BoolTensor`, *optional*):
            Mask indicating which parts of each sequence are padded. Mask values are selected in `[0, 1]`:

            - 1 for tokens that are **not masked**
            - 0 for tokens that are **masked**

            The mask can be batched and of shape `(batch_size, sequence_length)` or un-batched and of shape
            `(sequence_length,)`.
        generator (`torch.Generator`, *optional*):
            A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
            deterministic.
        return_dict:
            Whether to return a [`~utils.ModelOutput`] subclass instead of a plain tuple.
"""

# The `add_start_docstrings` decorator receives the short description and UNIVNET_START_DOCSTRING for this class.
@add_start_docstrings(
    """UnivNet GAN vocoder.""",
    UNIVNET_START_DOCSTRING,
)
# UnivNet GAN vocoder model, declared as a `PreTrainedModel` subclass.
class UnivNetModel(PreTrainedModel):
    # Configuration class associated with this model.
    config_class = UnivNetConfig
    # Name of the model's main input argument.
    main_input_name = "input_features"
    def __init__(self, config: UnivNetConfig):
        """
        Build the vocoder layers: an input convolution, one `UnivNetLvcBlock` per entry of
        `config.resblock_stride_sizes`, and an output convolution producing a single channel.

        Args:
            config: model configuration the layer sizes are read from.
        """
        super().__init__(config)

        self.num_kernels = len(config.resblock_kernel_sizes)
        self.leaky_relu_slope = config.leaky_relu_slope

        # Input convolution: kernel size 7 with reflect padding of 3.
        self.conv_pre = nn.Conv1d(
            in_channels=config.model_in_channels,
            out_channels=config.model_hidden_channels,
            kernel_size=7,
            stride=1,
            padding=3,
            padding_mode="reflect",
        )

        # Initialize location-variable convolution ResNet Blocks.
        # The hop size of block i is the product of the strides of blocks 0..i.
        cumulative_hops = []
        running_hop = 1
        for stride in config.resblock_stride_sizes:
            running_hop = running_hop * stride
            cumulative_hops.append(running_hop)

        self.resblocks = nn.ModuleList(
            [
                UnivNetLvcBlock(config, layer_id=layer_idx, lvc_hop_size=hop)
                for layer_idx, hop in enumerate(cumulative_hops)
            ]
        )

        # Output convolution: collapses the hidden channels to one channel.
        self.conv_post = nn.Conv1d(config.model_hidden_channels, 1, 7, padding=3, padding_mode="reflect")

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(UNIVNET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=UnivNetModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_features: torch.FloatTensor,
        noise_sequence: Optional[torch.FloatTensor] = None,
        padding_mask: Optional[torch.FloatTensor] = None,
        generator: Optional[torch.Generator] = None,
        return_dict: Optional[bool] = None,
    ):
        # 正向传播方法，详细文档说明见装饰器函数

    def _init_weights(self, module):
        """
        Initialize the weights.

        Only `nn.Linear`, `nn.Conv1d` and `nn.ConvTranspose1d` modules are touched: their weight is drawn from a
        normal distribution with mean 0 and std `config.initializer_range`, and their bias (if any) is zeroed.
        """
        if not isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
            return

        init_std = self.config.initializer_range
        module.weight.data.normal_(mean=0.0, std=init_std)

        bias = module.bias
        if bias is not None:
            bias.data.zero_()

    def apply_weight_norm(self):
        """
        Apply weight normalization to `conv_pre`, to every block in `resblocks` (via its own
        `apply_weight_norm`), and to `conv_post`, in that order.
        """
        add_weight_norm = nn.utils.weight_norm

        add_weight_norm(self.conv_pre)
        for lvc_block in self.resblocks:
            lvc_block.apply_weight_norm()
        add_weight_norm(self.conv_post)

    def remove_weight_norm(self):
        """
        Remove weight normalization from `conv_pre`, from every block in `resblocks` (via its own
        `remove_weight_norm`), and from `conv_post`, in that order.
        """
        strip_weight_norm = nn.utils.remove_weight_norm

        strip_weight_norm(self.conv_pre)
        for lvc_block in self.resblocks:
            lvc_block.remove_weight_norm()
        strip_weight_norm(self.conv_post)

`.\models\univnet\__init__.py`

# 版权声明和许可证信息
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 引入必要的类型检查模块
from typing import TYPE_CHECKING

# 引入依赖的模块和异常处理类
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)

# Map each submodule name to the public names it provides; handed to `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_univnet": [
        "UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "UnivNetConfig",
    ],
    "feature_extraction_univnet": ["UnivNetFeatureExtractor"],
}

# The modeling names are only registered when torch is available; otherwise they are silently left out.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: register the names provided by the modeling submodule.
    _import_structure["modeling_univnet"] = [
        "UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST",
        "UnivNetModel",
    ]

# Under static type checking, import the real configuration, feature-extraction and modeling objects directly.
if TYPE_CHECKING:
    from .configuration_univnet import (
        UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
        UnivNetConfig,
    )
    from .feature_extraction_univnet import UnivNetFeatureExtractor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_univnet import (
            UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST,
            UnivNetModel,
        )

# At runtime (not type checking), install a `_LazyModule` in place of this module.
else:
    import sys

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from `_import_structure`
    # (presumably so that submodules are only imported on first attribute access).
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\upernet\configuration_upernet.py`

# coding=utf-8
# 版权所有 2022 年 HuggingFace Inc. 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本（“许可证”）授权；
# 除非遵守许可证，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件是基于“按现状”提供的，不附带任何明示或暗示的担保或条件。
# 请参阅许可证获取详细信息。
""" UperNet 模型配置"""


from ...configuration_utils import PretrainedConfig  # 导入预配置类
from ...utils import logging  # 导入日志工具
from ..auto.configuration_auto import CONFIG_MAPPING  # 导入自动配置映射


logger = logging.get_logger(__name__)  # module-level logger, named after this module


class UperNetConfig(PretrainedConfig):
    r"""
    Configuration class that stores the configuration of a [`UperNetForSemanticSegmentation`]. It is used to
    instantiate an UperNet model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults yields a configuration similar to that of the UperNet
    [openmmlab/upernet-convnext-tiny](https://huggingface.co/openmmlab/upernet-convnext-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        backbone_config (`PretrainedConfig` or `dict`, *optional*):
            Configuration of the backbone. A `dict` is converted to the config class registered in `CONFIG_MAPPING`
            under its `"model_type"` entry. When both `backbone_config` and `backbone` are `None`, a default ResNet
            config with `out_features=["stage1", "stage2", "stage3", "stage4"]` is used.
        backbone (*optional*):
            Backbone specifier, stored as-is. Mutually exclusive with `backbone_config`.
        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
            Whether to use a pretrained backbone. Not supported yet: passing a truthy value raises `ValueError`.
        use_timm_backbone (`bool`, *optional*, defaults to `False`):
            Whether to use a backbone from the timm library. Stored as-is.
        backbone_kwargs (*optional*):
            Extra arguments for the backbone. Cannot be combined with a backbone config (including the default
            ResNet config created when neither `backbone_config` nor `backbone` is given).
        hidden_size (`int`, *optional*, defaults to 512):
            Size of the hidden layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Range used for weight initialization.
        pool_scales (`List[int]`, *optional*, defaults to `[1, 2, 3, 6]`):
            Scales of the pooling operations.
        use_auxiliary_head (`bool`, *optional*, defaults to `True`):
            Whether to use an auxiliary head.
        auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
            Weight of the auxiliary loss.
        auxiliary_in_channels (`int`, *optional*, defaults to 384):
            Number of input channels of the auxiliary head.
        auxiliary_channels (`int`, *optional*, defaults to 256):
            Number of channels of the auxiliary head.
        auxiliary_num_convs (`int`, *optional*, defaults to 1):
            Number of convolutional layers of the auxiliary head.
        auxiliary_concat_input (`bool`, *optional*, defaults to `False`):
            Whether the auxiliary head concatenates its input.
        loss_ignore_index (`int`, *optional*, defaults to 255):
            Index ignored by the loss function.
        **kwargs:
            Forwarded to `PretrainedConfig.__init__`.

    Raises:
        ValueError: if `use_pretrained_backbone` is truthy, if both `backbone` and `backbone_config` are given, or
            if `backbone_kwargs` is given together with a backbone config.
    """

    model_type = "upernet"

    def __init__(
        self,
        backbone_config=None,
        backbone=None,
        use_pretrained_backbone=False,
        use_timm_backbone=False,
        backbone_kwargs=None,
        hidden_size=512,
        initializer_range=0.02,
        pool_scales=[1, 2, 3, 6],
        use_auxiliary_head=True,
        auxiliary_loss_weight=0.4,
        auxiliary_in_channels=384,
        auxiliary_channels=256,
        auxiliary_num_convs=1,
        auxiliary_concat_input=False,
        loss_ignore_index=255,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if use_pretrained_backbone:
            raise ValueError("Pretrained backbones are not supported yet.")

        if backbone_config is not None and backbone is not None:
            raise ValueError("You can't specify both `backbone` and `backbone_config`.")

        if backbone_config is None and backbone is None:
            # Nothing was specified: fall back to the default ResNet backbone.
            logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
            backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage1", "stage2", "stage3", "stage4"])
        elif isinstance(backbone_config, dict):
            # A plain dict is turned into the config class registered for its model type.
            backbone_model_type = backbone_config.get("model_type")
            config_class = CONFIG_MAPPING[backbone_model_type]
            backbone_config = config_class.from_dict(backbone_config)

        # Checked after the defaulting above, so the default ResNet config also counts as a backbone config here.
        if backbone_kwargs is not None and backbone_config is not None:
            raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")

        self.backbone_config = backbone_config
        self.backbone = backbone
        self.use_pretrained_backbone = use_pretrained_backbone
        self.use_timm_backbone = use_timm_backbone
        self.backbone_kwargs = backbone_kwargs
        self.hidden_size = hidden_size
        self.initializer_range = initializer_range
        # Store a copy of list values: the signature default is a single list object shared by every call, so
        # keeping a reference would let `config.pool_scales.append(...)` change the default of all later configs.
        self.pool_scales = list(pool_scales) if isinstance(pool_scales, list) else pool_scales
        self.use_auxiliary_head = use_auxiliary_head
        self.auxiliary_loss_weight = auxiliary_loss_weight
        self.auxiliary_in_channels = auxiliary_in_channels
        self.auxiliary_channels = auxiliary_channels
        self.auxiliary_num_convs = auxiliary_num_convs
        self.auxiliary_concat_input = auxiliary_concat_input
        self.loss_ignore_index = loss_ignore_index

`.\models\upernet\convert_convnext_upernet_to_pytorch.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert ConvNext + UperNet checkpoints from mmsegmentation."""

import argparse  # 导入命令行参数解析模块
import json  # 导入处理 JSON 数据的模块

import requests  # 导入处理 HTTP 请求的模块
import torch  # 导入 PyTorch 深度学习框架
from huggingface_hub import hf_hub_download  # 导入从 Hugging Face Hub 下载资源的函数
from PIL import Image  # 导入处理图像的 PIL 库

from transformers import ConvNextConfig, SegformerImageProcessor, UperNetConfig, UperNetForSemanticSegmentation  # 导入模型配置和语义分割相关的类


def get_upernet_config(model_name):
    """
    Build the `UperNetConfig` (with a ConvNext backbone) matching a checkpoint name.

    The backbone size is selected by looking for "tiny", "small", "base", "large" and "xlarge" in `model_name`;
    when several match (e.g. "xlarge" also contains "large"), the one listed last wins. The ADE20k `id2label`
    mapping is downloaded from the `huggingface/label-files` dataset repository on the Hub.

    Args:
        model_name (`str`): checkpoint name containing one of the supported size identifiers.

    Returns:
        `UperNetConfig`: configuration with 150 labels and the selected ConvNext backbone.

    Raises:
        ValueError: if `model_name` contains none of the supported size identifiers.
    """
    # (identifier, depths, hidden_sizes, auxiliary_in_channels), in the order the identifiers are checked.
    size_presets = (
        ("tiny", [3, 3, 9, 3], [96, 192, 384, 768], 384),
        ("small", [3, 3, 27, 3], [96, 192, 384, 768], 384),
        ("base", [3, 3, 27, 3], [128, 256, 512, 1024], 512),
        ("large", [3, 3, 27, 3], [192, 384, 768, 1536], 768),
        ("xlarge", [3, 3, 27, 3], [256, 512, 1024, 2048], 1024),
    )
    matching_presets = [preset for preset in size_presets if preset[0] in model_name]
    if not matching_presets:
        supported = ", ".join(preset[0] for preset in size_presets)
        raise ValueError(f"Unsupported model name {model_name!r}: it must contain one of: {supported}.")
    # Later entries take precedence, so "xlarge" is not mistaken for "large".
    _, depths, hidden_sizes, auxiliary_in_channels = matching_presets[-1]

    # set label information
    num_labels = 150
    repo_id = "huggingface/label-files"
    filename = "ade20k-id2label.json"
    label_file_path = hf_hub_download(repo_id, filename, repo_type="dataset")
    # Use a context manager so the label file is closed deterministically.
    with open(label_file_path, "r", encoding="utf-8") as label_file:
        id2label = json.load(label_file)
    id2label = {int(k): v for k, v in id2label.items()}  # JSON object keys are strings
    label2id = {v: k for k, v in id2label.items()}

    backbone_config = ConvNextConfig(
        depths=depths, hidden_sizes=hidden_sizes, out_features=["stage1", "stage2", "stage3", "stage4"]
    )
    config = UperNetConfig(
        backbone_config=backbone_config,
        auxiliary_in_channels=auxiliary_in_channels,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    return config


# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
    """
    List the `(original_key, converted_key)` pairs for the ConvNext backbone and the segmentation heads.

    Args:
        config: object whose `backbone_config.depths` gives the number of layers of each backbone stage.

    Returns:
        `list` of 2-tuples, ordered as: stem, then for every stage its layers, its downsampling layer (all stages
        but the first) and its output norm, and finally the decode/auxiliary head classifiers.
    """
    # fmt: off
    # stem
    rename_keys = [
        ("backbone.downsample_layers.0.0.weight", "backbone.embeddings.patch_embeddings.weight"),
        ("backbone.downsample_layers.0.0.bias", "backbone.embeddings.patch_embeddings.bias"),
        ("backbone.downsample_layers.0.1.weight", "backbone.embeddings.layernorm.weight"),
        ("backbone.downsample_layers.0.1.bias", "backbone.embeddings.layernorm.bias"),
    ]

    # (original suffix, converted suffix) of every parameter of one backbone layer, in emission order
    layer_param_names = (
        ("gamma", "layer_scale_parameter"),
        ("depthwise_conv.weight", "dwconv.weight"),
        ("depthwise_conv.bias", "dwconv.bias"),
        ("norm.weight", "layernorm.weight"),
        ("norm.bias", "layernorm.bias"),
        ("pointwise_conv1.weight", "pwconv1.weight"),
        ("pointwise_conv1.bias", "pwconv1.bias"),
        ("pointwise_conv2.weight", "pwconv2.weight"),
        ("pointwise_conv2.bias", "pwconv2.bias"),
    )

    # stages
    for stage_idx, depth in enumerate(config.backbone_config.depths):
        converted_stage = f"backbone.encoder.stages.{stage_idx}"

        for layer_idx in range(depth):
            original_layer = f"backbone.stages.{stage_idx}.{layer_idx}"
            converted_layer = f"{converted_stage}.layers.{layer_idx}"
            rename_keys.extend(
                (f"{original_layer}.{original}", f"{converted_layer}.{converted}")
                for original, converted in layer_param_names
            )

        # every stage except the first one has a downsampling layer
        if stage_idx > 0:
            for param in ("0.weight", "0.bias", "1.weight", "1.bias"):
                rename_keys.append(
                    (f"backbone.downsample_layers.{stage_idx}.{param}", f"{converted_stage}.downsampling_layer.{param}")
                )

        # per-stage output norm; converted stage names are 1-based
        for param in ("weight", "bias"):
            rename_keys.append(
                (f"backbone.norm{stage_idx}.{param}", f"backbone.hidden_states_norms.stage{stage_idx + 1}.{param}")
            )

    # decode head
    rename_keys += [
        ("decode_head.conv_seg.weight", "decode_head.classifier.weight"),
        ("decode_head.conv_seg.bias", "decode_head.classifier.bias"),
        ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"),
        ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"),
    ]
    # fmt: on

    return rename_keys
# Move the value stored under `old` to the key `new` inside `dct` (in place).
def rename_key(dct, old, new):
    """
    Rename a dictionary key in place, keeping its value.

    Args:
        dct: dictionary to modify.
        old: existing key; a `KeyError` is raised if it is missing.
        new: key the value is stored under afterwards.
    """
    dct[new] = dct.pop(old)


# Download an mmsegmentation ConvNext + UperNet checkpoint, convert it, verify it, and optionally save/push it.
def convert_upernet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
    """
    Convert an original ConvNext + UperNet checkpoint to a `UperNetForSemanticSegmentation` model.

    The original state dict is downloaded, its keys are renamed to the names used by this library, the weights are
    loaded into a freshly built model, and the logits on a test image are compared against reference values.

    Args:
        model_name (`str`): one of the keys of `model_name_to_url` below.
        pytorch_dump_folder_path (`str` or `None`): directory to save the model and processor to; skipped if `None`.
        push_to_hub (`bool`): whether to push the model and processor to `openmmlab/{model_name}` on the Hub.

    Raises:
        KeyError: if `model_name` is not a supported checkpoint name.
        ValueError: if the logits of the converted model do not match the reference values.
    """
    model_name_to_url = {
        "upernet-convnext-tiny": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k/upernet_convnext_tiny_fp16_512x512_160k_ade20k_20220227_124553-cad485de.pth",
        "upernet-convnext-small": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k/upernet_convnext_small_fp16_512x512_160k_ade20k_20220227_131208-1b1e394f.pth",
        "upernet-convnext-base": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k/upernet_convnext_base_fp16_512x512_160k_ade20k_20220227_181227-02a24fc6.pth",
        "upernet-convnext-large": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k/upernet_convnext_large_fp16_640x640_160k_ade20k_20220226_040532-e57aa54d.pth",
        "upernet-convnext-xlarge": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k/upernet_convnext_xlarge_fp16_640x640_160k_ade20k_20220226_080344-95fc38c2.pth",
    }
    # Reference values for `logits[0, 0, :3, :3]` on the test image, per checkpoint.
    expected_slices = {
        "upernet-convnext-tiny": [
            [-8.8110, -8.8110, -8.6521], [-8.8110, -8.8110, -8.6521], [-8.7746, -8.7746, -8.6130]
        ],
        "upernet-convnext-small": [
            [-8.8236, -8.8236, -8.6771], [-8.8236, -8.8236, -8.6771], [-8.7638, -8.7638, -8.6240]
        ],
        "upernet-convnext-base": [
            [-8.8558, -8.8558, -8.6905], [-8.8558, -8.8558, -8.6905], [-8.7669, -8.7669, -8.6021]
        ],
        "upernet-convnext-large": [
            [-8.6660, -8.6660, -8.6210], [-8.6660, -8.6660, -8.6210], [-8.6310, -8.6310, -8.5964]
        ],
        "upernet-convnext-xlarge": [
            [-8.4980, -8.4980, -8.3977], [-8.4980, -8.4980, -8.3977], [-8.4379, -8.4379, -8.3412]
        ],
    }

    checkpoint_url = model_name_to_url[model_name]
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"]

    config = get_upernet_config(model_name)
    model = UperNetForSemanticSegmentation(config)
    model.eval()

    # replace "bn" => "batch_norm" in every key (iterate over a snapshot because the dict is modified in place)
    for key in list(state_dict):
        state_dict[key.replace("bn", "batch_norm")] = state_dict.pop(key)

    # rename the remaining keys to the names used by this library
    for src, dest in create_rename_keys(config):
        rename_key(state_dict, src, dest)

    model.load_state_dict(state_dict)

    # verify on a test image
    url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_ade20k/resolve/main/ADE_val_00000001.jpg"
    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

    processor = SegformerImageProcessor()
    pixel_values = processor(image, return_tensors="pt").pixel_values

    with torch.no_grad():
        outputs = model(pixel_values)

    expected_slice = torch.tensor(expected_slices[model_name])
    logits_slice = outputs.logits[0, 0, :3, :3]
    print("Logits:", logits_slice)
    # Explicit check instead of `assert`: assertions are removed under `python -O`, which would let an
    # unverified conversion be saved or pushed.
    if not torch.allclose(logits_slice, expected_slice, atol=1e-4):
        raise ValueError(
            f"Logits of the converted {model_name} model do not match the expected values "
            f"(expected {expected_slice.tolist()}, got {logits_slice.tolist()})."
        )
    print("Looks ok!")

    if pytorch_dump_folder_path is not None:
        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        print(f"Saving processor to {pytorch_dump_folder_path}")
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print(f"Pushing model and processor for {model_name} to hub")
        model.push_to_hub(f"openmmlab/{model_name}")
        processor.push_to_hub(f"openmmlab/{model_name}")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the conversion.
    supported_sizes = ("tiny", "small", "base", "large", "xlarge")

    parser = argparse.ArgumentParser()

    # Checkpoint to convert (optional, defaults to the tiny variant).
    parser.add_argument(
        "--model_name",
        type=str,
        default="upernet-convnext-tiny",
        choices=[f"upernet-convnext-{size}" for size in supported_sizes],
        help="Name of the ConvNext UperNet model you'd like to convert.",
    )
    # Output directory; nothing is saved locally when it is omitted.
    parser.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default=None,
        help="Path to the output PyTorch model directory.",
    )
    # Flag enabling the upload of the converted model.
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether or not to push the converted model to the 🤗 hub.",
    )

    args = parser.parse_args()

    convert_upernet_checkpoint(
        model_name=args.model_name,
        pytorch_dump_folder_path=args.pytorch_dump_folder_path,
        push_to_hub=args.push_to_hub,
    )

`.\models\upernet\convert_swin_upernet_to_pytorch.py`

# coding=utf-8
# 设置脚本编码格式为UTF-8

# Copyright 2022 The HuggingFace Inc. team.
# 版权声明，版权归HuggingFace Inc.团队所有。

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据Apache License 2.0许可证授权使用本代码

# you may not use this file except in compliance with the License.
# 除非符合许可证要求，否则不得使用此文件

# You may obtain a copy of the License at
# 可以在以下网址获取许可证副本

#     http://www.apache.org/licenses/LICENSE-2.0
#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非法律要求或书面同意，否则依据“原样”分发此软件

# See the License for the specific language governing permissions and
# limitations under the License.
# 请参阅许可证了解特定语言的权限和限制

"""Convert Swin Transformer + UperNet checkpoints from mmsegmentation.

从mmsegmentation转换Swin Transformer + UperNet检查点。

URL: https://github.com/open-mmlab/mmsegmentation/tree/master/configs/swin
"""

import argparse  # 导入命令行参数解析模块
import json  # 导入JSON操作模块

import requests  # 导入HTTP请求库
import torch  # 导入PyTorch深度学习框架
from huggingface_hub import hf_hub_download  # 从HuggingFace Hub下载模块导入函数
from PIL import Image  # 导入Python Imaging Library (PIL)中的Image模块

from transformers import SegformerImageProcessor, SwinConfig, UperNetConfig, UperNetForSemanticSegmentation  # 导入transformers库中的类和函数


def get_upernet_config(model_name):
    """Build the `UperNetConfig` with a Swin backbone matching `model_name`.

    The backbone size is selected by looking for "tiny", "small", "base" or "large" in `model_name` (checked in
    that order). The label mapping is read from the `ade20k-id2label.json` file of the `huggingface/label-files`
    dataset repository.

    Args:
        model_name (str): name of the checkpoint to convert, e.g. "upernet-swin-tiny".

    Returns:
        UperNetConfig: configuration holding the Swin backbone config and the 150-entry id/label mappings.

    Raises:
        ValueError: if `model_name` contains none of the known size keywords.
    """
    # defaults shared by the "tiny" and "small" variants; "base" and "large" override them below
    auxiliary_in_channels = 384
    window_size = 7
    if "tiny" in model_name:
        embed_dim = 96
        depths = (2, 2, 6, 2)
        num_heads = (3, 6, 12, 24)
    elif "small" in model_name:
        embed_dim = 96
        depths = (2, 2, 18, 2)
        num_heads = (3, 6, 12, 24)
    elif "base" in model_name:
        embed_dim = 128
        depths = (2, 2, 18, 2)
        num_heads = (4, 8, 16, 32)
        window_size = 12
        auxiliary_in_channels = 512
    elif "large" in model_name:
        embed_dim = 192
        depths = (2, 2, 18, 2)
        num_heads = (6, 12, 24, 48)
        window_size = 12
        auxiliary_in_channels = 768
    else:
        # fail before any download instead of hitting an unbound local further down
        raise ValueError(
            f"Unsupported model name: {model_name!r} (expected one containing 'tiny', 'small', 'base' or 'large')"
        )

    # set label information
    num_labels = 150
    repo_id = "huggingface/label-files"
    filename = "ade20k-id2label.json"

    # download the label file and make sure the file handle is closed once it has been parsed
    with open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r") as label_file:
        id2label = json.load(label_file)
    id2label = {int(k): v for k, v in id2label.items()}  # JSON object keys are strings
    label2id = {v: k for k, v in id2label.items()}

    backbone_config = SwinConfig(
        embed_dim=embed_dim,
        depths=depths,
        num_heads=num_heads,
        window_size=window_size,
        out_features=["stage1", "stage2", "stage3", "stage4"],
    )

    config = UperNetConfig(
        backbone_config=backbone_config,
        auxiliary_in_channels=auxiliary_in_channels,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    return config


# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
    """Return the `(original_key, new_key)` pairs used to rename the checkpoint's state dict entries.

    Args:
        config: object exposing `backbone_config.depths`, the number of blocks in each backbone stage.

    Returns:
        list of `(str, str)` tuples, ordered as: stem, then each stage (its blocks, its downsample layer if any,
        its output norm), then the decode/auxiliary head classifiers.
    """
    depths = config.backbone_config.depths
    num_stages = len(depths)
    rename_keys = []

    # fmt: off
    # stem
    rename_keys.append(("backbone.patch_embed.projection.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
    rename_keys.append(("backbone.patch_embed.projection.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
    rename_keys.append(("backbone.patch_embed.norm.weight", "backbone.embeddings.norm.weight"))
    rename_keys.append(("backbone.patch_embed.norm.bias", "backbone.embeddings.norm.bias"))

    # stages
    for i, depth in enumerate(depths):
        for j in range(depth):
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.norm1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.weight"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.norm1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.bias"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.attn.w_msa.relative_position_bias_table", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.attn.w_msa.relative_position_index", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.attn.w_msa.proj.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.attn.w_msa.proj.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.norm2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.weight"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.norm2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.bias"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.ffn.layers.0.0.weight", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.ffn.layers.0.0.bias", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.ffn.layers.1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.weight"))
            rename_keys.append((f"backbone.stages.{i}.blocks.{j}.ffn.layers.1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.bias"))

        # every stage except the last one carries downsample weights (this was `i < 3` for the 4-stage backbones)
        if i < num_stages - 1:
            rename_keys.append((f"backbone.stages.{i}.downsample.reduction.weight", f"backbone.encoder.layers.{i}.downsample.reduction.weight"))
            rename_keys.append((f"backbone.stages.{i}.downsample.norm.weight", f"backbone.encoder.layers.{i}.downsample.norm.weight"))
            rename_keys.append((f"backbone.stages.{i}.downsample.norm.bias", f"backbone.encoder.layers.{i}.downsample.norm.bias"))

        # per-stage output norm: the original index is 0-based, the new stage name is 1-based
        rename_keys.append((f"backbone.norm{i}.weight", f"backbone.hidden_states_norms.stage{i+1}.weight"))
        rename_keys.append((f"backbone.norm{i}.bias", f"backbone.hidden_states_norms.stage{i+1}.bias"))

    # decode head
    rename_keys.extend(
        [
            ("decode_head.conv_seg.weight", "decode_head.classifier.weight"),
            ("decode_head.conv_seg.bias", "decode_head.classifier.bias"),
            ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"),
            ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"),
        ]
    )
    # fmt: on

    return rename_keys


def rename_key(dct, old, new):
    """Move the value stored under `old` to the key `new`, modifying `dct` in place.

    Raises `KeyError` (leaving `dct` untouched) when `old` is not present.
    """
    dct[new] = dct.pop(old)


def read_in_q_k_v(state_dict, backbone_config):
    """Split each fused qkv projection of the backbone into separate query, key and value entries, in place.

    For every block `j` of every stage `i`, the `...attn.w_msa.qkv.weight` / `.bias` entries are removed from
    `state_dict` and replaced by `...attention.self.{query,key,value}.{weight,bias}` entries. The first `dim` rows
    become the query, the next `dim` rows the key and the last `dim` rows the value, where
    `dim = int(embed_dim * 2**i)`.

    Args:
        state_dict: mutable mapping from parameter names to tensors.
        backbone_config: object exposing `embed_dim` and `depths`.
    """
    for stage, num_blocks in enumerate(backbone_config.depths):
        dim = int(backbone_config.embed_dim * 2**stage)
        query_rows = slice(None, dim)
        key_rows = slice(dim, dim * 2)
        value_rows = slice(-dim, None)
        for block in range(num_blocks):
            # fmt: off
            # the original implementation stores q, k and v as one stacked matrix plus one stacked bias
            fused_weight = state_dict.pop(f"backbone.stages.{stage}.blocks.{block}.attn.w_msa.qkv.weight")
            fused_bias = state_dict.pop(f"backbone.stages.{stage}.blocks.{block}.attn.w_msa.qkv.bias")
            # insertion order: query, key, value (weight before bias)
            state_dict[f"backbone.encoder.layers.{stage}.blocks.{block}.attention.self.query.weight"] = fused_weight[query_rows, :]
            state_dict[f"backbone.encoder.layers.{stage}.blocks.{block}.attention.self.query.bias"] = fused_bias[query_rows]
            state_dict[f"backbone.encoder.layers.{stage}.blocks.{block}.attention.self.key.weight"] = fused_weight[key_rows, :]
            state_dict[f"backbone.encoder.layers.{stage}.blocks.{block}.attention.self.key.bias"] = fused_bias[key_rows]
            state_dict[f"backbone.encoder.layers.{stage}.blocks.{block}.attention.self.value.weight"] = fused_weight[value_rows, :]
            state_dict[f"backbone.encoder.layers.{stage}.blocks.{block}.attention.self.value.bias"] = fused_bias[value_rows]
            # fmt: on


def correct_unfold_reduction_order(x):
    """Permute the columns of a 2D tensor of shape `(out_channel, in_channel)`.

    Each row is viewed as 4 consecutive groups of `in_channel // 4` values; the two middle groups are swapped and
    the groups are then interleaved (group index becomes the fastest-varying position). The shape is preserved.
    """
    rows, cols = x.shape
    grouped = x.reshape(rows, 4, cols // 4)
    swapped = grouped[:, [0, 2, 1, 3], :]
    return swapped.permute(0, 2, 1).reshape(rows, cols)


def reverse_correct_unfold_reduction_order(x):
    """Permute the columns of a 2D tensor of shape `(out_channel, in_channel)`.

    Each row is viewed as `in_channel // 4` consecutive groups of 4 values; within every group the two middle
    values are swapped, and the result is then laid out position-major (all first values, then all second
    values, ...). The shape is preserved.
    """
    rows, cols = x.shape
    grouped = x.reshape(rows, cols // 4, 4)
    swapped = grouped[:, :, [0, 2, 1, 3]]
    return swapped.permute(0, 2, 1).reshape(rows, cols)


def correct_unfold_norm_order(x):
    """Permute the entries of a 1D tensor of length `in_channel`.

    The tensor is viewed as 4 consecutive groups of `in_channel // 4` values; the two middle groups are swapped
    and the groups are then interleaved. The length is preserved.
    """
    length = x.shape[0]
    grouped = x.reshape(4, length // 4)
    swapped = grouped[[0, 2, 1, 3], :]
    return swapped.t().reshape(length)


def reverse_correct_unfold_norm_order(x):
    """Permute the entries of a 1D tensor of length `in_channel`.

    The tensor is viewed as `in_channel // 4` consecutive groups of 4 values; within every group the two middle
    values are swapped, and the result is then laid out position-major. The length is preserved.
    """
    length = x.shape[0]
    grouped = x.reshape(length // 4, 4)
    swapped = grouped[:, [0, 2, 1, 3]]
    return swapped.t().reshape(length)


# 在这个版本中，由于使用了nn.Unfold实现的新的下采样操作，出现了不兼容性。
# 问题已在以下链接中得到解决：https://github.com/open-mmlab/mmdetection/blob/31c84958f54287a8be2b99cbf87a6dcf12e57753/mmdet/models/utils/ckpt_convert.py#L96。
def convert_upernet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
    """Download an mmsegmentation Swin + UperNet checkpoint, convert it and check it against reference logits.

    Steps: load the original state dict, rename its keys, split the fused qkv projections, reorder the
    downsample weights, load everything into a fresh `UperNetForSemanticSegmentation`, run it on a test image
    and compare a 3x3 slice of the logits with stored reference values.

    Args:
        model_name (str): one of "upernet-swin-tiny", "upernet-swin-small", "upernet-swin-base",
            "upernet-swin-large".
        pytorch_dump_folder_path (str or None): if not None, the model and image processor are saved there.
        push_to_hub (bool): if true, the model and image processor are pushed to `openmmlab/{model_name}`.

    Raises:
        KeyError: if `model_name` is not one of the supported names.
        AssertionError: if the logits slice differs from the reference values by more than `atol=1e-4`.
    """
    # location of the original checkpoint for every supported model
    model_name_to_url = {
        "upernet-swin-tiny": "https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542-e380ad3e.pth",
        "upernet-swin-small": "https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192015-ee2fff1c.pth",
        "upernet-swin-base": "https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K_20210531_125459-429057bf.pth",
        "upernet-swin-large": "https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_large_patch4_window12_512x512_pretrain_384x384_22K_160k_ade20k/upernet_swin_large_patch4_window12_512x512_pretrain_384x384_22K_160k_ade20k_20220318_091743-9ba68901.pth",
    }
    # reference values for logits[0, 0, :3, :3] of every supported model
    expected_logits = {
        "upernet-swin-tiny": [
            [-7.5958, -7.5958, -7.4302],
            [-7.5958, -7.5958, -7.4302],
            [-7.4797, -7.4797, -7.3068],
        ],
        "upernet-swin-small": [
            [-7.1921, -7.1921, -6.9532],
            [-7.1921, -7.1921, -6.9532],
            [-7.0908, -7.0908, -6.8534],
        ],
        "upernet-swin-base": [
            [-6.5851, -6.5851, -6.4330],
            [-6.5851, -6.5851, -6.4330],
            [-6.4763, -6.4763, -6.3254],
        ],
        "upernet-swin-large": [
            [-7.5297, -7.5297, -7.3802],
            [-7.5297, -7.5297, -7.3802],
            [-7.4044, -7.4044, -7.2586],
        ],
    }

    # load the original state dict on CPU
    checkpoint_url = model_name_to_url[model_name]
    checkpoint = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", file_name=model_name)
    state_dict = checkpoint["state_dict"]

    for name, param in state_dict.items():
        print(name, param.shape)

    # instantiate the target model in evaluation mode
    config = get_upernet_config(model_name)
    model = UperNetForSemanticSegmentation(config)
    model.eval()

    # replace "bn" => "batch_norm"; every key is re-inserted so the original ordering is kept
    for original_key in list(state_dict.keys()):
        tensor = state_dict.pop(original_key)
        state_dict[original_key.replace("bn", "batch_norm")] = tensor

    # rename keys
    for src, dest in create_rename_keys(config):
        rename_key(state_dict, src, dest)

    # split the fused qkv projections
    read_in_q_k_v(state_dict, config.backbone_config)

    # fix the ordering of the downsample parameters (only values of existing keys are replaced while iterating)
    for key, value in state_dict.items():
        if "downsample" not in key:
            continue
        if "reduction" in key:
            state_dict[key] = reverse_correct_unfold_reduction_order(value)
        if "norm" in key:
            state_dict[key] = reverse_correct_unfold_norm_order(value)

    model.load_state_dict(state_dict)

    # verify on a test image
    url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_ade20k/resolve/main/ADE_val_00000001.jpg"
    response = requests.get(url, stream=True)
    image = Image.open(response.raw).convert("RGB")

    processor = SegformerImageProcessor()
    pixel_values = processor(image, return_tensors="pt").pixel_values

    with torch.no_grad():
        outputs = model(pixel_values)
        logits = outputs.logits

    print(logits.shape)
    print("First values of logits:", logits[0, 0, :3, :3])

    expected_slice = torch.tensor(expected_logits[model_name])
    actual_slice = outputs.logits[0, 0, :3, :3]
    print("Logits:", actual_slice)
    assert torch.allclose(actual_slice, expected_slice, atol=1e-4)
    print("Looks ok!")

    if pytorch_dump_folder_path is not None:
        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        print(f"Saving processor to {pytorch_dump_folder_path}")
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        print(f"Pushing model and processor for {model_name} to hub")
        model.push_to_hub(f"openmmlab/{model_name}")
        processor.push_to_hub(f"openmmlab/{model_name}")
if __name__ == "__main__":
    # Command-line entry point: read the conversion options and run the conversion.
    supported_models = [f"upernet-swin-{size}" for size in ["tiny", "small", "base", "large"]]

    cli = argparse.ArgumentParser()
    # which checkpoint to convert (restricted to the supported names)
    cli.add_argument(
        "--model_name",
        default="upernet-swin-tiny",
        type=str,
        choices=supported_models,
        help="Name of the Swin + UperNet model you'd like to convert.",
    )
    # optional output directory for the converted model
    cli.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model directory.",
    )
    # flag: also upload the converted model
    cli.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether or not to push the converted model to the 🤗 hub.",
    )

    options = cli.parse_args()
    convert_upernet_checkpoint(
        options.model_name,
        options.pytorch_dump_folder_path,
        options.push_to_hub,
    )

`.\models\upernet\modeling_upernet.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch UperNet model. Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation."""

from typing import List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...modeling_outputs import SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...utils.backbone_utils import load_backbone
from .configuration_upernet import UperNetConfig


UPERNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "openmmlab/upernet-convnext-tiny",
    # See all UperNet models at https://huggingface.co/models?filter=upernet
]

# General docstring
_CONFIG_FOR_DOC = "UperNetConfig"


class UperNetConvModule(nn.Module):
    """
    Convolution, batch normalization and ReLU bundled into one block, applied in that order.

    Args:
        in_channels (`int`): number of input channels of the convolution.
        out_channels (`int`): number of output channels of the convolution (and of the batch norm).
        kernel_size (`int` or `Tuple[int, int]`): kernel size of the convolution.
        padding (`int`, `Tuple[int, int]` or `str`, defaults to 0): padding of the convolution.
        bias (`bool`, defaults to `False`): whether the convolution has a bias term.
        dilation (`int` or `Tuple[int, int]`, defaults to 1): dilation of the convolution.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        # sub-modules are registered in the order conv -> batch_norm -> activation
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        self.batch_norm = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply convolution, then batch normalization, then ReLU to `input`."""
        return self.activation(self.batch_norm(self.conv(input)))


class UperNetPyramidPoolingBlock(nn.Module):
    """Adaptive average pooling to `pool_scale` followed by a 1x1 `UperNetConvModule`."""

    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        pooling = nn.AdaptiveAvgPool2d(pool_scale)
        projection = UperNetConvModule(in_channels, channels, kernel_size=1)
        # The layers live in a plain list and are registered under the names "0" and "1".
        self.layers = [pooling, projection]
        self.add_module("0", pooling)
        self.add_module("1", projection)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Run `input` through the layers one after the other and return the result."""
        output = input
        for stage in self.layers:
            output = stage(output)
        return output
class UperNetPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (`Tuple[int]`):
            Pooling scales used in Pooling Pyramid Module.
        in_channels (`int`):
            Input channels.
        channels (`int`):
            Channels after modules, before conv_seg.
        align_corners (`bool`):
            align_corners argument of F.interpolate.
    """

    def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        # one pooling block per scale, kept in a plain list and registered under its index ("0", "1", ...)
        self.blocks = [
            UperNetPyramidPoolingBlock(pool_scale=scale, in_channels=in_channels, channels=channels)
            for scale in pool_scales
        ]
        for index, block in enumerate(self.blocks):
            self.add_module(str(index), block)

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        """Return one tensor per pooling scale, each bilinearly resized to the spatial size of `x`."""
        target_size = x.size()[2:]
        return [
            nn.functional.interpolate(
                block(x), size=target_size, mode="bilinear", align_corners=self.align_corners
            )
            for block in self.blocks
        ]


class UperNetHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).

    Args:
        config:
            Configuration. The `pool_scales`, `hidden_size`, `num_labels` and `initializer_range` attributes are
            read.
        in_channels:
            Sequence with the number of channels of each input feature map; the last entry feeds the PSP module,
            the others each get a lateral and an FPN convolution.
    """

    def __init__(self, config, in_channels):
        super().__init__()

        self.config = config
        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = in_channels
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP module, applied to the last feature map only
        top_channels = self.in_channels[-1]
        self.psp_modules = UperNetPyramidPoolingModule(
            self.pool_scales,
            top_channels,
            self.channels,
            align_corners=self.align_corners,
        )
        # fuses the last feature map with one pooled map per scale
        self.bottleneck = UperNetConvModule(
            top_channels + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

        # FPN module: one lateral (1x1) and one output (3x3) convolution per level, skipping the top level
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for level_channels in self.in_channels[:-1]:
            lateral = UperNetConvModule(level_channels, self.channels, kernel_size=1)
            smoothing = UperNetConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(lateral)
            self.fpn_convs.append(smoothing)

        # fuses the concatenation of all levels
        self.fpn_bottleneck = UperNetConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def init_weights(self):
        """Apply `_init_weights` to this module and all of its sub-modules."""
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize `nn.Conv2d` weights from N(0, `initializer_range`) and zero their bias, if any."""
        if not isinstance(module, nn.Conv2d):
            return
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()

    def psp_forward(self, inputs):
        """Concatenate the last feature map with its pooled versions and pass the result through the bottleneck."""
        top = inputs[-1]
        stacked = torch.cat([top, *self.psp_modules(top)], dim=1)
        return self.bottleneck(stacked)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        """Compute the segmentation logits from the backbone feature maps in `encoder_hidden_states`."""
        # build laterals: one per lower level, plus the PSP output for the top level
        laterals = [
            self.lateral_convs[level](encoder_hidden_states[level]) for level in range(len(self.lateral_convs))
        ]
        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path: add each level, resized, to the level below it
        num_levels = len(laterals)
        for level in range(num_levels - 1, 0, -1):
            below = laterals[level - 1]
            resized = nn.functional.interpolate(
                laterals[level], size=below.shape[2:], mode="bilinear", align_corners=self.align_corners
            )
            laterals[level - 1] = below + resized

        # build outputs: FPN convolution on every level but the top one, which is used as is
        fpn_outs = [self.fpn_convs[level](laterals[level]) for level in range(num_levels - 1)]
        fpn_outs.append(laterals[-1])

        # resize every level to the spatial size of the first one
        target_size = fpn_outs[0].shape[2:]
        for level in range(num_levels - 1, 0, -1):
            fpn_outs[level] = nn.functional.interpolate(
                fpn_outs[level], size=target_size, mode="bilinear", align_corners=self.align_corners
            )

        fused = self.fpn_bottleneck(torch.cat(fpn_outs, dim=1))
        return self.classifier(fused)
class UperNetFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://arxiv.org/abs/1411.4038).

    Args:
        config:
            Configuration. The `auxiliary_in_channels`, `auxiliary_channels`, `auxiliary_num_convs`,
            `auxiliary_concat_input`, `num_labels` and `initializer_range` attributes are read.
        in_index (int):
            Index of the feature map (in the sequence passed to `forward`) that this head consumes. Default: 2.
        kernel_size (int):
            The kernel size for convs in the head. Default: 3.
        dilation (int):
            The dilation rate for convs in the head. Default: 1.
    """

    def __init__(
        self, config, in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1
    ) -> None:
        super().__init__()

        self.config = config
        self.in_channels = config.auxiliary_in_channels
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        # padding chosen so that the convolutions keep the spatial size
        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            UperNetConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for _ in range(self.num_convs - 1):
            convs.append(
                UperNetConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            # no convolution requested: the selected feature map goes straight to the classifier
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            # fuses the head input with the output of `self.convs`
            self.conv_cat = UperNetConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def init_weights(self):
        """Apply `_init_weights` to this module and all of its sub-modules."""
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize `nn.Conv2d` weights from N(0, `initializer_range`) and zero their bias, if any."""
        if isinstance(module, nn.Conv2d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        """Compute the auxiliary segmentation logits from the feature map at index `self.in_index`."""
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output


# NOTE(review): the class header and the two class attributes below were missing from this copy, which made the
# following `_init_weights`/`init_weights` pair override the methods of `UperNetFCNHead` and left
# `UperNetPreTrainedModel` undefined. They are restored here following the upstream Transformers source — confirm.
class UperNetPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = UperNetConfig
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """Initialize the backbone and the head(s) of `module` when it is a `UperNetPreTrainedModel`."""
        if isinstance(module, UperNetPreTrainedModel):
            module.backbone.init_weights()
            module.decode_head.init_weights()
            # the auxiliary head is optional
            if module.auxiliary_head is not None:
                module.auxiliary_head.init_weights()

    def init_weights(self):
        """Initialize the weights"""
        self.backbone.init_weights()
        self.decode_head.init_weights()
        # the auxiliary head is optional
        if self.auxiliary_head is not None:
            self.auxiliary_head.init_weights()


# Shared docstring text passed to `add_start_docstrings` on `UperNetForSemanticSegmentation`.
# NOTE(review): the "Parameters:" heading sits before the introductory paragraph instead of before `config` —
# left untouched because this string is used at runtime; confirm against the rendered documentation.
UPERNET_START_DOCSTRING = r"""
    Parameters:
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.
        config ([`UperNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Documentation of the `forward` arguments; it is formatted and attached to
# `UperNetForSemanticSegmentation.forward` through `add_start_docstrings_to_model_forward`.
UPERNET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`SegformerImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers in case the backbone has them. See
            `attentions` under returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers of the backbone. See `hidden_states` under
            returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# UperNetForSemanticSegmentation 类的定义，继承自 UperNetPreTrainedModel 类
@add_start_docstrings(
    """UperNet framework leveraging any vision backbone e.g. for ADE20k, CityScapes.""",
    UPERNET_START_DOCSTRING,
)
class UperNetForSemanticSegmentation(UperNetPreTrainedModel):
    def __init__(self, config):
        """Build the backbone, the decode head and, when requested by `config`, the auxiliary head."""
        # Run the parent-class initializer with the same config
        super().__init__(config)

        # Instantiate the vision backbone described by the config
        self.backbone = load_backbone(config)

        # Main segmentation head; it receives `self.backbone.channels` as its `in_channels`
        self.decode_head = UperNetHead(config, in_channels=self.backbone.channels)
        # The auxiliary FCN head is only created when `config.use_auxiliary_head` is truthy; otherwise it is None
        self.auxiliary_head = UperNetFCNHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    # forward 方法的文档字符串，描述了 forward 方法的输入参数及其作用
    @add_start_docstrings_to_model_forward(UPERNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,

`.\models\upernet\init.py`

# 导入类型检查模块
from typing import TYPE_CHECKING

# 导入自定义的异常和模块延迟加载工具
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# Mapping of submodule name -> public names it provides; handed to `_LazyModule` at the bottom of the file
_import_structure = {
    "configuration_upernet": ["UperNetConfig"],  # the configuration is always registered
}

# Register the modeling classes only when torch is available
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the classes defined in modeling_upernet
    _import_structure["modeling_upernet"] = [
        "UperNetForSemanticSegmentation",
        "UperNetPreTrainedModel",
    ]

# Under static type checking, perform the real imports so type checkers can see the names
if TYPE_CHECKING:
    # The configuration class is imported unconditionally
    from .configuration_upernet import UperNetConfig

    # Same torch availability check as above
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # torch is available: import the modeling classes
        from .modeling_upernet import UperNetForSemanticSegmentation, UperNetPreTrainedModel

# At runtime (not type checking)
else:
    import sys

    # Replace this module's entry in sys.modules with a `_LazyModule` built from `_import_structure`
    # (presumably so the submodules are only imported on first attribute access)
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\videomae\configuration_videomae.py`

# coding=utf-8
# 上面这行指定了文件的编码格式为UTF-8，确保支持非英语字符
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
# 版权声明，版权归HuggingFace Inc.团队所有，保留所有权利
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Note: this file is licensed under the Apache License, Version 2.0; see the license text for the exact terms
# you may not use this file except in compliance with the License.
# 除非符合许可证要求，否则不得使用此文件
# You may obtain a copy of the License at
# 可以在以下网址获取许可证副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# Note: "unless required by applicable law or agreed to in writing" qualifies the "AS IS" clause that follows
# distributed under the License is distributed on an "AS IS" BASIS,
# 根据许可证发布的软件是按“原样”提供的，没有任何形式的明示或暗示保证或条件
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 没有明示或暗示的任何保证或条件
# See the License for the specific language governing permissions and
# 详细信息请参阅许可证
# limitations under the License.
# 许可证下的限制
""" VideoMAE model configuration"""

# 从相对路径中导入预训练配置
from ...configuration_utils import PretrainedConfig
# 从相对路径中导入日志记录工具
from ...utils import logging

# 获取与当前模块相关的日志记录器
logger = logging.get_logger(__name__)

# 预训练模型配置文件的映射字典，将模型名称映射到其配置文件的URL
VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "MCG-NJU/videomae-base": "https://huggingface.co/MCG-NJU/videomae-base/resolve/main/config.json",
}

# Configuration class holding the hyper-parameters of a VideoMAE model
class VideoMAEConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`VideoMAEModel`]. It is used to instantiate a
    VideoMAE model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the VideoMAE
    [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        image_size (`int`, *optional*, defaults to 224):
            The size of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        num_frames (`int`, *optional*, defaults to 16):
            The number of frames in each video.
        tubelet_size (`int`, *optional*, defaults to 2):
            The tubelet size.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function used in the encoder and pooler.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal distribution used to initialize all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys and values.
        use_mean_pooling (`bool`, *optional*, defaults to `True`):
            Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token.
        decoder_num_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the decoder.
        decoder_hidden_size (`int`, *optional*, defaults to 384):
            Dimensionality of the decoder.
        decoder_num_hidden_layers (`int`, *optional*, defaults to 4):
            Number of hidden layers in the decoder.
        decoder_intermediate_size (`int`, *optional*, defaults to 1536):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
        norm_pix_loss (`bool`, *optional*, defaults to `True`):
            Whether to normalize the target patch pixels.

    Example:

    ```python
    >>> from transformers import VideoMAEConfig, VideoMAEModel

    >>> # Initializing a VideoMAE videomae-base style configuration
    >>> configuration = VideoMAEConfig()

    >>> # Randomly initializing a model from the configuration
    >>> model = VideoMAEModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Identifier of this configuration family
    model_type = "videomae"

    def __init__(
        self,
        image_size=224,
        patch_size=16,
        num_channels=3,
        num_frames=16,
        tubelet_size=2,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        qkv_bias=True,
        use_mean_pooling=True,
        decoder_num_attention_heads=6,
        decoder_hidden_size=384,
        decoder_num_hidden_layers=4,
        decoder_intermediate_size=1536,
        norm_pix_loss=True,
        **kwargs,
    ):
        # Let the base configuration class consume the generic keyword arguments
        super().__init__(**kwargs)

        # Input geometry
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_frames = num_frames
        self.tubelet_size = tubelet_size

        # Encoder
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.qkv_bias = qkv_bias
        self.use_mean_pooling = use_mean_pooling

        # Decoder
        self.decoder_num_attention_heads = decoder_num_attention_heads
        self.decoder_hidden_size = decoder_hidden_size
        self.decoder_num_hidden_layers = decoder_num_hidden_layers
        self.decoder_intermediate_size = decoder_intermediate_size

        # Whether to normalize the target patch pixels
        self.norm_pix_loss = norm_pix_loss

`.\models\videomae\convert_videomae_to_pytorch.py`

# coding=utf-8
# 声明文件编码格式为 UTF-8

# Copyright 2022 The HuggingFace Inc. team.
# 版权声明

# Licensed under the Apache License, Version 2.0 (the "License");
# 依据 Apache License, Version 2.0 授权许可

# you may not use this file except in compliance with the License.
# 除非符合 Apache License, Version 2.0 的授权许可，否则不得使用本文件

# You may obtain a copy of the License at
# 可以在以下网址获取许可证副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# Unless required by applicable law or agreed to in writing, the software is distributed on an "AS IS" basis

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 无论是明示还是暗示的保证或条件

# See the License for the specific language governing permissions and
# 详细了解许可证可参阅特定的语言和权限
# limitations under the License.
# 在许可证下的限制

"""Convert VideoMAE checkpoints from the original repository: https://github.com/MCG-NJU/VideoMAE"""
# 文档字符串，指明代码用途是将 VideoMAE 检查点从原始仓库转换过来

import argparse  # 导入命令行参数解析模块
import json  # 导入 JSON 数据处理模块

import gdown  # 导入 gdown 用于下载工具
import numpy as np  # 导入 NumPy 模块
import torch  # 导入 PyTorch 模块
from huggingface_hub import hf_hub_download  # 从 huggingface_hub 导入模型下载函数

from transformers import (  # 导入 transformers 模块中的多个类
    VideoMAEConfig,  # VideoMAE 模型配置类
    VideoMAEForPreTraining,  # 用于预训练的 VideoMAE 模型类
    VideoMAEForVideoClassification,  # 用于视频分类的 VideoMAE 模型类
    VideoMAEImageProcessor,  # VideoMAE 图像处理器类
)


def get_videomae_config(model_name):
    """Build the `VideoMAEConfig` matching `model_name`.

    The size-related fields are filled in by `set_architecture_configs`. For checkpoints whose name does not
    contain "finetuned", mean pooling is switched off. For fine-tuned checkpoints the label mapping is downloaded
    from the "huggingface/label-files" dataset repo and stored on the config.

    Args:
        model_name: Name of the checkpoint being converted (e.g. "videomae-base-finetuned-kinetics").

    Returns:
        The populated configuration object.

    Raises:
        ValueError: If a fine-tuned model name contains neither "kinetics" nor "ssv2" (or if
            `set_architecture_configs` rejects the name).
    """
    config = VideoMAEConfig()

    set_architecture_configs(model_name, config)

    if "finetuned" not in model_name:
        config.use_mean_pooling = False
    else:
        repo_id = "huggingface/label-files"
        if "kinetics" in model_name:
            config.num_labels = 400
            filename = "kinetics400-id2label.json"
        elif "ssv2" in model_name:
            config.num_labels = 174
            filename = "something-something-v2-id2label.json"
        else:
            raise ValueError("Model name should either contain 'kinetics' or 'ssv2' in case it's fine-tuned.")

        label_file = hf_hub_download(repo_id, filename, repo_type="dataset")
        # Use a context manager so the file handle is closed deterministically
        with open(label_file, "r", encoding="utf-8") as f:
            id2label = json.load(f)
        # JSON object keys are strings; the config expects integer class ids
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    return config


def set_architecture_configs(model_name, config):
    """Overwrite the size-related fields of `config` in place based on the size tag found in `model_name`.

    The tags are checked in the order "small", "large", "huge"; the first one contained in the name decides which
    preset is applied. A name containing "base" (and none of the tags above) leaves `config` untouched.

    Args:
        model_name: Name of the checkpoint being converted.
        config: Object whose attributes are overwritten.

    Raises:
        ValueError: If the name contains none of "small", "base", "large" or "huge".
    """
    # Insertion order matters: it reproduces the small > large > huge precedence
    size_presets = {
        "small": {
            "hidden_size": 384,
            "intermediate_size": 1536,
            "num_hidden_layers": 12,
            "num_attention_heads": 16,
            "decoder_num_hidden_layers": 12,
            "decoder_num_attention_heads": 3,
            "decoder_hidden_size": 192,
            "decoder_intermediate_size": 768,
        },
        "large": {
            "hidden_size": 1024,
            "intermediate_size": 4096,
            "num_hidden_layers": 24,
            "num_attention_heads": 16,
            "decoder_num_hidden_layers": 12,
            "decoder_num_attention_heads": 8,
            "decoder_hidden_size": 512,
            "decoder_intermediate_size": 2048,
        },
        "huge": {
            "hidden_size": 1280,
            "intermediate_size": 5120,
            "num_hidden_layers": 32,
            "num_attention_heads": 16,
            "decoder_num_hidden_layers": 12,
            "decoder_num_attention_heads": 8,
            "decoder_hidden_size": 640,
            "decoder_intermediate_size": 2560,
        },
    }

    for size_tag, overrides in size_presets.items():
        if size_tag in model_name:
            for attribute, value in overrides.items():
                setattr(config, attribute, value)
            return

    # "base" keeps whatever is already on `config`; any other name is rejected
    if "base" not in model_name:
        raise ValueError('Model name should include either "small", "base", "large", or "huge"')
# Translate a parameter name from the original checkpoint into the converted model's naming scheme
def rename_key(name):
    """Return `name` rewritten by an ordered list of substring substitutions.

    Each rule is `(needle, replacement, blockers)`: when `needle` occurs in the current name and none of the
    `blockers` do, every occurrence of `needle` is replaced. Rules are applied one after another on the running
    result, so their order is significant.
    """
    substitution_rules = (
        ("encoder.", "", ()),
        ("cls_token", "videomae.embeddings.cls_token", ()),
        ("decoder_pos_embed", "decoder.decoder_pos_embed", ()),
        ("pos_embed", "videomae.embeddings.position_embeddings", ("decoder",)),
        ("patch_embed.proj", "videomae.embeddings.patch_embeddings.projection", ()),
        ("patch_embed.norm", "videomae.embeddings.norm", ()),
        ("decoder.blocks", "decoder.decoder_layers", ()),
        ("blocks", "videomae.encoder.layer", ()),
        ("attn.proj", "attention.output.dense", ()),
        ("attn", "attention.self", ("bias",)),
        ("attn", "attention.attention", ()),
        ("norm1", "layernorm_before", ()),
        ("norm2", "layernorm_after", ()),
        ("mlp.fc1", "intermediate.dense", ()),
        ("mlp.fc2", "output.dense", ()),
        ("decoder_embed", "decoder.decoder_embed", ()),
        ("decoder_norm", "decoder.decoder_norm", ()),
        ("decoder_pred", "decoder.decoder_pred", ()),
        ("norm.weight", "videomae.layernorm.weight", ("decoder", "fc")),
        ("norm.bias", "videomae.layernorm.bias", ("decoder", "fc")),
        ("head", "classifier", ("decoder",)),
    )

    renamed = name
    for needle, replacement, blockers in substitution_rules:
        if needle not in renamed:
            continue
        # Blockers are evaluated against the name as rewritten so far
        if any(blocker in renamed for blocker in blockers):
            continue
        renamed = renamed.replace(needle, replacement)

    return renamed
    # 遍历原始状态字典的键的副本
    for key in orig_state_dict.copy().keys():
        # 弹出当前键对应的值
        val = orig_state_dict.pop(key)

        # 如果键以"encoder."开头，则移除该前缀
        if key.startswith("encoder."):
            key = key.replace("encoder.", "")

        # 如果键中包含"qkv"
        if "qkv" in key:
            # 根据"."分割键
            key_split = key.split(".")
            # 如果键以"decoder.blocks"开头
            if key.startswith("decoder.blocks"):
                # 设置维度和层号
                dim = config.decoder_hidden_size
                layer_num = int(key_split[2])
                prefix = "decoder.decoder_layers."
                # 如果键包含"weight"
                if "weight" in key:
                    # 更新原始状态字典，替换成特定格式的键和对应的值
                    orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :]
                    orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :]
                    orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :]
            else:
                # 设置维度和层号
                dim = config.hidden_size
                layer_num = int(key_split[1])
                prefix = "videomae.encoder.layer."
                # 如果键包含"weight"
                if "weight" in key:
                    # 更新原始状态字典，替换成特定格式的键和对应的值
                    orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :]
                    orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :]
                    orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :]
        else:
            # 对键进行重命名处理并更新原始状态字典
            orig_state_dict[rename_key(key)] = val

    # 返回更新后的原始状态字典
    return orig_state_dict
# We verify our results on a video of someone eating spaghetti
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
def prepare_video():
    """Download the reference `.npy` clip from the Hub and return it as a list.

    The array is split along its first axis (presumably one entry per frame).
    """
    video_path = hf_hub_download(
        repo_type="dataset", repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy"
    )
    frames = np.load(video_path)
    return list(frames)


def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_name, push_to_hub):
    """Convert an original VideoMAE checkpoint, verify its outputs and optionally save / upload the result.

    Args:
        checkpoint_url: URL of the original checkpoint; it is downloaded with `gdown` to "pytorch_model.bin".
        pytorch_dump_folder_path: Directory in which the model and image processor are saved; skipped when None.
        model_name: Name of the checkpoint; selects the configuration, the model class and the reference values.
        push_to_hub: When truthy, the converted model is pushed to the hub.

    Raises:
        ValueError: If `model_name` has no reference values in this function.
        AssertionError: If the logits shape, the logits slice or (for "videomae-base-short") the loss does not
            match the reference values.
    """
    # Build the configuration for this checkpoint
    config = get_videomae_config(model_name)

    if "finetuned" in model_name:
        # Fine-tuned checkpoints map to the video classification model
        model = VideoMAEForVideoClassification(config)
    else:
        # All other checkpoints map to the pre-training model
        model = VideoMAEForPreTraining(config)

    # Download the original checkpoint, which is hosted on Google Drive
    output = "pytorch_model.bin"
    gdown.cached_download(checkpoint_url, output, quiet=False)
    # Load the checkpoint onto the CPU
    files = torch.load(output, map_location="cpu")
    # The weights are stored under "model" when that key exists, otherwise under "module"
    if "model" in files:
        state_dict = files["model"]
    else:
        state_dict = files["module"]
    # Rename / split the parameters to match the converted model
    new_state_dict = convert_state_dict(state_dict, config)

    # Load the converted weights into the model
    model.load_state_dict(new_state_dict)
    # Switch to evaluation mode
    model.eval()

    # Image processor used to preprocess the video frames
    image_processor = VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
    # Fetch the test video and preprocess it into PyTorch tensors
    video = prepare_video()
    inputs = image_processor(video, return_tensors="pt")

    # Checkpoints that are not fine-tuned additionally need a boolean mask
    if "finetuned" not in model_name:
        # Download the stored mask file from the Hub
        local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
        # Load it and pass it to the model as `bool_masked_pos`
        inputs["bool_masked_pos"] = torch.load(local_path)

    # Forward pass
    outputs = model(**inputs)
    logits = outputs.logits

    # Model names for which reference values exist (used in the error message below)
    model_names = [
        "videomae-small-finetuned-kinetics",
        "videomae-small-finetuned-ssv2",
        # Kinetics-400 checkpoints (short = pretrained only for 800 epochs instead of 1600)
        "videomae-base-short",
        "videomae-base-short-finetuned-kinetics",
        "videomae-base",
        "videomae-base-finetuned-kinetics",
        "videomae-large",
        "videomae-large-finetuned-kinetics",
        "videomae-huge-finetuned-kinetics",
        # Something-Something-v2 checkpoints (short = pretrained only for 800 epochs instead of 2400)
        "videomae-base-short-ssv2",
        "videomae-base-short-finetuned-ssv2",
        "videomae-base-ssv2",
        "videomae-base-finetuned-ssv2",
    ]

    # NOTE: logits were tested with image_mean and image_std equal to [0.5, 0.5, 0.5] and [0.5, 0.5, 0.5]
    if model_name == "videomae-small-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([-0.9291, -0.4061, -0.9307])
    elif model_name == "videomae-small-finetuned-ssv2":
        expected_shape = torch.Size([1, 174])
        expected_slice = torch.tensor([0.2671, -0.4689, -0.8235])
    elif model_name == "videomae-base":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.7739, 0.7968, 0.7089], [0.6701, 0.7487, 0.6209], [0.4287, 0.5158, 0.4773]])
    elif model_name == "videomae-base-short":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]])
        # For this model the loss was verified for both normalized and unnormalized targets
        expected_loss = torch.tensor([0.5142]) if config.norm_pix_loss else torch.tensor([0.6469])
    elif model_name == "videomae-large":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.7149, 0.7997, 0.6966], [0.6768, 0.7869, 0.6948], [0.5139, 0.6221, 0.5605]])
    elif model_name == "videomae-large-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.0771, 0.0011, -0.3625])
    elif model_name == "videomae-huge-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.2433, 0.1632, -0.4894])
    elif model_name == "videomae-base-short-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.6588, 0.0990, -0.2493])
    elif model_name == "videomae-base-finetuned-kinetics":
        expected_shape = torch.Size([1, 400])
        expected_slice = torch.tensor([0.3669, -0.0688, -0.2421])
    elif model_name == "videomae-base-short-ssv2":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.4712, 0.5296, 0.5786], [0.2278, 0.2729, 0.4026], [0.0352, 0.0730, 0.2506]])
    elif model_name == "videomae-base-short-finetuned-ssv2":
        expected_shape = torch.Size([1, 174])
        expected_slice = torch.tensor([-0.0537, -0.1539, -0.3266])
    elif model_name == "videomae-base-ssv2":
        expected_shape = torch.Size([1, 1408, 1536])
        expected_slice = torch.tensor([[0.8131, 0.8727, 0.8546], [0.7366, 0.9377, 0.8870], [0.5935, 0.8874, 0.8564]])
    elif model_name == "videomae-base-finetuned-ssv2":
        expected_shape = torch.Size([1, 174])
        expected_slice = torch.tensor([0.1961, -0.8337, -0.6389])
    else:
        raise ValueError(f"Model name not supported. Should be one of {model_names}")

    # Verify the shape of the logits
    assert logits.shape == expected_shape
    # Fine-tuned models: compare the first three logits; otherwise compare the top-left 3x3 slice
    if "finetuned" in model_name:
        assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4)
    else:
        print("Logits:", logits[0, :3, :3])
        assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4)
    print("Logits ok!")

    # Verify the loss where a reference value exists
    if model_name == "videomae-base-short":
        loss = outputs.loss
        assert torch.allclose(loss, expected_loss, atol=1e-4)
        print("Loss ok!")

    # Save the model and image processor when an output directory was given
    if pytorch_dump_folder_path is not None:
        print(f"Saving model and image processor to {pytorch_dump_folder_path}")
        image_processor.save_pretrained(pytorch_dump_folder_path)
        model.save_pretrained(pytorch_dump_folder_path)
    # Upload the converted model when requested
    if push_to_hub:
        # Progress message
        print("Pushing to the hub...")
        # Push the model under `model_name` to the "nielsr" organization
        model.push_to_hub(model_name, organization="nielsr")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the conversion
    parser = argparse.ArgumentParser()

    # Every argument below has a default, so none of them is strictly required on the command line
    parser.add_argument(
        "--checkpoint_url",
        # The query parameters are separated by plain "&"; an HTML-escaped "&amp;" would turn the parameter
        # names into "amp;export", "amp;confirm", ... and they would not be recognized by the server
        default="https://drive.google.com/u/1/uc?id=1tEhLyskjb755TJ65ptsrafUG2llSwQE1&export=download&confirm=t&uuid=aa3276eb-fb7e-482a-adec-dc7171df14c4",
        type=str,
        help=(
            "URL of the original PyTorch checkpoint (on Google Drive) you'd like to convert. Should be a direct"
            " download link."
        ),
    )

    # Output directory for the converted model and image processor
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default="/Users/nielsrogge/Documents/VideoMAE/Test",
        type=str,
        help="Path to the output PyTorch model directory.",
    )

    # Name of the checkpoint; selects the configuration and the reference values
    parser.add_argument("--model_name", default="videomae-base", type=str, help="Name of the model.")

    # Flag: upload the converted model to the hub
    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )

    args = parser.parse_args()

    convert_videomae_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub)

`.\models\videomae\feature_extraction_videomae.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for VideoMAE.
"""

# 导入警告模块
import warnings
# 导入日志模块
from ...utils import logging
# 导入图像处理类 VideoMAEImageProcessor
from .image_processing_videomae import VideoMAEImageProcessor

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# Deprecated name for VideoMAEImageProcessor, kept as a thin subclass
class VideoMAEFeatureExtractor(VideoMAEImageProcessor):
    """Subclass of `VideoMAEImageProcessor` that emits a `FutureWarning` on construction."""

    def __init__(self, *args, **kwargs) -> None:
        # Tell the caller this class is going away, then defer entirely to the parent initializer
        deprecation_message = (
            "The class VideoMAEFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use VideoMAEImageProcessor instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        super().__init__(*args, **kwargs)

`.\models\videomae\image_processing_videomae.py`

# 指定编码为UTF-8，确保源文件可以正确解析中文和其他非ASCII字符
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# 根据Apache License, Version 2.0许可协议，这段代码版权归HuggingFace Inc.团队所有
# 除非遵循许可协议的规定，否则不得使用此文件
# 可以从以下链接获取许可协议的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件基于"按现状"提供，不提供任何明示或暗示的保证或条件
# 请查阅许可协议以获取更多详细信息
"""Image processor class for VideoMAE."""


from typing import Dict, List, Optional, Union

import numpy as np

# 从image_processing_utils模块导入BaseImageProcessor、BatchFeature和get_size_dict函数
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
# 从image_transforms模块导入get_resize_output_image_size、resize和to_channel_dimension_format函数
from ...image_transforms import (
    get_resize_output_image_size,
    resize,
    to_channel_dimension_format,
)
# 从image_utils模块导入IMAGENET_STANDARD_MEAN、IMAGENET_STANDARD_STD、ChannelDimension、
# ImageInput、PILImageResampling、infer_channel_dimension_format、is_scaled_image、is_valid_image、
# to_numpy_array、valid_images、validate_kwargs和validate_preprocess_arguments函数
from ...image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    is_valid_image,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
# 从utils模块导入TensorType、is_vision_available和logging函数
from ...utils import TensorType, is_vision_available, logging

# 如果视觉处理可用，导入PIL库
if is_vision_available():
    import PIL

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# Normalize the accepted video inputs into a batch: a list of videos, each a list of frames
def make_batched(videos) -> List[List[ImageInput]]:
    """Wrap `videos` so that the result is always a list of lists of images.

    - a list/tuple whose first element is a list/tuple starting with a valid image is returned unchanged;
    - a list/tuple whose first element is a valid image is treated as one video and wrapped once;
    - a single valid image is wrapped twice.

    Raises:
        ValueError: If `videos` matches none of the shapes above.
    """
    if isinstance(videos, (list, tuple)):
        first_item = videos[0]
        # Already batched: sequence of sequences of images
        if isinstance(first_item, (list, tuple)) and is_valid_image(first_item[0]):
            return videos
        # A single video given as a sequence of images
        if is_valid_image(first_item):
            return [videos]

    # A single image
    if is_valid_image(videos):
        return [[videos]]

    raise ValueError(f"Could not make batched video from {videos}")


# 定义VideoMAEImageProcessor类，继承自BaseImageProcessor类
class VideoMAEImageProcessor(BaseImageProcessor):
    r"""
    Constructs a VideoMAE image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
            Size of the output image after resizing. The shortest edge of the image will be resized to
            `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
            the `size` parameter in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
            parameter in the `preprocess` method.
        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
            `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor`
            parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
            method.
    """

    # Names of the model inputs this processor produces; only "pixel_values".
    model_input_names = ["pixel_values"]
    # 初始化函数，用于设置图像处理的各种参数
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_center_crop: bool = True,
        crop_size: Dict[str, int] = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ) -> None:
        """
        Store the default preprocessing settings on the processor.

        When left as `None`, `size` falls back to `{"shortest_edge": 224}`, `crop_size` to
        `{"height": 224, "width": 224}`, and `image_mean` / `image_std` to `IMAGENET_STANDARD_MEAN` /
        `IMAGENET_STANDARD_STD`. Both size arguments are passed through `get_size_dict` before being stored.
        Remaining keyword arguments are forwarded to the parent initializer.
        """
        super().__init__(**kwargs)

        # Resolve the `None` defaults, then run both size arguments through `get_size_dict`.
        if size is None:
            size = {"shortest_edge": 224}
        size = get_size_dict(size, default_to_square=False)

        if crop_size is None:
            crop_size = {"height": 224, "width": 224}
        crop_size = get_size_dict(crop_size, param_name="crop_size")

        if image_mean is None:
            image_mean = IMAGENET_STANDARD_MEAN
        if image_std is None:
            image_std = IMAGENET_STANDARD_STD

        # Resize settings.
        self.do_resize = do_resize
        self.size = size
        self.resample = resample

        # Center-crop settings.
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size

        # Rescale settings.
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor

        # Normalization settings.
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std

        # Keyword names recorded as valid for this processor.
        self._valid_processor_keys = [
            "videos",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
                have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
                shortest edge of length `s` while keeping the aspect ratio of the original image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Returns:
            `np.ndarray`: The resized image, as returned by the module-level `resize` transform.

        Raises:
            ValueError: If `size` contains neither `"shortest_edge"` nor both `"height"` and `"width"`.
        """
        size = get_size_dict(size, default_to_square=False)

        # "shortest_edge" takes precedence over an explicit (height, width) pair.
        if "shortest_edge" in size:
            output_size = get_resize_output_image_size(
                image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
            )
        elif "height" in size and "width" in size:
            output_size = (size["height"], size["width"])
        else:
            raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")

        # Inside the method body `resize` resolves to the imported module-level transform, not this method.
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def _preprocess_image(
        self,
        image: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_center_crop: bool = None,
        crop_size: Dict[str, int] = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Preprocess a single image (one video frame).

        The enabled steps run in a fixed order: resize, center crop, rescale, normalize. The result is then
        converted to the `data_format` channel layout and returned.
        """
        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_center_crop=do_center_crop,
            crop_size=crop_size,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        # Every transform below works on numpy arrays.
        image = to_numpy_array(image)

        if is_scaled_image(image) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        # Resolve the channel layout once so every step receives the same value.
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

        # (enabled flag, transform) pairs, listed in execution order.
        pipeline = (
            (
                do_resize,
                lambda frame: self.resize(
                    image=frame, size=size, resample=resample, input_data_format=input_data_format
                ),
            ),
            (
                do_center_crop,
                lambda frame: self.center_crop(frame, size=crop_size, input_data_format=input_data_format),
            ),
            (
                do_rescale,
                lambda frame: self.rescale(image=frame, scale=rescale_factor, input_data_format=input_data_format),
            ),
            (
                do_normalize,
                lambda frame: self.normalize(
                    image=frame, mean=image_mean, std=image_std, input_data_format=input_data_format
                ),
            ),
        )
        for enabled, transform in pipeline:
            if enabled:
                image = transform(image)

        return to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)

    def preprocess(
        self,
        videos: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_center_crop: bool = None,
        crop_size: Dict[str, int] = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
        Preprocess an image, a video, or a batch of videos.

        The per-call arguments `do_resize`, `size`, `resample`, `do_center_crop`, `crop_size`, `do_rescale`,
        `rescale_factor`, `do_normalize`, `image_mean` and `image_std` override the settings of the same name
        given to the constructor, as stated in the class docstring.
        """
        # NOTE(review): only the signature and docstring of this method are present here; the method body is
        # missing. Presumably `None` means "use the value stored on the processor" -- confirm against the full
        # implementation before relying on it.

`.\models\videomae\modeling_videomae.py`

# 设置文件编码为 UTF-8
# 版权声明，指明版权归属及保留所有权利
# 根据 Apache License, Version 2.0 许可证使用本文件
# 除非符合许可证要求，否则不得使用本文件
# 可在以下网址获取许可证副本：http://www.apache.org/licenses/LICENSE-2.0
# 本软件根据许可证“按原样”提供，无任何明示或暗示的担保或条件
# 请参阅许可证了解具体条款和限制
""" PyTorch VideoMAE (masked autoencoder) model."""

# 导入所需模块和库
import collections.abc  # 导入 collections.abc 模块
import math  # 导入 math 模块
from copy import deepcopy  # 导入 deepcopy 函数
from dataclasses import dataclass  # 导入 dataclass 装饰器
from typing import Optional, Set, Tuple, Union  # 导入类型注解相关的类和装饰器

import numpy as np  # 导入 NumPy 库并命名为 np
import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 的 checkpoint 功能
from torch import nn  # 从 PyTorch 导入 nn 模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 从 nn 模块导入损失函数类

# 导入 Hugging Face 库中的相关模块和函数
from ...activations import ACT2FN  # 从 activations 模块导入 ACT2FN 函数
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput  # 从 modeling_outputs 导入输出类
from ...modeling_utils import PreTrainedModel  # 从 modeling_utils 导入预训练模型基类
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer  # 导入模型优化相关函数
from ...utils import (  # 导入通用工具函数和类
    ModelOutput,  # 导入 ModelOutput 类
    add_start_docstrings,  # 导入函数，用于向模型方法添加文档字符串
    add_start_docstrings_to_model_forward,  # 导入函数，用于向模型前向方法添加文档字符串
    logging,  # 导入 logging 模块
    replace_return_docstrings,  # 导入函数，用于替换返回文档字符串
)
from ...utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # 导入常量
from .configuration_videomae import VideoMAEConfig  # 导入 VideoMAE 模型的配置类


# Module-level logger.
logger = logging.get_logger(__name__)

# Config class name and checkpoint id referenced by the generated documentation.
_CONFIG_FOR_DOC = "VideoMAEConfig"
_CHECKPOINT_FOR_DOC = "MCG-NJU/videomae-base"

# Known pretrained VideoMAE checkpoints.
VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "MCG-NJU/videomae-base",
    # See all VideoMAE models at https://huggingface.co/models?filter=videomae
]


@dataclass
class VideoMAEDecoderOutput(ModelOutput):
    """
    Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    # Pixel reconstruction logits; defaults to None.
    logits: torch.FloatTensor = None
    # Optional tuple of per-layer hidden states; defaults to None.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Optional tuple of per-layer attention weights; defaults to None.
    attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class VideoMAEForPreTrainingOutput(ModelOutput):
    """
    Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`):
            Pixel reconstruction loss.
        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    # Every field defaults to None, so the output can be built with any subset of them.
    # Pixel reconstruction loss.
    loss: Optional[torch.FloatTensor] = None
    # Pixel reconstruction logits.
    logits: torch.FloatTensor = None
    # Optional tuple of per-layer hidden states.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Optional tuple of per-layer attention weights.
    attentions: Optional[Tuple[torch.FloatTensor]] = None


# sin-cos position encoding
# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
def get_sinusoid_encoding_table(n_position, d_hid):
    """
    Build the fixed sine/cosine position-encoding table.

    Args:
        n_position (int): Number of positions to encode.
        d_hid (int): Length of each position vector.

    Returns:
        torch.FloatTensor: Table of shape `(1, n_position, d_hid)`. Even columns hold the sine and odd columns
        the cosine of `position / 10000 ** (2 * (column // 2) / d_hid)`.
    """
    # One divisor per column; columns 2i and 2i+1 share the same divisor.
    divisors = [np.power(10000, 2 * (column // 2) / d_hid) for column in range(d_hid)]

    # Raw angles, one row per position.
    angle_table = np.array([[position / divisor for divisor in divisors] for position in range(n_position)])

    # Overwrite the angles in place: sine on even columns (dim 2i), cosine on odd columns (dim 2i+1).
    even_columns = slice(0, None, 2)
    odd_columns = slice(1, None, 2)
    angle_table[:, even_columns] = np.sin(angle_table[:, even_columns])
    angle_table[:, odd_columns] = np.cos(angle_table[:, odd_columns])

    # Add the leading batch dimension.
    return torch.FloatTensor(angle_table).unsqueeze(0)


class VideoMAEEmbeddings(nn.Module):
    """
    Patch embeddings plus fixed sinusoidal position embeddings for the VideoMAE model.

    Args:
        config (object): Configuration object; `config.hidden_size` sets the width of the position table.

    Attributes:
        patch_embeddings (VideoMAEPatchEmbeddings): Module that turns pixel values into patch embeddings.
        num_patches (int): Number of patches, read from `patch_embeddings.num_patches`.
        position_embeddings (torch.FloatTensor): Table returned by `get_sinusoid_encoding_table`.
        config (object): The configuration object passed in.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        patch_embedder = VideoMAEPatchEmbeddings(config)
        self.patch_embeddings = patch_embedder
        self.num_patches = patch_embedder.num_patches

        # Fixed sin-cos table, kept as a plain tensor attribute (neither a parameter nor a registered buffer).
        self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size)

    def forward(self, pixel_values, bool_masked_pos):
        """
        Embed `pixel_values`, add the position table and, when `bool_masked_pos` is given, keep only the
        positions where the mask is False.
        """
        patch_tokens = self.patch_embeddings(pixel_values)

        # The table is a plain tensor, so match dtype and device explicitly; the copy is detached from autograd.
        position_table = self.position_embeddings.type_as(patch_tokens).to(patch_tokens.device).clone().detach()
        tokens = patch_tokens + position_table

        if bool_masked_pos is None:
            return tokens

        # `~bool_masked_pos` selects the visible patches; regroup them per batch element.
        batch_size, _, hidden_dim = tokens.shape
        visible_tokens = tokens[~bool_masked_pos]
        return visible_tokens.reshape(batch_size, -1, hidden_dim)
# 视频到补丁嵌入的模块。将形状为 (batch_size, num_frames, num_channels, height, width) 的视频批次转换为
# 形状为 (batch_size, seq_len, hidden_size) 的张量，以供 Transformer 编码器使用。

class VideoMAEPatchEmbeddings(nn.Module):
    """
    Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
    height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
    patch_size).

    """

    def __init__(self, config):
        super().__init__()

        image_size = config.image_size
        patch_size = config.patch_size
        num_channels = config.num_channels
        hidden_size = config.hidden_size
        num_frames = config.num_frames
        tubelet_size = config.tubelet_size

        # A scalar size means a square image / patch; expand it to a (height, width) pair.
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)

        self.image_size = image_size
        self.patch_size = patch_size
        self.tubelet_size = int(tubelet_size)

        # seq_len: spatial patches per frame times the number of tubelets along the time axis.
        num_patches = (
            (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
        )
        self.num_channels = num_channels
        self.num_patches = num_patches

        # Kernel size equals stride, so every (tubelet, patch, patch) cube is projected exactly once.
        self.projection = nn.Conv3d(
            in_channels=num_channels,
            out_channels=hidden_size,
            kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
            stride=(self.tubelet_size, patch_size[0], patch_size[1]),
        )

    def forward(self, pixel_values):
        """
        Project `pixel_values` of shape (batch_size, num_frames, num_channels, height, width) to patch embeddings.

        Raises:
            ValueError: If the channel count or the spatial size differs from the configured values.
        """
        _, _, num_channels, height, width = pixel_values.shape

        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )

        # Conv3d expects (batch_size, num_channels, num_frames, height, width).
        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)

        # Flatten the (time, height, width) grid into one sequence axis and move it ahead of the channel axis.
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)

        return embeddings


class VideoMAESelfAttention(nn.Module):
    """
    Multi-head self-attention. The query/key/value projections are bias-free linear layers; when
    `config.qkv_bias` is set, separate learnable biases are added to the queries and values while the key bias
    is a constant zero vector.
    """

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            # The value is interpolated directly; a trailing comma here would format it as a 1-tuple.
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)

        # Biases live outside the linear layers so the key projection can stay bias-free.
        if config.qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(self.all_head_size))
            self.v_bias = nn.Parameter(torch.zeros(self.all_head_size))
        else:
            self.q_bias = None
            self.v_bias = None

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into heads and move the head axis ahead of the sequence axis."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """
        Compute scaled dot-product self-attention over `hidden_states`.

        Returns:
            `(context_layer, attention_probs)` when `output_attentions` is true, otherwise `(context_layer,)`.
        """
        # The key bias is a non-trainable zero vector whenever query/value biases are in use.
        k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None
        keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias)
        values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias)
        queries = nn.functional.linear(input=hidden_states, weight=self.query.weight, bias=self.q_bias)

        key_layer = self.transpose_for_scores(keys)
        value_layer = self.transpose_for_scores(values)
        query_layer = self.transpose_for_scores(queries)

        # Raw attention scores: dot product between queries and keys, scaled by sqrt(head size).
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # Dropout is applied to the attention probabilities themselves.
        attention_probs = self.dropout(attention_probs)

        # Multiply by the head mask, if one was given.
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        # Merge the heads back into a single hidden dimension.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->VideoMAE
class VideoMAESelfOutput(nn.Module):
    """
    Output projection applied to the self-attention result.

    The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        hidden_size = config.hidden_size
        # Square projection: the hidden width is unchanged.
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states` and apply dropout; `input_tensor` is accepted but not used here."""
        return self.dropout(self.dense(hidden_states))


# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->VideoMAE
class VideoMAEAttention(nn.Module):
    """Self-attention followed by its output projection, with support for pruning attention heads."""

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.attention = VideoMAESelfAttention(config)
        self.output = VideoMAESelfOutput(config)
        # Heads that have already been pruned from this module.
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        """Prune `heads` from the query/key/value projections and from the output projection."""
        if len(heads) == 0:
            return

        self_attention = self.attention
        heads, index = find_pruneable_heads_and_indices(
            heads, self_attention.num_attention_heads, self_attention.attention_head_size, self.pruned_heads
        )

        # Prune the three input projections, then the output projection along its input dimension.
        for projection_name in ("query", "key", "value"):
            pruned = prune_linear_layer(getattr(self_attention, projection_name), index)
            setattr(self_attention, projection_name, pruned)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the bookkeeping in sync with the smaller projections.
        self_attention.num_attention_heads -= len(heads)
        self_attention.all_head_size = self_attention.attention_head_size * self_attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """Run self-attention and the output projection; any extra attention outputs are passed through."""
        attention_results = self.attention(hidden_states, head_mask, output_attentions)
        projected = self.output(attention_results[0], hidden_states)

        # Everything after the first element (the attention weights, if requested) is appended unchanged.
        return (projected,) + attention_results[1:]


# Copied from transformers.models.vit.modeling_vit.ViTIntermediate ViT->VideoMAE
class VideoMAEIntermediate(nn.Module):
    """Linear projection from `hidden_size` to `intermediate_size` followed by the configured activation."""

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)

        # A string is looked up in ACT2FN; any other value is used directly as the activation.
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the linear projection and then the activation."""
        return self.intermediate_act_fn(self.dense(hidden_states))
# Copied from transformers.models.vit.modeling_vit.ViTOutput ViT->VideoMAE
class VideoMAEOutput(nn.Module):
    """Projects from `intermediate_size` back to `hidden_size`, applies dropout and adds the residual input."""

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Return `dropout(dense(hidden_states)) + input_tensor`."""
        projected = self.dropout(self.dense(hidden_states))

        # The residual connection is applied here.
        return projected + input_tensor


# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->VideoMAE
class VideoMAELayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1

        # Sub-modules are created in the same order as before so parameter registration is unchanged.
        self.attention = VideoMAEAttention(config)
        self.intermediate = VideoMAEIntermediate(config)
        self.output = VideoMAEOutput(config)

        # One LayerNorm ahead of self-attention and one ahead of the feed-forward part.
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """
        Apply layernorm and self-attention with a residual connection, then layernorm and the feed-forward
        sub-modules with a second residual connection.

        Returns:
            A tuple whose first element is the layer output, followed by whatever extra outputs the attention
            module returned (the attention weights when `output_attentions` is set).
        """
        # Layernorm is applied before self-attention.
        attention_results = self.attention(
            self.layernorm_before(hidden_states),
            head_mask,
            output_attentions=output_attentions,
        )

        # First residual connection.
        residual = attention_results[0] + hidden_states

        # Layernorm is also applied after self-attention, ahead of the intermediate projection.
        feed_forward = self.intermediate(self.layernorm_after(residual))

        # Second residual connection happens inside `self.output`.
        layer_output = self.output(feed_forward, residual)

        return (layer_output,) + attention_results[1:]


# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->VideoMAE
class VideoMAEEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` VideoMAELayer blocks applied one after another."""

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([VideoMAELayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        """
        Run `hidden_states` through every layer.

        When requested, the hidden states seen before each layer plus the final ones, and each layer's attention
        weights, are collected. Returns a `BaseModelOutput` if `return_dict` is set, otherwise a tuple of the
        non-None values among (last hidden state, all hidden states, all attentions).
        """
        collected_hidden = [] if output_hidden_states else None
        collected_attentions = [] if output_attentions else None

        for index, block in enumerate(self.layer):
            # Record the input of each layer; the final output is recorded after the loop.
            if output_hidden_states:
                collected_hidden.append(hidden_states)

            block_head_mask = None if head_mask is None else head_mask[index]

            if self.gradient_checkpointing and self.training:
                # Checkpointed call: used only when checkpointing is enabled and the module is in training mode.
                block_outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    block_head_mask,
                    output_attentions,
                )
            else:
                block_outputs = block(hidden_states, block_head_mask, output_attentions)

            hidden_states = block_outputs[0]

            if output_attentions:
                collected_attentions.append(block_outputs[1])

        if output_hidden_states:
            collected_hidden.append(hidden_states)

        all_hidden_states = None if collected_hidden is None else tuple(collected_hidden)
        all_self_attentions = None if collected_attentions is None else tuple(collected_attentions)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states,
                attentions=all_self_attentions,
            )
        return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
@add_start_docstrings(
    "The bare VideoMAE Model transformer outputting raw hidden-states without any specific head on top.",
    VIDEOMAE_START_DOCSTRING,
)

模型类的装饰器，用于为 `VideoMAEModel` 添加文档字符串，并且包括了模型的描述信息和参数说明。


class VideoMAEModel(VideoMAEPreTrainedModel):

定义了 `VideoMAEModel` 类，它继承自 `VideoMAEPreTrainedModel` 类，是视频掩码自编码器（Video Masked Autoencoder，VideoMAE）的模型类。


VIDEOMAE_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VideoMAEConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

`VIDEOMAE_START_DOCSTRING` 是一个原始字符串，用于描述 `VideoMAEModel` 类的基本信息和参数说明。它介绍了模型是如何作为 PyTorch 的 `torch.nn.Module` 子类来使用的，并提供了初始化参数的说明。


VIDEOMAE_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`VideoMAEImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

`VIDEOMAE_INPUTS_DOCSTRING` 是一个原始字符串，用于描述 `VideoMAEModel` 类的输入参数。它详细说明了模型接受的各种输入参数，包括像素值、头部掩码、是否返回注意力张量和隐藏状态等。

这些注释和文档字符串为 `VideoMAEModel` 类提供了清晰的描述和参数说明，帮助用户了解如何使用和配置该模型。
    # 初始化函数，接受配置参数并调用父类的初始化方法
    def __init__(self, config):
        """Create the embeddings, the encoder and the optional final layer norm from `config`."""
        super().__init__(config)
        self.config = config

        # Input embeddings followed by the encoder stack.
        self.embeddings = VideoMAEEmbeddings(config)
        self.encoder = VideoMAEEncoder(config)

        # No final layer norm is created when `config.use_mean_pooling` is set.
        self.layernorm = (
            None if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )

        # Initialize weights and apply final processing
        self.post_init()

    # 返回输入嵌入对象的方法
    def get_input_embeddings(self):
        """Return the `patch_embeddings` attribute of `self.embeddings`."""
        embedding_layer = self.embeddings
        return embedding_layer.patch_embeddings

    # 剪枝模型中注意力头的方法
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # 遍历需要剪枝的层和对应的注意力头
        for layer, heads in heads_to_prune.items():
            # 调用编码器对象的指定层的注意力机制对象进行注意力头的剪枝操作
            self.encoder.layer[layer].attention.prune_heads(heads)

    # 模型前向传播方法，用于处理视频输入和其他参数，返回模型输出
    @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# VideoMAEDecoder: a stack of VideoMAELayer blocks followed by a layer norm and a projection head.
class VideoMAEDecoder(nn.Module):
    """Decoder made of `config.decoder_num_hidden_layers` VideoMAELayer blocks, a LayerNorm and a head."""

    def __init__(self, config, num_patches):
        super().__init__()

        # Output width of the head: channels * tubelet_size * patch_size**2.
        decoder_num_labels = config.num_channels * config.tubelet_size * config.patch_size**2

        # Work on a deep copy so the caller's config is untouched while the decoder-specific sizes are swapped in.
        decoder_config = deepcopy(config)
        for field in ("hidden_size", "num_hidden_layers", "num_attention_heads", "intermediate_size"):
            setattr(decoder_config, field, getattr(config, f"decoder_{field}"))

        # One VideoMAELayer per decoder layer, all built from the decoder config.
        layers = [VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
        self.decoder_layers = nn.ModuleList(layers)

        self.norm = nn.LayerNorm(config.decoder_hidden_size)

        # Linear projection head, or identity when there is nothing to project to.
        if decoder_num_labels > 0:
            self.head = nn.Linear(config.decoder_hidden_size, decoder_num_labels)
        else:
            self.head = nn.Identity()

        # Gradient checkpointing is off by default.
        self.gradient_checkpointing = False
        self.config = config

    def forward(
        self,
        hidden_states,
        return_token_num,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """
        Run the decoder layers, then normalize and project the result.

        When `return_token_num > 0` only the last `return_token_num` positions along dimension 1 are normalized
        and projected. Returns a `VideoMAEDecoderOutput`, or a tuple of the non-`None` values among
        (logits, hidden states, attentions) when `return_dict` is falsy.
        """
        # Accumulators stay `None` when the corresponding output was not requested.
        collected_states = () if output_hidden_states else None
        collected_attentions = () if output_attentions else None

        for decoder_layer in self.decoder_layers:
            # Record the input of each layer.
            if output_hidden_states:
                collected_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    None,
                    output_attentions,
                )
            else:
                layer_outputs = decoder_layer(hidden_states, head_mask=None, output_attentions=output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                collected_attentions += (layer_outputs[1],)

        # Record the output of the last layer as well.
        if output_hidden_states:
            collected_states += (hidden_states,)

        # Keep only the trailing `return_token_num` positions before the norm and the head.
        if return_token_num > 0:
            hidden_states = hidden_states[:, -return_token_num:]

        logits = self.head(self.norm(hidden_states))

        if return_dict:
            return VideoMAEDecoderOutput(
                logits=logits, hidden_states=collected_states, attentions=collected_attentions
            )
        return tuple(item for item in (logits, collected_states, collected_attentions) if item is not None)


# 使用 add_start_docstrings 装饰器为 VideoMAEForPreTraining 类添加文档字符串
@add_start_docstrings(
    "The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.",
    VIDEOMAE_START_DOCSTRING,
)
class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
    # 类定义部分省略
    # 初始化方法，接受一个配置对象作为参数
    def __init__(self, config):
        """Build the VideoMAE backbone, the encoder-to-decoder projection, the mask token and the decoder."""
        super().__init__(config)
        self.config = config

        self.videomae = VideoMAEModel(config)
        num_patches = self.videomae.embeddings.num_patches
        decoder_width = config.decoder_hidden_size

        # Bias-free projection from the encoder width to the decoder width.
        self.encoder_to_decoder = nn.Linear(config.hidden_size, decoder_width, bias=False)
        # Learnable mask token, initialized to zeros.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_width))
        # Position embeddings come from the sinusoid encoding table helper.
        self.position_embeddings = get_sinusoid_encoding_table(num_patches, decoder_width)

        self.decoder = VideoMAEDecoder(config, num_patches=num_patches)

        # Initialize weights and apply final processing
        self.post_init()

    # 将输入的视频像素值和掩码的位置信息作为输入，执行前向传播操作
    @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=VideoMAEForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: torch.BoolTensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# 定义 VideoMAEForVideoClassification 类，继承自 VideoMAEPreTrainedModel 类，用于视频分类任务的模型转换器
@add_start_docstrings(
    """VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
    states of all tokens) e.g. for ImageNet.""",
    VIDEOMAE_START_DOCSTRING,
)
class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
    def __init__(self, config):
        """Build the VideoMAE backbone plus the optional `fc_norm` and the classifier head."""
        super().__init__(config)

        self.num_labels = config.num_labels
        self.videomae = VideoMAEModel(config)

        # Classifier head: `fc_norm` only exists when mean pooling is enabled.
        if config.use_mean_pooling:
            self.fc_norm = nn.LayerNorm(config.hidden_size)
        else:
            self.fc_norm = None

        # Linear classifier, or identity when the config declares no labels.
        if config.num_labels > 0:
            self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        else:
            self.classifier = nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    # 定义前向传播函数
    @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

`.\models\videomae\__init__.py`

# 版权声明及许可证信息，声明代码版权及使用许可
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 引入类型检查模块中的 TYPE_CHECKING 类型
from typing import TYPE_CHECKING

# 引入依赖模块
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# Import structure handed to _LazyModule below: submodule name -> public names it provides.
_import_structure = {
    "configuration_videomae": ["VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "VideoMAEConfig"],
}

# The modeling names are only registered when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: register the modeling_videomae names.
    _import_structure["modeling_videomae"] = [
        "VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST",
        "VideoMAEForPreTraining",
        "VideoMAEModel",
        "VideoMAEPreTrainedModel",
        "VideoMAEForVideoClassification",
    ]

# The feature extractor and image processor are only registered when the vision dependency is available.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Vision is available: register feature_extraction_videomae and image_processing_videomae.
    _import_structure["feature_extraction_videomae"] = ["VideoMAEFeatureExtractor"]
    _import_structure["image_processing_videomae"] = ["VideoMAEImageProcessor"]

# Under static type checking, import the names directly instead of going through the lazy module.
if TYPE_CHECKING:
    # The configuration is imported unconditionally.
    from .configuration_videomae import VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP, VideoMAEConfig

    # Same torch guard as above; skipped silently when torch is unavailable.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Names provided by modeling_videomae.
        from .modeling_videomae import (
            VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST,
            VideoMAEForPreTraining,
            VideoMAEForVideoClassification,
            VideoMAEModel,
            VideoMAEPreTrainedModel,
        )

    # Same vision guard as above; skipped silently when vision is unavailable.
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Names provided by feature_extraction_videomae and image_processing_videomae.
        from .feature_extraction_videomae import VideoMAEFeatureExtractor
        from .image_processing_videomae import VideoMAEImageProcessor

# At runtime (not type checking)
else:
    import sys

    # Replace this module in sys.modules with a _LazyModule built from the import structure.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\vilt\configuration_vilt.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" VilT model configuration"""

# 导入所需模块和类
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取logger对象用于记录日志
logger = logging.get_logger(__name__)

# 预训练模型配置文件的映射字典，指定模型名称及其对应的配置文件URL
VILT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "dandelin/vilt-b32-mlm": "https://huggingface.co/dandelin/vilt-b32-mlm/blob/main/config.json"
}


class ViltConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ViltModel`]. It is used to instantiate an ViLT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the ViLT
    [dandelin/vilt-b32-mlm](https://huggingface.co/dandelin/vilt-b32-mlm) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 30522):
            Vocabulary size of the model.
        type_vocab_size (`int`, *optional*, defaults to 2):
            Size of the token type vocabulary.
        modality_type_vocab_size (`int`, *optional*, defaults to 2):
            Size of the modality type vocabulary.
        max_position_embeddings (`int`, *optional*, defaults to 40):
            Maximum length of the position embeddings.
        hidden_size (`int`, *optional*, defaults to 768):
            Size of the hidden layers.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Size of the intermediate layer.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            Activation function used in the hidden layers.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            Dropout probability of the hidden layers.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            Dropout probability of the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Initializer range of the model.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            Epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to 384):
            Size of the input images.
        patch_size (`int`, *optional*, defaults to 32):
            Size of each image patch.
        num_channels (`int`, *optional*, defaults to 3):
            Number of image channels.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether the queries, keys and values of the attention carry a bias.
        max_image_length (`int`, *optional*, defaults to -1):
            Maximum image length.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Forwarded as-is to [`PretrainedConfig`].
        num_images (`int`, *optional*, defaults to -1):
            Number of images handled by the model.

    Example:

    ```python
    >>> from transformers import ViltModel, ViltConfig

    >>> # Initializing a ViLT dandelin/vilt-b32-mlm style configuration
    >>> configuration = ViltConfig()

    >>> # Initializing a model from the dandelin/vilt-b32-mlm style configuration
    >>> model = ViltModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "vilt"

    def __init__(
        self,
        vocab_size=30522,
        type_vocab_size=2,
        modality_type_vocab_size=2,
        max_position_embeddings=40,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        image_size=384,
        patch_size=32,
        num_channels=3,
        qkv_bias=True,
        max_image_length=-1,
        tie_word_embeddings=False,
        num_images=-1,
        **kwargs,
    ):
        # `tie_word_embeddings` and any extra keyword arguments are handled by the base class.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

        # Vocabulary and position settings.
        self.vocab_size = vocab_size
        self.type_vocab_size = type_vocab_size
        self.modality_type_vocab_size = modality_type_vocab_size
        self.max_position_embeddings = max_position_embeddings

        # Transformer settings.
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps

        # Image settings.
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.qkv_bias = qkv_bias
        self.max_image_length = max_image_length
        self.num_images = num_images

`.\models\vilt\convert_vilt_original_to_pytorch.py`

# coding=utf-8
# 设置脚本的字符编码为UTF-8

# Copyright 2022 The HuggingFace Inc. team.
# 版权声明，指明代码的版权信息

# Licensed under the Apache License, Version 2.0 (the "License");
# 使用 Apache License, Version 2.0 许可证

# you may not use this file except in compliance with the License.
# 按照许可证要求，除非获得许可，否则不得使用此文件

# You may obtain a copy of the License at
# 可以在以下网址获取许可证的副本

#     http://www.apache.org/licenses/LICENSE-2.0
#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 无论是明示的还是隐含的，不附带任何形式的担保或条件

# See the License for the specific language governing permissions and
# 请查阅许可证，获取具体的使用权限和

# limitations under the License.
# 限制和条件

"""Convert ViLT checkpoints from the original Github repository."""

# 从原始的 Github 仓库中转换 ViLT 检查点

import argparse
# 导入 argparse 用于解析命令行参数

import json
# 导入 json 模块用于处理 JSON 数据

from pathlib import Path
# 从 pathlib 模块中导入 Path 类，用于处理文件路径

import requests
# 导入 requests 模块，用于发送 HTTP 请求

import torch
# 导入 torch 模块，用于 PyTorch 相关操作

from huggingface_hub import hf_hub_download
# 从 huggingface_hub 库中导入 hf_hub_download 函数，用于从 Hugging Face Hub 下载模型

from PIL import Image
# 从 PIL 库中导入 Image 模块，用于图像处理

from transformers import (
    BertTokenizer,
    ViltConfig,
    ViltForImageAndTextRetrieval,
    ViltForImagesAndTextClassification,
    ViltForMaskedLM,
    ViltForQuestionAnswering,
    ViltImageProcessor,
    ViltProcessor,
)
# 从 transformers 库中导入多个类和函数，用于加载和处理 ViLT 模型的不同配置和任务

from transformers.utils import logging
# 从 transformers.utils 中导入 logging 模块，用于设置日志信息

logging.set_verbosity_info()
# 设置日志记录级别为 info

logger = logging.get_logger(__name__)
# 获取当前模块的日志记录器

# here we list all keys to be renamed (original name on the left, our name on the right)
# 在此处列出需要重命名的所有键（左侧为原始名称，右侧为我们的名称）

def create_rename_keys(config, vqa_model=False, nlvr_model=False, irtr_model=False):
    """
    Build the list of `(original_name, converted_name)` pairs used to rename checkpoint entries.

    Args:
        config: object exposing `num_hidden_layers`.
        vqa_model: when truthy, append the `vqa_classifier.*` -> `classifier.*` pairs.
        nlvr_model: when truthy (and `vqa_model` is falsy), append the `nlvr2_classifier.*` -> `classifier.*` pairs.
        irtr_model: accepted but not used by this function.

    Returns:
        A list of 2-tuples: encoder layers first, then embeddings, final layernorm/pooler and classifier head.
    """
    # Per-layer modules as (name in the original block, name in the converted layer), in emission order.
    # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
    layer_modules = (
        ("norm1", "layernorm_before"),
        ("attn.proj", "attention.output.dense"),
        ("norm2", "layernorm_after"),
        ("mlp.fc1", "intermediate.dense"),
        ("mlp.fc2", "output.dense"),
    )

    rename_keys = []
    for i in range(config.num_hidden_layers):
        for original_module, converted_module in layer_modules:
            for param in ("weight", "bias"):
                rename_keys.append(
                    (
                        f"transformer.blocks.{i}.{original_module}.{param}",
                        f"vilt.encoder.layer.{i}.{converted_module}.{param}",
                    )
                )

    # embeddings
    text_entries = (
        "word_embeddings.weight",
        "position_embeddings.weight",
        "position_ids",
        "token_type_embeddings.weight",
        "LayerNorm.weight",
        "LayerNorm.bias",
    )
    # text embeddings
    rename_keys += [
        (f"text_embeddings.{entry}", f"vilt.embeddings.text_embeddings.{entry}") for entry in text_entries
    ]
    rename_keys += [
        # patch embeddings
        ("transformer.cls_token", "vilt.embeddings.cls_token"),
        ("transformer.patch_embed.proj.weight", "vilt.embeddings.patch_embeddings.projection.weight"),
        ("transformer.patch_embed.proj.bias", "vilt.embeddings.patch_embeddings.projection.bias"),
        ("transformer.pos_embed", "vilt.embeddings.position_embeddings"),
        # token type embeddings
        ("token_type_embeddings.weight", "vilt.embeddings.token_type_embeddings.weight"),
    ]

    # final layernorm + pooler
    rename_keys += [
        ("transformer.norm.weight", "vilt.layernorm.weight"),
        ("transformer.norm.bias", "vilt.layernorm.bias"),
        ("pooler.dense.weight", "vilt.pooler.dense.weight"),
        ("pooler.dense.bias", "vilt.pooler.dense.bias"),
    ]

    # classifier head(s): VQA takes precedence over NLVR2; other model types add nothing.
    if vqa_model:
        original_head = "vqa_classifier"
    elif nlvr_model:
        original_head = "nlvr2_classifier"
    else:
        original_head = None

    if original_head is not None:
        for index in (0, 1, 3):
            for param in ("weight", "bias"):
                rename_keys.append((f"{original_head}.{index}.{param}", f"classifier.{index}.{param}"))

    return rename_keys
# Split each encoder layer's fused qkv projection into separate query, key and value entries.
def read_in_q_k_v(state_dict, config):
    """
    Replace the fused `attn.qkv` weight and bias of every layer with separate query/key/value entries.

    `state_dict` is modified in place: the `transformer.blocks.{i}.attn.qkv.*` entries are popped and six
    `vilt.encoder.layer.{i}.attention.attention.*` entries are added per layer. The first `config.hidden_size`
    rows go to the query, the next `config.hidden_size` rows to the key and the last `config.hidden_size` rows
    to the value.
    """
    for i in range(config.num_hidden_layers):
        # read in weights + bias of input projection layer (in timm, this is a single matrix + bias)
        fused_weight = state_dict.pop(f"transformer.blocks.{i}.attn.qkv.weight")
        fused_bias = state_dict.pop(f"transformer.blocks.{i}.attn.qkv.bias")

        width = config.hidden_size
        # Row ranges of the fused projection, in the order the entries are written back.
        row_ranges = (
            ("query", slice(None, width)),
            ("key", slice(width, width * 2)),
            ("value", slice(-width, None)),
        )
        target_prefix = f"vilt.encoder.layer.{i}.attention.attention"
        for projection, rows in row_ranges:
            state_dict[f"{target_prefix}.{projection}.weight"] = fused_weight[rows, :]
            state_dict[f"{target_prefix}.{projection}.bias"] = fused_bias[rows]


# Drop the classification head weight and bias from the state dict.
def remove_classification_head_(state_dict):
    """Remove `head.weight` and `head.bias` from `state_dict` in place; absent keys are ignored."""
    state_dict.pop("head.weight", None)
    state_dict.pop("head.bias", None)


# Rename a key of a dictionary.
def rename_key(dct, old, new):
    """Move the value stored under `old` to `new` in place; raises `KeyError` when `old` is absent."""
    dct[new] = dct.pop(old)


# Convert an original ViLT checkpoint by copying/renaming its weights into our ViLT structure.
@torch.no_grad()
def convert_vilt_checkpoint(checkpoint_url, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our ViLT structure.

    The model class is selected from substrings of `checkpoint_url`, tested in this order: "vqa", "nlvr",
    "irtr", "mlm_itm". The checkpoint is downloaded, its keys are renamed, the weights are loaded into the
    selected model, a forward pass on an example input is checked against hard-coded expected values, and the
    model and processor are saved.

    Args:
        checkpoint_url: URL of the original checkpoint to convert.
        pytorch_dump_folder_path: directory the converted model and processor are saved to.

    Raises:
        ValueError: if `checkpoint_url` contains none of the recognised markers.
        AssertionError: if the converted model's outputs do not match the expected values.
    """

    # define configuration and initialize HuggingFace model
    config = ViltConfig(image_size=384, patch_size=32, tie_word_embeddings=False)
    mlm_model = False
    vqa_model = False
    nlvr_model = False
    irtr_model = False

    # Pick the model class from the checkpoint URL.
    if "vqa" in checkpoint_url:
        vqa_model = True
        config.num_labels = 3129
        repo_id = "huggingface/label-files"
        filename = "vqa2-id2label.json"
        # Use a context manager so the label file handle is closed after reading.
        with open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r", encoding="utf-8") as label_file:
            id2label = json.load(label_file)
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
        model = ViltForQuestionAnswering(config)
    elif "nlvr" in checkpoint_url:
        nlvr_model = True
        config.num_labels = 2
        config.id2label = {0: "False", 1: "True"}
        config.label2id = {v: k for k, v in config.id2label.items()}
        config.modality_type_vocab_size = 3
        model = ViltForImagesAndTextClassification(config)
    elif "irtr" in checkpoint_url:
        irtr_model = True
        model = ViltForImageAndTextRetrieval(config)
    elif "mlm_itm" in checkpoint_url:
        mlm_model = True
        model = ViltForMaskedLM(config)
    else:
        # None of the known markers matched.
        raise ValueError("Unknown model type")

    # load state_dict of original model, remove and rename some keys
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"]
    rename_keys = create_rename_keys(config, vqa_model, nlvr_model, irtr_model)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    # Split the fused qkv projections into separate query/key/value entries.
    read_in_q_k_v(state_dict, config)
    if mlm_model or irtr_model:
        # The ITM score head is dropped for these two model types.
        ignore_keys = ["itm_score.fc.weight", "itm_score.fc.bias"]
        for k in ignore_keys:
            state_dict.pop(k, None)

    # load state dict into HuggingFace model
    model.eval()
    if mlm_model:
        # Non-strict load: the only key allowed to be missing is the MLM decoder bias.
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        assert missing_keys == ["mlm_score.decoder.bias"]
    else:
        model.load_state_dict(state_dict)

    # Define processor
    image_processor = ViltImageProcessor(size=384)
    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
    processor = ViltProcessor(image_processor, tokenizer)

    # Forward pass on example inputs (image + text)
    if nlvr_model:
        # NLVR takes two images; the same example image is downloaded for both inputs.
        image1 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
        image2 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
        text = (
            "The left image contains twice the number of dogs as the right image, and at least two dogs in total are"
            " standing."
        )
        encoding_1 = processor(image1, text, return_tensors="pt")
        encoding_2 = processor(image2, text, return_tensors="pt")
        outputs = model(
            input_ids=encoding_1.input_ids,
            pixel_values=encoding_1.pixel_values,
            pixel_values_2=encoding_2.pixel_values,
        )
    else:
        image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
        if mlm_model:
            # Masked text for the MLM check.
            text = "a bunch of [MASK] laying on a [MASK]."
        else:
            text = "How many cats are there?"
        encoding = processor(image, text, return_tensors="pt")
        outputs = model(**encoding)

    # Verify outputs
    if mlm_model:
        expected_shape = torch.Size([1, 11, 30522])
        expected_slice = torch.tensor([-12.5061, -12.5123, -12.5174])
        assert outputs.logits.shape == expected_shape
        assert torch.allclose(outputs.logits[0, 0, :3], expected_slice, atol=1e-4)

        # verify masked token prediction equals "cats"
        predicted_id = outputs.logits[0, 4, :].argmax(-1).item()
        assert tokenizer.decode([predicted_id]) == "cats"
    elif vqa_model:
        expected_shape = torch.Size([1, 3129])
        expected_slice = torch.tensor([-15.9495, -18.1472, -10.3041])
        # The logits are 2-D here (see expected_shape), so they are indexed with two indices only.
        assert outputs.logits.shape == expected_shape
        assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)

        # verify vqa prediction equals "2"
        predicted_idx = outputs.logits.argmax(-1).item()
        assert model.config.id2label[predicted_idx] == "2"
    elif nlvr_model:
        expected_shape = torch.Size([1, 2])
        expected_slice = torch.tensor([-2.8721, 2.1291])
        # `[0, :3]` yields both logits of the single row, since there are only two labels.
        assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)
        assert outputs.logits.shape == expected_shape

    # Create the output directory if needed, then save the model and the processor there.
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model and processor to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
    processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
    # Command-line entry point: parse the two options and run the conversion.
    cli = argparse.ArgumentParser()

    # Both options have defaults; the output folder defaults to None.
    cli.add_argument(
        "--checkpoint_url",
        type=str,
        default="https://github.com/dandelin/ViLT/releases/download/200k/vilt_200k_mlm_itm.ckpt",
        help="URL of the checkpoint you'd like to convert.",
    )
    cli.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default=None,
        help="Path to the output PyTorch model directory.",
    )

    options = cli.parse_args()
    convert_vilt_checkpoint(options.checkpoint_url, options.pytorch_dump_folder_path)

`.\models\vilt\feature_extraction_vilt.py`

# 设置脚本的编码格式为 UTF-8
# 版权声明，版权归 HuggingFace Inc. 团队所有，保留所有权利
#
# 根据 Apache 许可证 2.0 版本授权：
# 您可以在符合许可证的情况下使用此文件。
# 您可以从以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件根据“原样”分发，
# 不附带任何明示或暗示的担保或条件。
# 请参阅许可证获取更多信息。
"""ViLT 的特征提取器类。"""

# 导入警告模块
import warnings

# 导入日志记录模块
from ...utils import logging
# 导入 ViLT 图像处理模块中的 ViltImageProcessor 类
from .image_processing_vilt import ViltImageProcessor

# 获取当前模块的日志记录器对象
logger = logging.get_logger(__name__)

# 定义 ViLT 特征提取器类，继承自 ViltImageProcessor 类
class ViltFeatureExtractor(ViltImageProcessor):
    """
    Deprecated alias of `ViltImageProcessor`.

    Instantiating this class emits a `FutureWarning` and otherwise behaves exactly like
    `ViltImageProcessor`: all positional and keyword arguments are forwarded unchanged.
    """

    def __init__(self, *args, **kwargs) -> None:
        # stacklevel=2 attributes the warning to the code that instantiates the deprecated
        # class rather than to this line, so users can locate the call they need to migrate.
        warnings.warn(
            "The class ViltFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
            " use ViltImageProcessor instead.",
            FutureWarning,
            stacklevel=2,
        )
        super().__init__(*args, **kwargs)

`.\models\vilt\image_processing_vilt.py`

# 设置文件编码为 UTF-8
# 版权声明，指明版权归 HuggingFace Inc. 团队所有
# 在 Apache 许可证 2.0 版本下许可使用本文件，详情请见 http://www.apache.org/licenses/LICENSE-2.0
# 如果不符合许可证要求，则不得使用本文件
# 该脚本从 Vision 能力可用性角度导入所需模块和函数
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np

# 导入图像处理相关工具和函数
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import PaddingMode, pad, resize, to_channel_dimension_format
from ...image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
# 导入日志工具
from ...utils import TensorType, is_vision_available, logging

# 如果 Vision 能力可用，则导入 PIL 模块
if is_vision_available():
    import PIL

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)


def max_across_indices(values: Iterable[Any]) -> List[Any]:
    """
    Return the element-wise maximum over a collection of sequences.

    Position `i` of the result is the largest value found at position `i` across all
    sequences in `values`. Sequences are aligned with `zip`, so the result is as long as
    the shortest sequence, and an empty `values` yields an empty list.
    """
    columns = zip(*values)
    return list(map(max, columns))


def make_pixel_mask(
    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
    """
    Build a pixel mask for an image, where 1 marks a valid pixel and 0 marks padding.

    Args:
        image (`np.ndarray`):
            Image the mask is built for; only its height and width are used.
        output_size (`Tuple[int, int]`):
            Shape of the returned mask.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            Forwarded to `get_image_size` as `channel_dim` when reading the image size.

    Returns:
        `np.ndarray`: An `int64` array of shape `output_size` whose top-left
        `height x width` region is 1 and whose remaining entries are 0.
    """
    valid_height, valid_width = get_image_size(image, channel_dim=input_data_format)

    # Start from an all-padding mask, then flag the area covered by the image.
    pixel_mask = np.zeros(output_size, dtype=np.int64)
    pixel_mask[:valid_height, :valid_width] = 1
    return pixel_mask


def get_max_height_width(
    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
    """
    Return the largest height and the largest width found across a batch of images.

    Args:
        images (`List[np.ndarray]`):
            Images to inspect. Each shape is unpacked into exactly three axes.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            Channel layout of the images. When omitted it is inferred from the first image.

    Returns:
        `(max_height, max_width)`. Note that the value is a tuple even though the signature
        is annotated as `List[int]`.

    Raises:
        ValueError: If the channel layout is neither channels-first nor channels-last.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])

    # Settle the layout before touching any shape so an unsupported format fails first.
    if input_data_format == ChannelDimension.FIRST:
        channels_first = True
    elif input_data_format == ChannelDimension.LAST:
        channels_first = False
    else:
        raise ValueError(f"Invalid channel dimension format: {input_data_format}")

    per_axis_max = max_across_indices([img.shape for img in images])
    if channels_first:
        _, max_height, max_width = per_axis_max
    else:
        max_height, max_width, _ = per_axis_max
    return (max_height, max_width)
# 定义函数以计算调整后图像的大小，确保长宽比例并且符合给定的尺寸要求
def get_resize_output_image_size(
    input_image: np.ndarray,
    shorter: int = 800,
    longer: int = 1333,
    size_divisor: int = 32,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Compute the target `(height, width)` for an aspect-ratio-preserving resize.

    The shorter side is scaled to `shorter`. If the longer side then exceeds `longer`, both
    sides are scaled down again so that the longer side equals `longer`. The result is
    rounded to the nearest integer and then floored to a multiple of `size_divisor`.

    Args:
        input_image (`np.ndarray`):
            Image whose current size drives the computation.
        shorter (`int`, defaults to 800):
            Target length of the shorter side.
        longer (`int`, defaults to 1333):
            Upper bound for the longer side.
        size_divisor (`int`, defaults to 32):
            Both output dimensions are multiples of this value.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            Passed to `get_image_size` when reading the input size.

    Returns:
        `Tuple[int, int]`: The new height and width.
    """
    height, width = get_image_size(input_image, input_data_format)

    # Scale so that the shorter side lands exactly on `shorter`.
    ratio = shorter / min(height, width)
    if height < width:
        target_height, target_width = shorter, ratio * width
    else:
        target_height, target_width = ratio * height, shorter

    # Shrink once more if the longer side overshoots the cap.
    longest_side = max(target_height, target_width)
    if longest_side > longer:
        shrink = longer / longest_side
        target_height, target_width = shrink * target_height, shrink * target_width

    def _snap(length):
        # Round half up to an integer, then floor to a multiple of `size_divisor`.
        rounded = int(length + 0.5)
        return rounded // size_divisor * size_divisor

    return _snap(target_height), _snap(target_width)


# ViltImageProcessor 类，继承自 BaseImageProcessor 类
class ViltImageProcessor(BaseImageProcessor):
    r"""
    构建 ViLT 图像处理器。

    """

    # 模型的输入名称列表
    model_input_names = ["pixel_values"]

    # 初始化方法
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        size_divisor: int = 32,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
        """
        Store the default preprocessing settings on the instance.

        Args:
            do_resize (`bool`, defaults to `True`):
                Whether to resize images.
            size (`Dict[str, int]`, *optional*):
                Target size; falls back to `{"shortest_edge": 384}` and is normalised with
                `get_size_dict(..., default_to_square=False)`.
            size_divisor (`int`, defaults to 32):
                Multiple that resized dimensions should conform to.
            resample (`PILImageResampling`, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter used when resizing.
            do_rescale (`bool`, defaults to `True`):
                Whether to rescale pixel values.
            rescale_factor (`int` or `float`, defaults to `1 / 255`):
                Factor used for rescaling.
            do_normalize (`bool`, defaults to `True`):
                Whether to normalize images.
            image_mean (`float` or `List[float]`, *optional*):
                Normalization mean; falls back to `IMAGENET_STANDARD_MEAN`.
            image_std (`float` or `List[float]`, *optional*):
                Normalization standard deviation; falls back to `IMAGENET_STANDARD_STD`.
            do_pad (`bool`, defaults to `True`):
                Whether to pad images. Overridden by the legacy keyword
                `pad_and_return_pixel_mask` when that keyword is supplied.
            **kwargs:
                Remaining keyword arguments, forwarded to the base class initializer.
        """
        # Legacy keyword: when present it takes precedence over `do_pad` and is removed so
        # that it is not forwarded to the base class.
        do_pad = kwargs.pop("pad_and_return_pixel_mask", do_pad)

        super().__init__(**kwargs)

        if size is None:
            size = {"shortest_edge": 384}
        self.size = get_size_dict(size, default_to_square=False)

        self.do_resize = do_resize
        self.size_divisor = size_divisor
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = IMAGENET_STANDARD_MEAN if image_mean is None else image_mean
        self.image_std = IMAGENET_STANDARD_STD if image_std is None else image_std
        self.do_pad = do_pad

        # Keyword names this processor recognises.
        # NOTE(review): presumably consumed for kwarg validation in `preprocess`, which is
        # not visible here - confirm.
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "size_divisor",
            "resample",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_pad",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    @classmethod
    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
        """
        Build an image processor from a dictionary, honouring the legacy
        `pad_and_return_pixel_mask` keyword.

        Overrides the base-class `from_dict` so that `pad_and_return_pixel_mask` passed as a
        keyword argument, e.g. via
        `ViltImageProcessor.from_pretrained(checkpoint, pad_and_return_pixel_mask=False)`,
        is moved into the dictionary before the base implementation runs.

        Args:
            image_processor_dict (`Dict[str, Any]`):
                Settings for the processor. A copy is modified; the caller's dictionary is
                left untouched.
            **kwargs:
                Extra keyword arguments, forwarded to the base-class `from_dict` after
                `pad_and_return_pixel_mask` has been removed.

        Returns:
            Whatever the base-class `from_dict` returns.
        """
        legacy_key = "pad_and_return_pixel_mask"
        processor_settings = image_processor_dict.copy()
        if legacy_key in kwargs:
            processor_settings[legacy_key] = kwargs.pop(legacy_key)
        return super().from_dict(processor_settings, **kwargs)

    # 调整图像大小的方法
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        size_divisor: int = 32,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image while keeping its aspect ratio.

        The shorter side is brought to `size["shortest_edge"]`. The longer side is capped at
        `int(1333 / 800 * size["shortest_edge"])`; if it would exceed that cap, the image is
        scaled down so the longer side matches the cap. The final dimensions are multiples
        of `size_divisor`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Output size specification of the form `{"shortest_edge": int}`.
            size_divisor (`int`, defaults to 32):
                The output height and width are multiples of this value.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter used for the resize.
            data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the output image, passed through to the resize function.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the input image, passed through to the size computation
                and to the resize function.
            **kwargs:
                Extra keyword arguments passed through to the resize function.

        Raises:
            ValueError: If `size` does not contain the key `shortest_edge`.
        """
        size = get_size_dict(size, default_to_square=False)
        if "shortest_edge" not in size:
            raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}")

        shortest_edge = size["shortest_edge"]
        # The cap on the longer side keeps the 1333:800 ratio relative to the shorter side.
        target_size = get_resize_output_image_size(
            image,
            shorter=shortest_edge,
            longer=int(1333 / 800 * shortest_edge),
            size_divisor=size_divisor,
            input_data_format=input_data_format,
        )

        # `resize` here is the module-level function, not this method.
        return resize(
            image,
            size=target_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
    # 使用 self 参数作为方法的第一个参数，表示该方法是类的一部分
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pad an image on its bottom and right edges up to `output_size`.

        Args:
            image (`np.ndarray`):
                Image to pad.
            output_size (`Tuple[int, int]`):
                Target `(height, width)` after padding.
            constant_values (`float` or `Iterable[float]`, defaults to 0):
                Fill value(s) for the padded area.
            data_format (`ChannelDimension`, *optional*):
                Channel layout of the output image, passed through to the padding function.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the input image, used when reading its size and passed
                through to the padding function.

        Returns:
            `np.ndarray`: The padded image.
        """
        current_height, current_width = get_image_size(image, channel_dim=input_data_format)
        target_height, target_width = output_size

        # Layout is ((top, bottom), (left, right)); only the bottom and right grow.
        bottom_right_padding = (
            (0, target_height - current_height),
            (0, target_width - current_width),
        )

        return pad(
            image,
            bottom_right_padding,
            mode=PaddingMode.CONSTANT,
            constant_values=constant_values,
            data_format=data_format,
            input_data_format=input_data_format,
        )

    # 使用 self 参数作为方法的第一个参数，表示该方法是类的一部分
    def pad(
        self,
        images: List[np.ndarray],
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
        """
        Pads a batch of images on the bottom and right to the largest height and width in the batch, and
        optionally returns the corresponding pixel masks.

        Args:
            images (`List[np.ndarray]`):
                Images to pad.
            constant_values (`float` or `Iterable[float]`, *optional*, defaults to 0):
                The value to use for the padded area.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
                Whether to return a pixel mask.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Returns:
            `BatchFeature`: Holds the padded images under `"pixel_values"` and, when `return_pixel_mask` is
            true, the masks under `"pixel_mask"`.
        """
        # Every image is padded to the largest height and width found in the batch.
        pad_size = get_max_height_width(images, input_data_format=input_data_format)

        padded_images = [
            self._pad_image(
                image,
                pad_size,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
            )
            for image in images
        ]
        data = {"pixel_values": padded_images}

        if return_pixel_mask:
            # Masks are built from the original (unpadded) images so that the padded area is marked 0.
            data["pixel_mask"] = [
                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
                for image in images
            ]

        return BatchFeature(data=data, tensor_type=return_tensors)
    # 对输入的图像数据进行预处理的方法
    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,  # 是否进行调整大小的标志
        size: Optional[Dict[str, int]] = None,  # 调整大小的目标尺寸
        size_divisor: Optional[int] = None,  # 调整大小的除数
        resample: PILImageResampling = None,  # 重采样方法
        do_rescale: Optional[bool] = None,  # 是否进行重新缩放的标志
        rescale_factor: Optional[float] = None,  # 重新缩放的因子
        do_normalize: Optional[bool] = None,  # 是否进行归一化的标志
        image_mean: Optional[Union[float, List[float]]] = None,  # 图像归一化的均值
        image_std: Optional[Union[float, List[float]]] = None,  # 图像归一化的标准差
        do_pad: Optional[bool] = None,  # 是否进行填充的标志
        return_tensors: Optional[Union[str, TensorType]] = None,  # 返回张量的格式
        data_format: ChannelDimension = ChannelDimension.FIRST,  # 数据格式，通道维度优先
        input_data_format: Optional[Union[str, ChannelDimension]] = None,  # 输入数据的格式
        **kwargs,  # 其它可选参数

Transformers-源码解析-一百一十五-

Transformers 源码解析（一百一十五）

.\models\unispeech_sat\__init__.py

.\models\univnet\configuration_univnet.py

.\models\univnet\convert_univnet.py

.\models\univnet\feature_extraction_univnet.py

.\models\univnet\modeling_univnet.py

.\models\univnet\__init__.py

.\models\upernet\configuration_upernet.py

.\models\upernet\convert_convnext_upernet_to_pytorch.py

.\models\upernet\convert_swin_upernet_to_pytorch.py

.\models\upernet\modeling_upernet.py

.\models\upernet\__init__.py

.\models\videomae\configuration_videomae.py

.\models\videomae\convert_videomae_to_pytorch.py

.\models\videomae\feature_extraction_videomae.py

.\models\videomae\image_processing_videomae.py

.\models\videomae\modeling_videomae.py

.\models\videomae\__init__.py

.\models\vilt\configuration_vilt.py

.\models\vilt\convert_vilt_original_to_pytorch.py

.\models\vilt\feature_extraction_vilt.py

.\models\vilt\image_processing_vilt.py

`.\models\unispeech_sat\__init__.py`

`.\models\univnet\configuration_univnet.py`

`.\models\univnet\convert_univnet.py`

`.\models\univnet\feature_extraction_univnet.py`

`.\models\univnet\modeling_univnet.py`

`.\models\univnet\__init__.py`

`.\models\upernet\configuration_upernet.py`

`.\models\upernet\convert_convnext_upernet_to_pytorch.py`

`.\models\upernet\convert_swin_upernet_to_pytorch.py`

`.\models\upernet\modeling_upernet.py`

`.\models\upernet\__init__.py`

`.\models\videomae\configuration_videomae.py`

`.\models\videomae\convert_videomae_to_pytorch.py`

`.\models\videomae\feature_extraction_videomae.py`

`.\models\videomae\image_processing_videomae.py`

`.\models\videomae\modeling_videomae.py`

`.\models\videomae\__init__.py`

`.\models\vilt\configuration_vilt.py`

`.\models\vilt\convert_vilt_original_to_pytorch.py`

`.\models\vilt\feature_extraction_vilt.py`

`.\models\vilt\image_processing_vilt.py`