Transformers Source Code Walkthrough (108)

.\models\swin\modeling_tf_swin.py

# coding=utf-8
# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" TF 2.0 Swin Transformer model."""

from __future__ import annotations

import collections.abc  # abstract base classes, used for iterable checks
import math  # math helpers
import warnings  # warning utilities
from dataclasses import dataclass  # decorator for data classes
from functools import partial  # partial function application
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union  # typing helpers

import tensorflow as tf  # TensorFlow

from ...activations_tf import ACT2FN  # mapping from activation names to TF activation functions
from ...modeling_tf_utils import (  # TF modeling utilities
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list  # helper returning static/dynamic tensor shapes
from ...utils import (  # shared utilities
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_swin import SwinConfig  # configuration class for Swin

logger = logging.get_logger(__name__)  # module-level logger

# General docstring constants
_CONFIG_FOR_DOC = "SwinConfig"  # configuration class referenced in docstrings
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"  # checkpoint referenced in docstrings
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]  # expected output shape used in docstring examples

# Image classification docstring constants
_IMAGE_CLASS_CHECKPOINT = "microsoft/swin-tiny-patch4-window7-224"  # image-classification checkpoint
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"  # expected image-classification output

# Archive of pretrained Swin checkpoints
TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/swin-tiny-patch4-window7-224",
    # See all Swin models at https://huggingface.co/models?filter=swin
]

# drop_path, TFSwinPatchEmbeddings, TFSwinPatchMerging and TFSwinDropPath are TensorFlow
# implementations of functionality in the PyTorch timm library.

@dataclass
class TFSwinEncoderOutput(ModelOutput):
    """
    Swin encoder's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    attentions: Tuple[tf.Tensor, ...] | None = None
    reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None


# Output dataclass for the Swin model, extending ModelOutput
@dataclass
class TFSwinModelOutput(ModelOutput):
    """
    Swin model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    # Fields holding the different parts of the model output
    last_hidden_state: tf.Tensor = None
    pooler_output: tf.Tensor | None = None
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    attentions: Tuple[tf.Tensor, ...] | None = None
    reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None


@dataclass
class TFSwinMaskedImageModelingOutput(ModelOutput):
    """
    Swin masked image model outputs.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
        reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    # Loss, reconstructed pixel values, hidden states, attentions and reshaped hidden states, all defaulting to None
    loss: tf.Tensor | None = None
    reconstruction: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    attentions: Tuple[tf.Tensor, ...] | None = None
    reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None

    @property
    def logits(self):
        # Warn that `logits` is deprecated and will be removed in Transformers v5; use `reconstruction` instead
        warnings.warn(
            "logits attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the reconstruction attribute to retrieve the final output instead.",
            FutureWarning,
        )
        # Return the reconstruction as the final output
        return self.reconstruction


@dataclass
class TFSwinImageClassifierOutput(ModelOutput):
    """
    Swin outputs for image classification.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    loss: tf.Tensor | None = None  # classification (or regression) loss, returned when `labels` is provided
    logits: tf.Tensor = None  # classification (or regression) scores before SoftMax, shape `(batch_size, config.num_labels)`
    hidden_states: Tuple[tf.Tensor, ...] | None = None  # per-layer hidden states plus the initial embedding output
    attentions: Tuple[tf.Tensor, ...] | None = None  # per-stage attention weights after the attention softmax
    reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None  # per-layer hidden states reshaped to include the spatial dimensions


def window_partition(input_feature: tf.Tensor, window_size: int) -> tf.Tensor:
    """
    Partitions the given input into windows.
    """
    batch_size, height, width, num_channels = shape_list(input_feature)  # unpack the input feature shape
    input_feature = tf.reshape(
        input_feature,
        (batch_size, height // window_size, window_size, width // window_size, window_size, num_channels),  # split height/width into window grids
    )
    windows = tf.transpose(input_feature, (0, 1, 3, 2, 4, 5))  # bring the two window axes next to each other
    windows = tf.reshape(windows, (-1, window_size, window_size, num_channels))  # flatten to (num_windows * batch, window, window, channels)
    return windows


def window_reverse(windows: tf.Tensor, window_size: int, height: int, width: int) -> tf.Tensor:
    """
    Merges windows to produce higher resolution features.
    """
    x = tf.shape(windows)[0]  # total number of windows across the batch
    y = tf.cast(height * width / (window_size * window_size), tf.int32)  # number of windows per sample
    batch_size = tf.math.floordiv(x, y)  # recover the batch size
    # Reshape the windows back into a per-sample grid of windows
    windows = tf.reshape(
        windows, (batch_size, height // window_size, width // window_size, window_size, window_size, -1)
    )
    # Transpose so the spatial axes line up again
    windows = tf.transpose(windows, (0, 1, 3, 2, 4, 5))
    # Merge the window grid back into the full feature map
    windows = tf.reshape(windows, (batch_size, height, width, -1))
    return windows
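

# --- Illustrative sketch (not part of the original file) --------------------
# A minimal check, assuming the two helpers above are in scope: partitioning a
# feature map whose height and width are exact multiples of `window_size` and
# then merging the windows back should reproduce the original tensor. The
# `example_*` names are ad-hoc.
example_feature = tf.random.uniform((2, 8, 8, 32))           # (batch, height, width, channels)
example_windows = window_partition(example_feature, window_size=4)
print(shape_list(example_windows))                            # [8, 4, 4, 32] -> 2 * (8/4) * (8/4) windows
example_restored = window_reverse(example_windows, window_size=4, height=8, width=8)
tf.debugging.assert_near(example_feature, example_restored)   # the round trip is lossless
# -----------------------------------------------------------------------------
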
def drop_path(
    input: tf.Tensor, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
) -> tf.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """
    # If there is nothing to drop or we are not training, return the input unchanged
    if drop_prob == 0.0 or not training:
        return input
    # Probability of keeping a sample
    keep_prob = 1 - drop_prob
    # Build a per-sample random tensor broadcastable over all remaining dims
    input_shape = shape_list(input)
    ndim = len(input_shape)
    shape = [input_shape[0]] + [1] * (ndim - 1)  # works with tensors of any rank, not only 2D conv activations
    random_tensor = tf.random.uniform(shape)
    # Keep a sample (value 1.0) when its draw is <= keep_prob, otherwise drop it (value 0.0)
    random_tensor = tf.where(random_tensor <= keep_prob, 1.0, 0.0)
    # Optionally rescale the kept samples so the expected value stays unchanged
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor /= keep_prob
    # Apply the per-sample mask
    return input * random_tensor
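

# --- Illustrative sketch (not part of the original file) --------------------
# `drop_path` zeroes out whole samples with probability `drop_prob` during
# training and rescales the kept samples by 1 / keep_prob, so the expected
# activation magnitude is preserved; at inference it is the identity. The
# `example_input` name is ad-hoc.
example_input = tf.ones((4, 3, 16))
print(drop_path(example_input, drop_prob=0.5, training=False))     # returned unchanged
dropped = drop_path(example_input, drop_prob=0.5, training=True)
# each of the 4 samples is now either all zeros or all 2.0 (= 1 / keep_prob)
print(tf.reduce_max(tf.reshape(dropped, (4, -1)), axis=-1))
# -----------------------------------------------------------------------------
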


class TFSwinEmbeddings(keras.layers.Layer):
    """
    Construct the patch and position embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: SwinConfig, use_mask_token: bool = False, **kwargs) -> None:
        super().__init__(**kwargs)
        # Patch and position embeddings
        self.patch_embeddings = TFSwinPatchEmbeddings(config, name="patch_embeddings")
        # Number of patches and the patch grid size
        self.num_patches = self.patch_embeddings.num_patches
        self.patch_grid = self.patch_embeddings.grid_size
        self.embed_dim = config.embed_dim
        self.use_mask_token = use_mask_token
        self.use_absolute_embeddings = config.use_absolute_embeddings

        # Layer normalization
        self.norm = keras.layers.LayerNormalization(name="norm", epsilon=1e-5)
        # Dropout
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
        self.config = config

    def build(self, input_shape: tf.TensorShape) -> None:
        # Add the mask token weight if mask tokens are used
        if self.use_mask_token:
            self.mask_token = self.add_weight(shape=(1, 1, self.embed_dim), initializer="zeros", name="mask_token")
        else:
            self.mask_token = None

        # Add absolute position embeddings if enabled
        if self.use_absolute_embeddings:
            self.position_embeddings = self.add_weight(
                (1, self.num_patches + 1, self.embed_dim), initializer="zeros", name="positional_embeddings"
            )
        else:
            self.position_embeddings = None

        # Nothing else to do if the layer is already built
        if self.built:
            return
        self.built = True
        # Build the patch embedding, layer norm and dropout sub-layers
        if getattr(self, "patch_embeddings", None) is not None:
            with tf.name_scope(self.patch_embeddings.name):
                self.patch_embeddings.build(None)
        if getattr(self, "norm", None) is not None:
            with tf.name_scope(self.norm.name):
                self.norm.build([None, None, self.config.embed_dim])
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)

    def call(
        self, pixel_values: tf.Tensor, bool_masked_pos: bool = None, training: bool = False
    ) -> Tuple[tf.Tensor, Tuple[int, int]]:
        # Compute the patch embeddings and the resulting spatial dimensions
        embeddings, output_dimensions = self.patch_embeddings(pixel_values, training=training)

        # Normalize the embeddings
        embeddings = self.norm(embeddings, training=training)

        # Unpack the embedding shape
        batch_size, seq_len, _ = shape_list(embeddings)

        # Replace masked positions with the mask token, if a mask is given
        if bool_masked_pos is not None:
            mask_tokens = tf.repeat(self.mask_token, batch_size, 0)
            mask_tokens = tf.repeat(mask_tokens, seq_len, 1)
            # Broadcast the boolean mask over the hidden dimension
            mask = tf.expand_dims(bool_masked_pos, -1)
            mask = tf.cast(mask, mask_tokens.dtype)

            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # Add absolute position embeddings if they exist
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings

        # Apply dropout
        embeddings = self.dropout(embeddings, training=training)

        # Return the embeddings and the spatial output dimensions
        return embeddings, output_dimensions


class TFSwinPatchEmbeddings(keras.layers.Layer):
    """
    Image to Patch Embedding.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        # Image size and patch size from the configuration
        image_size, patch_size = config.image_size, config.patch_size
        # Number of input channels and embedding dimension
        num_channels, hidden_size = config.num_channels, config.embed_dim
        # Normalize image size and patch size to (height, width) tuples
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        # Total number of patches
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        # Projection layer: a Conv2D mapping each patch to the hidden dimension
        self.projection = keras.layers.Conv2D(
            filters=hidden_size,
            kernel_size=self.patch_size,
            strides=self.patch_size,
            padding="valid",
            name="projection",
        )

    def maybe_pad(self, pixel_values: tf.Tensor, height: int, width: int) -> tf.Tensor:
        # Pad the width up to a multiple of the patch width
        if width % self.patch_size[1] != 0:
            pad_values = ((0, 0), (0, 0), (0, 0), (0, self.patch_size[1] - width % self.patch_size[1]))
            pixel_values = tf.pad(pixel_values, pad_values)
        # Pad the height up to a multiple of the patch height
        if height % self.patch_size[0] != 0:
            pad_values = ((0, 0), (0, 0), (0, self.patch_size[0] - height % self.patch_size[0]), (0, 0))
            pixel_values = tf.pad(pixel_values, pad_values)
        return pixel_values

    def call(self, pixel_values: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor, Tuple[int, int]]:
        # Unpack the input shape (inputs are channels-first)
        _, num_channels, height, width = shape_list(pixel_values)
        # When executing eagerly, check that the channel dimension matches the configuration
        if tf.executing_eagerly() and num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # Pad the input, if needed, so it is divisible by self.patch_size
        pixel_values = self.maybe_pad(pixel_values, height, width)

        # B,C,H,W -> B,H,W,C
        pixel_values = tf.transpose(pixel_values, (0, 2, 3, 1))

        # Project each patch to the hidden dimension
        embeddings = self.projection(pixel_values, training=training)

        # B,H,W,C -> B,C,H,W
        embeddings = tf.transpose(embeddings, (0, 3, 1, 2))

        # Record the spatial dimensions of the patch grid
        batch_size, channels, height, width = shape_list(embeddings)
        output_dimensions = (height, width)

        # Flatten the grid to B,N,C where N is the number of patches
        embeddings = tf.reshape(embeddings, (batch_size, channels, -1))
        embeddings = tf.transpose(embeddings, (0, 2, 1))
        return embeddings, output_dimensions
    def build(self, input_shape=None):
        # Nothing to do if the layer is already built
        if self.built:
            return
        self.built = True
        # Build the projection layer under its own name scope, with the channel count as the last input dim
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, None, self.num_channels])
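
# --- Illustrative sketch (not part of the original file) --------------------
# Back-of-the-envelope geometry for the documented checkpoint (image_size=224,
# patch_size=4, embed_dim=96): the patch embedding yields a 56x56 grid of 3136
# patches, and three later patch-merging steps reduce it to 7x7 = 49 tokens of
# width 768, matching _EXPECTED_OUTPUT_SHAPE above.
image_size, patch_size, embed_dim = 224, 4, 96
grid = image_size // patch_size                        # 56
print(grid * grid)                                     # 3136 patches out of TFSwinPatchEmbeddings
print((grid // 2**3) ** 2, embed_dim * 2**3)           # 49 tokens of hidden size 768 after 3 merges
# -----------------------------------------------------------------------------
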
class TFSwinPatchMerging(keras.layers.Layer):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`keras.layer.Layer`, *optional*, defaults to `keras.layers.LayerNormalization`):
            Normalization layer class.
    """

    def __init__(
        self, input_resolution: Tuple[int, int], dim: int, norm_layer: Optional[Callable] = None, **kwargs
    ) -> None:
        super().__init__(**kwargs)
        self.input_resolution = input_resolution  # resolution of the input feature map
        self.dim = dim  # number of input channels
        self.reduction = keras.layers.Dense(2 * dim, use_bias=False, name="reduction")  # linear layer projecting 4*dim -> 2*dim
        if norm_layer is None:
            # Default to LayerNormalization with the same epsilon as the PyTorch implementation
            self.norm = keras.layers.LayerNormalization(epsilon=1e-5, name="norm")
        else:
            self.norm = norm_layer(name="norm")  # use the provided normalization layer

    def maybe_pad(self, input_feature: tf.Tensor, height: int, width: int) -> tf.Tensor:
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = ((0, 0), (0, height % 2), (0, width % 2), (0, 0))  # pad bottom/right so height and width become even
            input_feature = tf.pad(input_feature, pad_values)

        return input_feature

    def call(self, input_feature: tf.Tensor, input_dimensions: Tuple[int, int], training: bool = False) -> tf.Tensor:
        height, width = input_dimensions
        batch_size, _, num_channels = shape_list(input_feature)  # unpack the input feature shape

        input_feature = tf.reshape(input_feature, (batch_size, height, width, num_channels))  # restore the spatial layout
        input_feature = self.maybe_pad(input_feature, height, width)  # pad so height and width are divisible by 2
        input_feature_0 = input_feature[:, 0::2, 0::2, :]  # top-left pixel of each 2x2 block
        input_feature_1 = input_feature[:, 1::2, 0::2, :]  # bottom-left pixel of each 2x2 block
        input_feature_2 = input_feature[:, 0::2, 1::2, :]  # top-right pixel of each 2x2 block
        input_feature_3 = input_feature[:, 1::2, 1::2, :]  # bottom-right pixel of each 2x2 block
        input_feature = tf.concat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)  # concatenate along channels
        input_feature = tf.reshape(
            input_feature, (batch_size, -1, 4 * num_channels)
        )  # flatten back to (batch, tokens, 4 * channels)

        input_feature = self.norm(input_feature, training=training)  # normalize
        input_feature = self.reduction(input_feature, training=training)  # project down to 2 * dim

        return input_feature
    def build(self, input_shape=None):
        # Nothing to do if the layer is already built
        if self.built:
            return
        self.built = True

        # Build the reduction layer under its own name scope with a 4 * dim input
        if getattr(self, "reduction", None) is not None:
            with tf.name_scope(self.reduction.name):
                self.reduction.build([None, None, 4 * self.dim])

        # Build the norm layer under its own name scope with a 4 * dim input
        if getattr(self, "norm", None) is not None:
            with tf.name_scope(self.norm.name):
                self.norm.build([None, None, 4 * self.dim])
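

# --- Illustrative sketch (not part of the original file) --------------------
# Patch merging concatenates the four pixels of every 2x2 neighbourhood on the
# channel axis (dim -> 4 * dim) and projects back to 2 * dim, so each stage
# halves the spatial resolution while doubling the channel width. A rough
# usage sketch, assuming a (56, 56) input grid with dim=96; names are ad-hoc.
merge_layer = TFSwinPatchMerging(input_resolution=(56, 56), dim=96)
tokens = tf.random.uniform((1, 56 * 56, 96))           # (batch, height * width, dim)
merged = merge_layer(tokens, input_dimensions=(56, 56))
print(shape_list(merged))                              # [1, 784, 192] == (1, 28 * 28, 2 * dim)
# -----------------------------------------------------------------------------
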
class TFSwinDropPath(keras.layers.Layer):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: float = None, scale_by_keep: bool = True, **kwargs) -> None:
        super(TFSwinDropPath, self).__init__(**kwargs)
        self.drop_prob = drop_prob  # probability of dropping a sample
        self.scale_by_keep = scale_by_keep  # whether to rescale kept samples by 1 / keep_prob

    def call(self, input: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Delegate to the functional drop_path defined above
        return drop_path(input, self.drop_prob, training, self.scale_by_keep)


class TFSwinSelfAttention(keras.layers.Layer):
    def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None:
        super().__init__(**kwargs)
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads  # number of attention heads
        self.attention_head_size = int(dim / num_heads)  # size of each attention head
        self.all_head_size = self.num_attention_heads * self.attention_head_size  # total size of the Q/K/V projections
        window_size = config.window_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )  # window size as a (height, width) tuple

        self.query = keras.layers.Dense(
            self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            use_bias=config.qkv_bias,
            name="query",
        )  # query projection

        self.key = keras.layers.Dense(
            self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            use_bias=config.qkv_bias,
            name="key",
        )  # key projection

        self.value = keras.layers.Dense(
            self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            use_bias=config.qkv_bias,
            name="value",
        )  # value projection

        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)  # dropout on the attention probabilities
    def build(self, input_shape: tf.TensorShape) -> None:
        # Table of learnable relative position biases: one row per relative offset, one column per head
        self.relative_position_bias_table = self.add_weight(
            shape=(((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1)), self.num_attention_heads),
            initializer="zeros",
            name="relative_position_bias_table",
        )
        # Non-trainable index mapping each pair of positions inside a window to a row of the bias table
        self.relative_position_index = self.add_weight(
            shape=(self.window_size[0] ** 2, self.window_size[1] ** 2),
            trainable=False,
            dtype=tf.int32,
            name="relative_position_index",
        )

        # Pairwise relative position indices for the tokens inside a window
        coords_h = tf.range(self.window_size[0])
        coords_w = tf.range(self.window_size[1])
        coords = tf.stack(tf.meshgrid(coords_h, coords_w, indexing="ij"))
        coords_flatten = tf.reshape(coords, (shape_list(coords)[0], -1))
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = tf.transpose(relative_coords, (1, 2, 0))

        stack_0, stack_1 = tf.unstack(relative_coords, axis=2)
        stack_0 += self.window_size[0] - 1
        stack_0 *= 2 * self.window_size[1] - 1
        stack_1 += self.window_size[1] - 1
        relative_coords = tf.stack([stack_0, stack_1], axis=2)

        # Sum the two shifted offsets into a single flat index per position pair
        self.relative_position_index.assign(tf.cast(tf.reduce_sum(relative_coords, axis=-1), tf.int32))

        # Nothing else to do if the layer is already built
        if self.built:
            return
        self.built = True
        # Build the query, key and value projections
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.all_head_size])
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.all_head_size])
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                self.value.build([None, None, self.all_head_size])

    def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor:
        # Reshape to (batch, heads, tokens, head_size) for the attention score computation
        new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size]
        x = tf.reshape(x, new_x_shape)
        return tf.transpose(x, (0, 2, 1, 3))

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        output_attentions: bool = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor, ...]:
        # Unpack the hidden state shape: batch size and number of tokens per window
        batch_size, dim, _ = shape_list(hidden_states)
        # Project the hidden states to queries
        mixed_query_layer = self.query(hidden_states)

        # Project to keys and values and reshape for per-head attention scores
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        # Reshape the queries the same way
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Raw attention scores: dot product between queries and keys
        attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, (0, 1, 3, 2)))

        # Scale the scores so the softmax stays numerically well behaved
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Look up the relative position bias for every pair of positions in the window
        relative_position_bias = tf.gather(
            self.relative_position_bias_table, tf.reshape(self.relative_position_index, (-1,))
        )
        # Reshape the bias to match the attention score layout
        relative_position_bias = tf.reshape(
            relative_position_bias,
            (self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1),
        )
        # Move the head dimension first and add the bias to the scores
        relative_position_bias = tf.transpose(relative_position_bias, (2, 0, 1))
        attention_scores = attention_scores + tf.expand_dims(relative_position_bias, 0)

        # Apply the attention mask produced for shifted windows, if any
        if attention_mask is not None:
            # Number of distinct masks (one per window layout)
            mask_shape = shape_list(attention_mask)[0]
            # Reshape the scores so the mask can be broadcast over them
            attention_scores = tf.reshape(
                attention_scores, (batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim)
            )
            # Expand the mask to match the score dimensions
            attention_mask = tf.expand_dims(attention_mask, 1)
            attention_mask = tf.expand_dims(attention_mask, 0)
            # Add the mask; large negative values suppress cross-region attention
            attention_scores = attention_scores + attention_mask
            # Collapse back to the original score shape
            attention_scores = tf.reshape(attention_scores, (-1, self.num_attention_heads, dim, dim))

        # Normalize the scores to attention probabilities
        attention_probs = tf.nn.softmax(attention_scores, axis=-1)

        # Apply dropout to the attention probabilities (only active during training)
        attention_probs = self.dropout(attention_probs, training=training)

        # Apply the head mask, if one was provided
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of the values
        context_layer = tf.matmul(attention_probs, value_layer)
        # Move the head dimension back next to the head size
        context_layer = tf.transpose(context_layer, (0, 2, 1, 3))
        # Merge all heads back into a single hidden dimension
        new_context_layer_shape = shape_list(context_layer)[:-2] + [
            self.all_head_size,
        ]
        context_layer = tf.reshape(context_layer, new_context_layer_shape)

        # Return the context layer and, optionally, the attention probabilities
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
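
# --- Illustrative sketch (not part of the original file) --------------------
# build() above maps every pair of positions inside a window to one row of
# relative_position_bias_table. Re-deriving that index for a toy 2x2 window:
# the relative offsets (dh, dw) in [-1, 1] map to (dh + 1) * 3 + (dw + 1),
# giving 9 distinct buckets shared by all heads. The `toy_*` names are ad-hoc.
toy_window = 2
toy_coords = tf.stack(tf.meshgrid(tf.range(toy_window), tf.range(toy_window), indexing="ij"))
toy_flat = tf.reshape(toy_coords, (2, -1))
toy_rel = tf.transpose(toy_flat[:, :, None] - toy_flat[:, None, :], (1, 2, 0))
toy_index = (toy_rel[..., 0] + toy_window - 1) * (2 * toy_window - 1) + (toy_rel[..., 1] + toy_window - 1)
print(toy_index.numpy())   # 4x4 matrix with values in [0, 8]; the diagonal (zero offset) is 4
# -----------------------------------------------------------------------------
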
# Output projection applied after self-attention
class TFSwinSelfOutput(keras.layers.Layer):
    def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Dense layer projecting the attention output back to `dim`
        self.dense = keras.layers.Dense(dim, name="dense")
        # Dropout applied after the projection
        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout")
        self.dim = dim

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Project and apply dropout
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        return hidden_states

    def build(self, input_shape=None):
        # Nothing to do if the layer is already built
        if self.built:
            return
        self.built = True
        # Build the dense and dropout sub-layers
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.dim])
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)


# Window attention block: self-attention followed by the output projection
class TFSwinAttention(keras.layers.Layer):
    def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Self-attention over the tokens of a window
        self.self = TFSwinSelfAttention(config, dim, num_heads, name="self")
        # Projection applied to the attention output
        self.self_output = TFSwinSelfOutput(config, dim, name="output")
        # Set of attention heads to prune (pruning is not implemented for TF)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """
        Prunes heads of the model. See base class PreTrainedModel heads: dict of {layer_num: list of heads to prune in
        this layer}
        """
        raise NotImplementedError

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        output_attentions: bool = False,
        training: bool = False,
    ) -> tf.Tensor:
        # Run self-attention over the windowed hidden states
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions, training=training)
        # Project the attention output
        attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
        # Append the attention probabilities when they are requested
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs

    def build(self, input_shape=None):
        # Nothing to do if the layer is already built
        if self.built:
            return
        self.built = True
        # Build the self-attention and output sub-layers
        if getattr(self, "self", None) is not None:
            with tf.name_scope(self.self.name):
                self.self.build(None)
        if getattr(self, "self_output", None) is not None:
            with tf.name_scope(self.self_output.name):
                self.self_output.build(None)


# Feed-forward expansion: a Dense layer followed by the configured activation
class TFSwinIntermediate(keras.layers.Layer):
    def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Dense layer expanding the hidden size to mlp_ratio * dim
        self.dense = keras.layers.Dense(int(config.mlp_ratio * dim), name="dense")

        # Resolve the intermediate activation from the configuration
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        self.dim = dim

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        # Expand and apply the activation
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        # Nothing to do if the layer is already built
        if self.built:
            return
        self.built = True

        # Build the dense layer with an input of size `dim`
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.dim])
# Feed-forward projection back to `dim`, with dropout
class TFSwinOutput(keras.layers.Layer):
    def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Dense layer projecting the expanded hidden size back down to `dim`
        self.dense = keras.layers.Dense(dim, name="dense")
        # Dropout using the configured hidden dropout probability
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
        self.config = config
        self.dim = dim

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Project and apply dropout
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        return hidden_states

    def build(self, input_shape=None):
        # Nothing to do if the layer is already built
        if self.built:
            return
        self.built = True
        # Build the dense layer; its input size is the expanded hidden size mlp_ratio * dim
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, int(self.config.mlp_ratio * self.dim)])


# One Swin block: (shifted-)window attention plus a feed-forward MLP, both with residual connections
class TFSwinLayer(keras.layers.Layer):
    def __init__(
        self, config, dim, input_resolution: Tuple[int, int], num_heads: int, shift_size: int = 0, **kwargs
    ) -> None:
        super().__init__(**kwargs)
        # Chunk size for the feed-forward block
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Smallest side of the input resolution
        min_res = tf.reduce_min(input_resolution)
        # Use the whole feature map as a single window when it is smaller than the configured window size
        self.window_size = min_res if min_res <= config.window_size else config.window_size
        # Disable shifting when windowing is disabled, otherwise keep the requested shift
        self.shift_size = 0 if min_res <= self.window_size else shift_size
        self.input_resolution = input_resolution

        # LayerNorm applied before the attention block
        self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
        # Window attention
        self.attention = TFSwinAttention(config, dim, num_heads, name="attention")
        # Stochastic depth on the residual branch, or a no-op linear activation when the rate is 0
        self.drop_path = (
            TFSwinDropPath(config.drop_path_rate, name="drop_path")
            if config.drop_path_rate > 0.0
            else keras.layers.Activation("linear", name="drop_path")
        )
        # LayerNorm applied before the MLP block
        self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
        # Feed-forward expansion and projection
        self.intermediate = TFSwinIntermediate(config, dim, name="intermediate")
        self.swin_output = TFSwinOutput(config, dim, name="output")
        self.dim = dim
    def get_attn_mask(self, height: int, width: int, window_size: int, shift_size: int) -> tf.Tensor | None:
        # Start from an all-zero region map of shape (height, width)
        img_mask = tf.zeros((height, width))
        # Height and width slices delimiting the regions created by the cyclic shift
        height_slices = ((0, -window_size), (-window_size, -shift_size), (-shift_size, -1))
        width_slices = ((0, -window_size), (-window_size, -shift_size), (-shift_size, -1))

        # Compute the attention mask for SW-MSA
        if shift_size > 0:
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    # Indices covered by the current (height, width) slice pair
                    height_inds = tf.range(height_slice[0] % height, height_slice[1] % height + 1)
                    width_inds = tf.range(width_slice[0] % width, width_slice[1] % width + 1)
                    indices = tf.reshape(tf.stack(tf.meshgrid(height_inds, width_inds), axis=-1), (-1, 2))
                    if len(indices) >= 1:
                        # Label this region of the map with the current count
                        updates = tf.ones((len(indices),), dtype=img_mask.dtype) * count
                        img_mask = tf.tensor_scatter_nd_update(img_mask, indices, updates)
                    count += 1

        # Add batch and channel dimensions so the map can be window-partitioned
        img_mask = tf.expand_dims(img_mask, -1)
        img_mask = tf.expand_dims(img_mask, 0)

        # Partition the region map into windows
        mask_windows = window_partition(img_mask, window_size)
        mask_windows = tf.reshape(mask_windows, (-1, window_size * window_size))
        # Pairs of positions from different regions get -100.0 (masked out), same-region pairs get 0.0
        attn_mask = tf.expand_dims(mask_windows, 1) - tf.expand_dims(mask_windows, 2)
        attn_mask = tf.where(attn_mask != 0, float(-100.0), attn_mask)
        attn_mask = tf.where(attn_mask == 0, float(0.0), attn_mask)
        return attn_mask

    def maybe_pad(
        self, hidden_states: tf.Tensor, window_size: int, height: int, width: int
    ) -> Tuple[tf.Tensor, tf.Tensor]:
        # Pixels to add on the right and at the bottom to reach a multiple of window_size
        pad_right = (window_size - width % window_size) % window_size
        pad_bottom = (window_size - height % window_size) % window_size
        # Pad only the spatial dimensions
        pad_values = [[0, 0], [0, pad_bottom], [0, pad_right], [0, 0]]
        hidden_states = tf.pad(hidden_states, pad_values)
        # Also return the padding amounts as a flat tensor
        pad_values = tf.reshape(pad_values, (-1,))
        return hidden_states, pad_values

    def call(
        self,
        hidden_states: tf.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: tf.Tensor | None = None,
        output_attentions: bool = False,
        training: bool = False,
    ) -> tf.Tensor:
        # If the window size is larger than the input resolution, do not partition into windows
        min_res = tf.reduce_min(input_dimensions)  # smallest side of the current feature map
        shift_size = 0 if min_res <= self.window_size else self.shift_size  # disable shifting for a single window
        window_size = min_res if min_res <= self.window_size else self.window_size  # clamp the window to the feature map

        height, width = input_dimensions
        batch_size, _, channels = shape_list(hidden_states)
        shortcut = hidden_states  # keep the input for the residual connection

        hidden_states = self.layernorm_before(hidden_states, training=training)  # pre-attention LayerNorm
        hidden_states = tf.reshape(hidden_states, (batch_size, height, width, channels))  # restore the spatial layout
        hidden_states, pad_values = self.maybe_pad(hidden_states, window_size, height, width)  # pad to a multiple of the window size

        _, height_pad, width_pad, _ = shape_list(hidden_states)
        # Cyclic shift
        if shift_size > 0:
            shifted_hidden_states = tf.roll(hidden_states, shift=(-shift_size, -shift_size), axis=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # Partition into windows
        hidden_states_windows = window_partition(shifted_hidden_states, window_size)
        hidden_states_windows = tf.reshape(hidden_states_windows, (-1, window_size * window_size, channels))
        attn_mask = self.get_attn_mask(
            height=height_pad, width=width_pad, window_size=window_size, shift_size=shift_size
        )  # attention mask for shifted windows

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions, training=training
        )  # windowed self-attention

        attention_output = attention_outputs[0]

        attention_windows = tf.reshape(attention_output, (-1, window_size, window_size, channels))
        shifted_windows = window_reverse(attention_windows, window_size, height_pad, width_pad)  # merge windows back

        # Reverse the cyclic shift
        if shift_size > 0:
            attention_windows = tf.roll(shifted_windows, shift=(shift_size, shift_size), axis=(1, 2))
        else:
            attention_windows = shifted_windows

        was_padded = pad_values[3] > 0 or pad_values[5] > 0  # was the feature map padded above?
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :]  # crop the padding away

        attention_windows = tf.reshape(attention_windows, (batch_size, height * width, channels))

        hidden_states = shortcut + self.drop_path(attention_windows, training=training)  # residual connection with stochastic depth

        layer_output = self.layernorm_after(hidden_states, training=training)  # pre-MLP LayerNorm
        layer_output = self.intermediate(layer_output)  # feed-forward expansion
        layer_output = hidden_states + self.swin_output(layer_output, training=training)  # second residual connection

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)

        return layer_outputs
    def build(self, input_shape=None):
        # Nothing to do if the layer is already built
        if self.built:
            return
        self.built = True

        # Build each sub-layer under its own name scope; the LayerNorms need the channel width, the rest infer their shapes
        if getattr(self, "layernorm_before", None) is not None:
            with tf.name_scope(self.layernorm_before.name):
                self.layernorm_before.build([None, None, self.dim])
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        if getattr(self, "drop_path", None) is not None:
            with tf.name_scope(self.drop_path.name):
                self.drop_path.build(None)
        if getattr(self, "layernorm_after", None) is not None:
            with tf.name_scope(self.layernorm_after.name):
                self.layernorm_after.build([None, None, self.dim])
        if getattr(self, "intermediate", None) is not None:
            with tf.name_scope(self.intermediate.name):
                self.intermediate.build(None)
        if getattr(self, "swin_output", None) is not None:
            with tf.name_scope(self.swin_output.name):
                self.swin_output.build(None)
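

# --- Illustrative sketch (not part of the original file) --------------------
# With a non-zero shift, get_attn_mask() labels every position of the (padded)
# feature map with a region id, partitions the ids into windows and adds
# -100.0 wherever two positions of the same window come from different
# regions, so the softmax effectively ignores those pairs. A rough usage
# sketch, assuming a default SwinConfig() is constructible here; names are ad-hoc.
toy_layer = TFSwinLayer(SwinConfig(), dim=96, input_resolution=(8, 8), num_heads=3)
toy_mask = toy_layer.get_attn_mask(height=8, width=8, window_size=4, shift_size=2)
print(shape_list(toy_mask))                              # [4, 16, 16]: one additive mask per window
print(tf.reduce_max(toy_mask), tf.reduce_min(toy_mask))  # 0.0 and -100.0
# -----------------------------------------------------------------------------
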
class TFSwinStage(keras.layers.Layer):
    # One Swin stage: a stack of TFSwinLayer blocks followed by an optional patch-merging downsample
    def __init__(
        self,
        config: SwinConfig,
        dim: int,
        input_resolution: Tuple[int, int],
        depth: int,
        num_heads: int,
        drop_path: List[float],
        downsample: Optional[Callable],
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.config = config
        self.dim = dim
        # Stack of TFSwinLayer blocks; even blocks use regular windows, odd blocks use shifted windows
        self.blocks = [
            TFSwinLayer(
                config=config,
                dim=dim,
                input_resolution=input_resolution,
                num_heads=num_heads,
                shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                name=f"blocks.{i}",
            )
            for i in range(depth)
        ]

        # Create the downsampling (patch merging) layer when one is requested
        if downsample is not None:
            self.downsample = downsample(
                input_resolution,
                dim=dim,
                norm_layer=partial(keras.layers.LayerNormalization, epsilon=1e-5),
                name="downsample",
            )
        else:
            self.downsample = None

        self.pointing = False

    def call(
        self,
        hidden_states: tf.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor, ...]:
        height, width = input_dimensions
        # Run the hidden states through every block of the stage
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, training=training
            )

            # The first element is the updated hidden state
            hidden_states = layer_outputs[0]

        # Downsample (patch merging) at the end of the stage, if configured
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(layer_outputs[0], input_dimensions, training=training)
        else:
            output_dimensions = (height, width, height, width)

        # Stage output: hidden states plus the input/output spatial dimensions
        stage_outputs = (hidden_states, output_dimensions)

        # Append the attention weights when they are requested
        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Build the downsampling layer, if present
        if getattr(self, "downsample", None) is not None:
            with tf.name_scope(self.downsample.name):
                self.downsample.build(None)
        # Build every block of the stage
        if getattr(self, "blocks", None) is not None:
            for layer in self.blocks:
                with tf.name_scope(layer.name):
                    layer.build(None)


class TFSwinEncoder(keras.layers.Layer):
    # The Swin encoder: a sequence of TFSwinStage layers operating on an increasingly coarse grid
    def __init__(self, config: SwinConfig, grid_size: Tuple[int, int], **kwargs):
        super().__init__(**kwargs)
        # Number of stages
        self.num_layers = len(config.depths)
        self.config = config
        # Per-block drop-path rates, increasing linearly from 0 to config.drop_path_rate
        dpr = list((tf.linspace(0, 1, sum(config.depths)) * config.drop_path_rate).numpy())

        # Build the stages
        self.layers = [
            TFSwinStage(
                config=config,
                # Channel width doubles at every stage
                dim=int(config.embed_dim * 2**i_layer),
                # Spatial resolution halves at every stage
                input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                depth=config.depths[i_layer],
                num_heads=config.num_heads[i_layer],
                # Slice of the drop-path schedule belonging to this stage
                drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                # Every stage except the last one is followed by patch merging
                downsample=TFSwinPatchMerging if (i_layer < self.num_layers - 1) else None,
                name=f"layers.{i_layer}",
            )
            for i_layer in range(self.num_layers)
        ]

        # Gradient checkpointing is disabled by default
        self.gradient_checkpointing = False

    # Forward pass over all stages
    def call(
        self,
        hidden_states: tf.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: tf.Tensor | None = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor, ...], TFSwinEncoderOutput]:
        # Collect the spatial dimensions seen at every stage
        all_input_dimensions = ()
        # Initialize the optional output collections
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if output_hidden_states:
            batch_size, _, hidden_size = shape_list(hidden_states)
            # Rearrange b (h w) c -> b c h w so the hidden states keep their spatial layout
            reshaped_hidden_state = tf.reshape(hidden_states, (batch_size, *input_dimensions, hidden_size))
            reshaped_hidden_state = tf.transpose(reshaped_hidden_state, (0, 3, 1, 2))
            # Record the embedding output and its reshaped version
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.layers):
            # Head mask for the current stage, if one was provided
            layer_head_mask = head_mask[i] if head_mask is not None else None

            # Run the current stage
            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, training=training
            )

            # Updated hidden states and the stage's output dimensions
            hidden_states = layer_outputs[0]
            output_dimensions = layer_outputs[1]

            # The (possibly downsampled) height and width become the next stage's input dimensions
            input_dimensions = (output_dimensions[-2], output_dimensions[-1])
            all_input_dimensions += (input_dimensions,)

            if output_hidden_states:
                batch_size, _, hidden_size = shape_list(hidden_states)
                # Rearrange b (h w) c -> b c h w, as above
                reshaped_hidden_state = tf.reshape(hidden_states, (batch_size, *input_dimensions, hidden_size))
                reshaped_hidden_state = tf.transpose(reshaped_hidden_state, (0, 3, 1, 2))
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                # Record this stage's attention weights
                all_self_attentions += layer_outputs[2:]

        if not return_dict:
            # Return a plain tuple of the non-None outputs
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        # Return the outputs wrapped in a TFSwinEncoderOutput
        return TFSwinEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )

    def build(self, input_shape=None):
        # Nothing to do if the encoder is already built
        if self.built:
            return
        self.built = True
        # Build every stage under its own name scope
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)
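

# --- Illustrative sketch (not part of the original file) --------------------
# For the swin-tiny configuration (embed_dim=96, depths=[2, 2, 6, 2], 56x56
# patch grid, drop_path_rate=0.1), the four stages built above operate on
# dims 96/192/384/768 at resolutions 56/28/14/7, and the 12 per-block
# drop-path rates grow linearly from 0.0 to 0.1, sliced per stage exactly as
# in __init__ above. The `toy_*` names are ad-hoc.
toy_depths, toy_drop_path_rate = [2, 2, 6, 2], 0.1
toy_dpr = list((tf.linspace(0, 1, sum(toy_depths)) * toy_drop_path_rate).numpy())
for stage_idx, _ in enumerate(toy_depths):
    print(stage_idx, toy_dpr[sum(toy_depths[:stage_idx]) : sum(toy_depths[: stage_idx + 1])])
# -----------------------------------------------------------------------------
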
class TFSwinPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class for this model
    config_class = SwinConfig
    # Prefix used for the base model weights
    base_model_prefix = "swin"
    # Name of the main input
    main_input_name = "pixel_values"


SWIN_START_DOCSTRING = r"""
    This model is a Tensorflow
    [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a
    regular Tensorflow Module and refer to the Tensorflow documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SwinConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

SWIN_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.
        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


def normalize_data_format(value: str) -> str:
    """
    From tensorflow addons
    https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/utils/keras_utils.py#L71
    """
    # 如果值为 None,则使用 keras 后端的图像数据格式作为值
    if value is None:
        value = keras.backend.image_data_format()
    # 将值转换为小写
    data_format = value.lower()
    # 如果数据格式不是 "channels_first" 或 "channels_last",则引发 ValueError 异常
    if data_format not in {"channels_first", "channels_last"}:
        raise ValueError(
            'The `data_format` argument must be one of "channels_first", "channels_last". Received: ' + str(value)
        )
    # 返回标准化后的数据格式
    return data_format
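
为直观起见,下面补充一个独立的小示例(并非源码的一部分,仅作演示),展示 normalize_data_format 对不同输入的处理结果:

# 演示示例:normalize_data_format 的行为(假设上文函数与 keras 已可用)
print(normalize_data_format("Channels_Last"))  # 大小写被归一化,输出 "channels_last"
print(normalize_data_format(None))             # 返回 keras.backend.image_data_format(),通常为 "channels_last"

try:
    normalize_data_format("NCHW")              # 非法取值
except ValueError as err:
    print(err)                                 # 提示只接受 "channels_first" 或 "channels_last"
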


class AdaptiveAveragePooling1D(keras.layers.Layer):
    """
    Average 1D Pooling with adaptive kernel size.

    Args:
        output_size: An integer or tuple/list of a single integer, specifying pooled_features.
            The new size of output channels.
        data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions
            in the inputs. `channels_last` corresponds to inputs with shape `(batch, steps, channels)` while
            `channels_first` corresponds to inputs with shape `(batch, channels, steps)`.

    Input shape:
        - If `data_format='channels_last'`: 3D tensor with shape `(batch, steps, channels)`.
        - If `data_format='channels_first'`: 3D tensor with shape `(batch, channels, steps)`.

    Output shape:
        - If `data_format='channels_last'`: 3D tensor with shape `(batch_size, pooled_steps, channels)`.
        - If `data_format='channels_first'`: 3D tensor with shape `(batch_size, channels, pooled_steps)`.

    Adapted from [tensorflow-addon's adaptive pooling.py](
        https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/layers/adaptive_pooling.py#L90-L120
    )
    """

    # 以下方法实现支持自适应核大小的一维平均池化
    def __init__(
        self,
        output_size: Union[int, Iterable[int]],  # 池化后的输出尺寸,可以是整数或由单个整数组成的可迭代对象
        reduce_function: Callable = tf.reduce_mean,  # 池化使用的归约函数,默认为求平均
        data_format: Optional[str] = None,  # 数据格式,默认为 None(使用 keras 后端的默认设置)
        **kwargs,  # 其他参数
    ) -> None:
        self.data_format = normalize_data_format(data_format)  # 标准化数据格式
        self.reduce_function = reduce_function  # 池化函数
        self.output_size = (output_size,) if isinstance(output_size, int) else tuple(output_size)  # 输出尺寸的元组形式
        super().__init__(**kwargs)  # 调用父类初始化方法

    def call(self, inputs: tf.Tensor, *args) -> tf.Tensor:
        bins = self.output_size[0]  # 输出的分段数
        if self.data_format == "channels_last":
            splits = tf.split(inputs, bins, axis=1)  # 沿时间步(steps)维度分割输入张量
            splits = tf.stack(splits, axis=1)  # 在第二个维度上堆叠分割后的张量
            out_vect = self.reduce_function(splits, axis=2)  # 对每个分段内部的时间步求平均
        else:
            splits = tf.split(inputs, bins, axis=2)  # 沿时间步(steps)维度分割输入张量
            splits = tf.stack(splits, axis=2)  # 在第三个维度上堆叠分割后的张量
            out_vect = self.reduce_function(splits, axis=3)  # 对每个分段内部的时间步求平均
        return out_vect  # 返回池化后的张量

    def compute_output_shape(self, input_shape: Iterable[int]) -> tf.TensorShape:
        input_shape = tf.TensorShape(input_shape).as_list()  # 将输入形状转换为列表形式
        if self.data_format == "channels_last":
            shape = tf.TensorShape([input_shape[0], self.output_size[0], input_shape[2]])  # 输出形状,通道在最后
        else:
            shape = tf.TensorShape([input_shape[0], input_shape[1], self.output_size[0]])  # 输出形状,通道在第二维
        return shape  # 返回输出形状的张量形状对象

    def get_config(self) -> Dict[str, Any]:
        config = {
            "output_size": self.output_size,  # 输出尺寸配置
            "data_format": self.data_format,  # 数据格式配置
        }
        base_config = super().get_config()  # 调用父类配置方法
        return {**base_config, **config}  # 返回合并后的配置字典
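
下面是一个独立的用法示例(并非源码的一部分),演示该自适应平均池化层在 Swin 池化场景中的效果:把形状为 (batch, 49, 768) 的序列压缩为 (batch, 1, 768),当 output_size 为 1 时等价于对全部时间步取平均。

# 演示示例(假设 AdaptiveAveragePooling1D 已如上定义)
import tensorflow as tf

pooler = AdaptiveAveragePooling1D(output_size=1)
x = tf.random.normal((2, 49, 768))                 # (batch, steps, channels),对应 Swin 的 (batch, seq_len, hidden)
pooled = pooler(x)
print(pooled.shape)                                # (2, 1, 768)

# output_size 为 1 时等价于对全部时间步取平均
mean_ref = tf.reduce_mean(x, axis=1, keepdims=True)
print(float(tf.reduce_max(tf.abs(pooled - mean_ref))))  # 接近 0
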
# 定义一个 Keras 自定义层 TFSwinMainLayer,并添加了 keras_serializable 装饰器,使其能够序列化
@keras_serializable
class TFSwinMainLayer(keras.layers.Layer):
    # 设置配置类为 SwinConfig
    config_class = SwinConfig

    # 初始化函数,接受 SwinConfig 类型的 config 参数,以及其他可选参数
    def __init__(
        self, config: SwinConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
    ) -> None:
        # 调用父类的初始化方法
        super().__init__(**kwargs)
        # 将传入的配置参数 config 赋值给对象的 config 属性
        self.config = config
        # 计算层数,即配置的深度列表的长度
        self.num_layers = len(config.depths)
        # 计算特征数,为配置中的嵌入维度乘以 2 的 (层数 - 1) 次方
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        # 创建 TFSwinEmbeddings 对象,并赋值给 embeddings 属性
        self.embeddings = TFSwinEmbeddings(config, use_mask_token=use_mask_token, name="embeddings")
        # 创建 TFSwinEncoder 对象,并传入 patch_grid 参数和名称 "encoder",赋值给 encoder 属性
        self.encoder = TFSwinEncoder(config, self.embeddings.patch_grid, name="encoder")

        # 创建 LayerNormalization 层,epsilon 参数为配置中的层归一化 epsilon 值,名称为 "layernorm",赋值给 layernorm 属性
        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")

        # 如果 add_pooling_layer 为 True,则创建 AdaptiveAveragePooling1D 层,输出大小为 (1,),赋值给 pooler 属性;否则 pooler 属性为 None
        self.pooler = AdaptiveAveragePooling1D(output_size=(1,)) if add_pooling_layer else None

    # 获取输入嵌入的方法,返回 embeddings 对象的 patch_embeddings 属性
    def get_input_embeddings(self) -> TFSwinPatchEmbeddings:
        return self.embeddings.patch_embeddings

    # 模型头部修剪方法,接受 heads_to_prune 参数,用于剪枝模型中的注意力头
    def _prune_heads(self, heads_to_prune: Dict[int, List]):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # 遍历 heads_to_prune 字典中的每一层和对应要剪枝的注意力头列表
        for layer, heads in heads_to_prune.items():
            # 在编码器(self.encoder)的指定层(layer)的注意力部分(attention)进行头部剪枝操作
            self.encoder.layer[layer].attention.prune_heads(heads)

    # 获取头部掩码的方法,接受 head_mask 参数,如果非空则抛出未实现错误,否则返回与深度列表长度相同的 None 列表
    def get_head_mask(self, head_mask: Optional[Any]) -> List:
        if head_mask is not None:
            raise NotImplementedError
        return [None] * len(self.config.depths)

    # 调用方法,接受多个参数并进行处理,包括像素值、掩码位置、头部掩码等
    @unpack_inputs
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        bool_masked_pos: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFSwinModelOutput, Tuple[tf.Tensor, ...]]:
        # 如果未指定,则根据配置确定是否输出注意力权重
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果未指定,则根据配置确定是否输出隐藏状态
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果未指定,则根据配置确定是否返回字典格式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 如果像素值为空,则抛出数值错误异常
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # 准备头部掩码(如果需要)
        # head_mask 中的 1.0 表示保留对应的注意力头部
        # attention_probs 的形状为 bsz x n_heads x N x N
        # 输入的 head_mask 形状为 [num_heads] 或者 [num_hidden_layers x num_heads]
        # head_mask 被转换为形状 [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask)
        
        # 将像素值传入嵌入层,并获取嵌入层的输出和输入维度
        embedding_output, input_dimensions = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, training=training
        )

        # 将嵌入层的输出传入编码器,并返回编码器的输出
        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 获取编码器的序列输出,并进行 layer normalization
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output, training=training)

        # 初始化池化输出为 None
        pooled_output = None
        # 如果池化器不为空,则对序列输出进行池化
        if self.pooler is not None:
            batch_size, _, num_features = shape_list(sequence_output)
            pooled_output = self.pooler(sequence_output)
            pooled_output = tf.reshape(pooled_output, (batch_size, num_features))

        # 如果不需要返回字典,则返回输出元组
        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        # 如果需要返回字典格式的输出,则构建 TFSwinModelOutput 对象
        return TFSwinModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )

    def build(self, input_shape=None):
        # 如果已经构建过,则直接返回
        if self.built:
            return
        # 标记已经构建
        self.built = True
        # 如果存在嵌入层,则构建嵌入层
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        # 如果存在编码器,则构建编码器
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        # 如果存在层归一化,则构建层归一化
        if getattr(self, "layernorm", None) is not None:
            with tf.name_scope(self.layernorm.name):
                self.layernorm.build([None, None, self.num_features])
# 使用装饰器为类添加文档字符串,描述其作为裸的 Swin 模型变换器,输出未经任何特定头部处理的原始隐藏状态
@add_start_docstrings(
    "The bare Swin Model transformer outputting raw hidden-states without any specific head on top.",
    SWIN_START_DOCSTRING,
)
# 定义 TFSwinModel 类,继承自 TFSwinPreTrainedModel
class TFSwinModel(TFSwinPreTrainedModel):
    
    # 初始化方法
    def __init__(
        self, config: SwinConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
    ) -> None:
        # 调用父类的初始化方法
        super().__init__(config, **kwargs)
        # 保存配置信息到实例变量
        self.config = config
        # 创建 TFSwinMainLayer 的实例 swin,并命名为 "swin"
        self.swin = TFSwinMainLayer(config, name="swin")

    # 为 call 方法添加文档字符串,描述其作为模型前向传播的入口点,使用 SWIN_INPUTS_DOCSTRING 作为输入文档字符串
    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    # 使用装饰器添加代码示例文档字符串,展示模型的使用示例
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSwinModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    # 使用装饰器解包输入,确保正确处理输入参数
    @unpack_inputs
    # 定义 call 方法,接收多个参数并返回 TFSwinModelOutput 或 tf.Tensor 元组
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        bool_masked_pos: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFSwinModelOutput, Tuple[tf.Tensor, ...]]:
        r"""
        bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        # 根据需要确定是否输出注意力权重,默认使用配置中的设置
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 根据需要确定是否输出隐藏状态,默认使用配置中的设置
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 根据需要确定是否返回字典形式的输出,默认使用配置中的设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 如果未提供像素值,则引发值错误
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # 调用 self.swin 的前向传播方法,传递所有参数,并获取模型输出
        swin_outputs = self.swin(
            pixel_values=pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 返回模型输出
        return swin_outputs

    # 实现 build 方法,用于构建模型层次结构
    def build(self, input_shape=None):
        # 如果已经构建过,直接返回
        if self.built:
            return
        # 标记为已构建
        self.built = True
        # 如果 self.swin 已存在,则在命名空间下构建 self.swin
        if getattr(self, "swin", None) is not None:
            with tf.name_scope(self.swin.name):
                self.swin.build(None)
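
作为补充,下面给出 TFSwinModel 的一个常规推理示例(用法与官方文档示例一致,需要联网下载预训练权重),输出形状即上文常量 _EXPECTED_OUTPUT_SHAPE:

# 演示示例:TFSwinModel 推理
from transformers import AutoImageProcessor, TFSwinModel
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
model = TFSwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224")

inputs = processor(images=image, return_tensors="tf")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, 49, 768)
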


# 定义 TFSwinPixelShuffle 类,继承自 keras.layers.Layer,实现了 torch.nn.PixelShuffle 的 TensorFlow 版本的层
class TFSwinPixelShuffle(keras.layers.Layer):
    """TF layer implementation of torch.nn.PixelShuffle"""

    # 初始化方法
    def __init__(self, upscale_factor: int, **kwargs) -> None:
        # 调用父类的初始化方法
        super().__init__(**kwargs)
        # 如果 upscale_factor 不是整数或小于 2,则引发值错误
        if not isinstance(upscale_factor, int) or upscale_factor < 2:
            raise ValueError(f"upscale_factor must be an integer value >= 2 got {upscale_factor}")
        # 保存 upscale_factor 到实例变量
        self.upscale_factor = upscale_factor
    # 定义一个方法,接受一个张量 x 作为输入,返回一个张量作为输出
    def call(self, x: tf.Tensor) -> tf.Tensor:
        # 将输入张量赋值给 hidden_states
        hidden_states = x
        # 调用 shape_list 函数获取 hidden_states 的形状信息,并解包得到 batch_size, _, _, num_input_channels
        batch_size, _, _, num_input_channels = shape_list(hidden_states)
        # 计算块大小的平方
        block_size_squared = self.upscale_factor**2
        # 计算输出深度,即 num_input_channels 除以块大小的平方后取整
        output_depth = int(num_input_channels / block_size_squared)
        # 创建一个常量张量 permutation,用于存储一个通道排列顺序的索引
        permutation = tf.constant(
            # 使用列表推导式生成的二维数组,每个元素是一个索引,按照不同通道和块的顺序排列
            [[i + j * block_size_squared for i in range(block_size_squared) for j in range(output_depth)]]
        )
        # 使用 tf.gather 函数根据 permutation 中的索引重新组织 hidden_states 的通道
        hidden_states = tf.gather(params=hidden_states, indices=tf.tile(permutation, [batch_size, 1]), batch_dims=-1)
        # 使用 tf.nn.depth_to_space 函数进行深度到空间的转换,根据 upscale_factor 参数进行块的重新排列
        hidden_states = tf.nn.depth_to_space(hidden_states, block_size=self.upscale_factor, data_format="NHWC")
        # 返回处理后的 hidden_states 作为结果
        return hidden_states
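
下面用一个独立的小示例(并非源码的一部分)说明该层的作用:它在 NHWC 布局上模拟 torch.nn.PixelShuffle,把通道数为 C·r² 的特征图重排成空间尺寸放大 r 倍、通道数为 C 的特征图。

# 演示示例(假设 TFSwinPixelShuffle 已如上定义)
import tensorflow as tf

pixel_shuffle = TFSwinPixelShuffle(upscale_factor=2)
x = tf.random.normal((1, 8, 8, 12))  # NHWC,通道数 12 = 3 * 2**2
y = pixel_shuffle(x)
print(y.shape)                        # (1, 16, 16, 3):空间尺寸放大 2 倍,通道数缩小为 1/4
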
# 自定义的 TensorFlow 2.x 模型层,用于实现 TFSwin 模型的解码器部分
class TFSwinDecoder(keras.layers.Layer):
    def __init__(self, config: SwinConfig, **kwargs):
        super().__init__(**kwargs)
        # 定义一个 1x1 卷积层,用于特征变换
        self.conv2d = keras.layers.Conv2D(
            filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, strides=1, name="0"
        )
        # 像素重排层,用于反向像素重排
        self.pixel_shuffle = TFSwinPixelShuffle(config.encoder_stride, name="1")
        # 保存 Swin 模型的配置信息
        self.config = config

    def call(self, x: tf.Tensor) -> tf.Tensor:
        # 将输入张量从 B,C,H,W 转置为 B,H,W,C
        hidden_states = x
        hidden_states = tf.transpose(hidden_states, (0, 2, 3, 1))
        # 经过 1x1 卷积层变换
        hidden_states = self.conv2d(hidden_states)
        # 经过像素重排层
        hidden_states = self.pixel_shuffle(hidden_states)
        # 将输出张量从 B,H,W,C 转置为 B,C,H,W
        hidden_states = tf.transpose(hidden_states, (0, 3, 1, 2))
        return hidden_states

    def build(self, input_shape=None):
        # 如果已经构建过,直接返回
        if self.built:
            return
        self.built = True
        # 构建卷积层
        if getattr(self, "conv2d", None) is not None:
            with tf.name_scope(self.conv2d.name):
                self.conv2d.build([None, None, None, self.config.hidden_size])
        # 构建像素重排层
        if getattr(self, "pixel_shuffle", None) is not None:
            with tf.name_scope(self.pixel_shuffle.name):
                self.pixel_shuffle.build(None)
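
结合上面的实现,可以用一个独立的小示例(并非源码的一部分,假设 TFSwinDecoder 与 SwinConfig 在当前环境中可用)核对解码器的形状变换:输入 (batch, hidden_size, H, W),输出 (batch, num_channels, H*encoder_stride, W*encoder_stride)。

# 演示示例:默认配置下 encoder_stride=32、num_channels=3、hidden_size=768
import tensorflow as tf
from transformers import SwinConfig

config = SwinConfig()
decoder = TFSwinDecoder(config, name="decoder")

hidden = tf.random.normal((1, config.hidden_size, 7, 7))  # B, C, H, W
reconstruction = decoder(hidden)
print(reconstruction.shape)  # (1, 3, 224, 224)
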


# 基于 Swin 模型的一个变体,用于处理带掩码的图像建模,参考 SimMIM 论文提出的方法
@add_start_docstrings(
    "Swin Model with a decoder on top for masked image modeling, as proposed in"
    " [SimMIM](https://arxiv.org/abs/2111.09886).",
    SWIN_START_DOCSTRING,
)
class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel):
    def __init__(self, config: SwinConfig):
        super().__init__(config)
        # Swin 主层,不包含池化层,使用掩码标记
        self.swin = TFSwinMainLayer(config, add_pooling_layer=False, use_mask_token=True, name="swin")
        # Swin 解码器层
        self.decoder = TFSwinDecoder(config, name="decoder")

    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFSwinMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
    @unpack_inputs
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        bool_masked_pos: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ):
        # 略
        pass

    def build(self, input_shape=None):
        # 如果已经构建过,直接返回
        if self.built:
            return
        self.built = True
        # 构建 Swin 主层
        if getattr(self, "swin", None) is not None:
            with tf.name_scope(self.swin.name):
                self.swin.build(None)
        # 构建 Swin 解码器层
        if getattr(self, "decoder", None) is not None:
            with tf.name_scope(self.decoder.name):
                self.decoder.build(None)


# Swin 模型的图像分类变体,顶部附加了一个分类头部的线性层(在 [CLS] 标记的最终隐藏状态之上),例如用于 ImageNet
@add_start_docstrings(
    """
    Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    """,
    SWIN_START_DOCSTRING,
)
class TFSwinForImageClassification(TFSwinPreTrainedModel, TFSequenceClassificationLoss):
    # 初始化函数,接受一个 SwinConfig 类型的配置对象作为参数
    def __init__(self, config: SwinConfig):
        # 调用父类的初始化方法
        super().__init__(config)

        # 设置类的属性,表示分类数目
        self.num_labels = config.num_labels
        # 创建一个 TFSwinMainLayer 类的实例,命名为 "swin"
        self.swin = TFSwinMainLayer(config, name="swin")

        # 分类器头部
        # 如果配置的标签数目大于 0,则创建一个全连接层作为分类器
        # 否则创建一个线性激活层作为分类器
        self.classifier = (
            keras.layers.Dense(config.num_labels, name="classifier")
            if config.num_labels > 0
            else keras.layers.Activation("linear", name="classifier")
        )

    # 根据装饰器提供的文档字符串,定义了模型前向传播的方法
    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=TFSwinImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    @unpack_inputs
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        labels: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor, ...], TFSwinImageClassifierOutput]:
        """
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 确定是否返回字典类型的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用 Swin 模型的前向传播方法
        outputs = self.swin(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 获取池化后的输出
        pooled_output = outputs[1]

        # 将池化输出传递给分类器进行预测
        logits = self.classifier(pooled_output, training=training)

        # 如果有提供标签,则计算损失
        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        # 如果不要求返回字典类型的输出,则按需返回输出的元组
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 否则返回 TFSwinImageClassifierOutput 类型的对象
        return TFSwinImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )

    # 构建模型,设置模型的输入形状
    def build(self, input_shape=None):
        # 如果模型已经构建过,则直接返回
        if self.built:
            return
        # 标记模型已经构建
        self.built = True
        # 如果存在 Swin 层,则在其命名空间下构建 Swin 层
        if getattr(self, "swin", None) is not None:
            with tf.name_scope(self.swin.name):
                self.swin.build(None)
        # 如果存在分类器,则在其命名空间下构建分类器,并传入 Swin 特征数目作为输入形状的一部分
        if getattr(self, "classifier", None) is not None:
            if hasattr(self.classifier, "name"):
                with tf.name_scope(self.classifier.name):
                    self.classifier.build([None, None, self.swin.num_features])
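
下面补充一个完整的图像分类推理示例(用法与官方文档示例一致,需要联网下载权重),其预期输出即上文常量 _IMAGE_CLASS_EXPECTED_OUTPUT:

# 演示示例:TFSwinForImageClassification 推理
from transformers import AutoImageProcessor, TFSwinForImageClassification
from PIL import Image
import requests
import tensorflow as tf

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
model = TFSwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224")

inputs = processor(images=image, return_tensors="tf")
logits = model(**inputs).logits
predicted_label = int(tf.math.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_label])  # 预期输出类似 "tabby, tabby cat"
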

.\models\swin\__init__.py

# 引入类型检查的模块
from typing import TYPE_CHECKING

# 引入异常类,用于处理可选依赖不可用的情况
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available

# 定义模块的导入结构,包含配置和模型相关的导入信息
_import_structure = {"configuration_swin": ["SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "SwinConfig", "SwinOnnxConfig"]}

# 检查是否有torch可用,若不可用则抛出OptionalDependencyNotAvailable异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若torch可用,则添加相关的模型定义到_import_structure中
    _import_structure["modeling_swin"] = [
        "SWIN_PRETRAINED_MODEL_ARCHIVE_LIST",
        "SwinForImageClassification",
        "SwinForMaskedImageModeling",
        "SwinModel",
        "SwinPreTrainedModel",
        "SwinBackbone",
    ]

# 检查是否有tensorflow可用,若不可用则抛出OptionalDependencyNotAvailable异常
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若tensorflow可用,则添加相关的tensorflow模型定义到_import_structure中
    _import_structure["modeling_tf_swin"] = [
        "TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFSwinForImageClassification",
        "TFSwinForMaskedImageModeling",
        "TFSwinModel",
        "TFSwinPreTrainedModel",
    ]

# 如果当前是类型检查阶段
if TYPE_CHECKING:
    # 从配置模块中导入特定的配置类和常量
    from .configuration_swin import SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, SwinConfig, SwinOnnxConfig

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从模型定义模块中导入特定的torch模型类
        from .modeling_swin import (
            SWIN_PRETRAINED_MODEL_ARCHIVE_LIST,
            SwinBackbone,
            SwinForImageClassification,
            SwinForMaskedImageModeling,
            SwinModel,
            SwinPreTrainedModel,
        )

    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从tensorflow模型定义模块中导入特定的tensorflow模型类
        from .modeling_tf_swin import (
            TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFSwinForImageClassification,
            TFSwinForMaskedImageModeling,
            TFSwinModel,
            TFSwinPreTrainedModel,
        )

# 如果不是类型检查阶段,则执行延迟模块加载的逻辑
else:
    import sys

    # 将当前模块替换为LazyModule,以实现延迟导入
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
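
延迟加载的效果可以用下面的小示例(并非源码的一部分)说明:仅导入配置类不会触发 PyTorch/TensorFlow 模型实现的加载,只有在访问具体模型类时才会导入对应的 modeling 模块。

# 演示示例:_LazyModule 带来的按需加载
from transformers import SwinConfig

config = SwinConfig()
print(config.model_type)  # "swin"

# 只有在真正使用模型类时,modeling_swin / modeling_tf_swin 才会被导入(分别需要安装 torch / tensorflow)
from transformers import SwinModel, TFSwinModel
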

.\models\swin2sr\configuration_swin2sr.py

# 设置文件编码为 UTF-8
# 版权声明,版权归 HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证 2.0 版本使用本文件,除非符合许可证,否则不得使用此文件
# 您可以在以下网址获取许可证副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则按"原样"分发本软件
# 本软件没有任何明示或暗示的保证或条件
# 详细信息请参阅许可证

""" Swin2SR Transformer model configuration"""

# 导入预训练配置类 PretrainedConfig
from ...configuration_utils import PretrainedConfig
# 导入日志记录工具
from ...utils import logging

# 获取名为 __name__ 的日志记录器
logger = logging.get_logger(__name__)

# Swin2SR 预训练配置映射表,包含了模型名称及其配置文件的 URL
SWIN2SR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "caidas/swin2sr-classicalsr-x2-64": (
        "https://huggingface.co/caidas/swin2sr-classicalsr-x2-64/resolve/main/config.json"
    ),
}

# Swin2SRConfig 类,用于存储 Swin2SRModel 的配置
class Swin2SRConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Swin2SRModel`]. It is used to instantiate a Swin
    Transformer v2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Swin Transformer v2
    [caidas/swin2sr-classicalsr-x2-64](https://huggingface.co/caidas/swin2sr-classicalsr-x2-64) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import Swin2SRConfig, Swin2SRModel

    >>> # Initializing a Swin2SR caidas/swin2sr-classicalsr-x2-64 style configuration
    >>> configuration = Swin2SRConfig()

    >>> # Initializing a model (with random weights) from the caidas/swin2sr-classicalsr-x2-64 style configuration
    >>> model = Swin2SRModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # 模型类型为 "swin2sr"
    model_type = "swin2sr"

    # 属性映射,将类的属性名映射到预训练模型配置中的参数名
    attribute_map = {
        "hidden_size": "embed_dim",
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
    }

    # Swin2SRConfig 类的构造函数,定义了 Swin2SR 模型的各种配置参数
    def __init__(
        self,
        image_size=64,
        patch_size=1,
        num_channels=3,
        num_channels_out=None,
        embed_dim=180,
        depths=[6, 6, 6, 6, 6, 6],
        num_heads=[6, 6, 6, 6, 6, 6],
        window_size=8,
        mlp_ratio=2.0,
        qkv_bias=True,
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        drop_path_rate=0.1,
        hidden_act="gelu",
        use_absolute_embeddings=False,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        upscale=2,
        img_range=1.0,
        resi_connection="1conv",
        upsampler="pixelshuffle",
        **kwargs,
    ):
        # 调用父类的初始化方法,传入所有关键字参数
        super().__init__(**kwargs)

        # 设置模型的图像大小
        self.image_size = image_size
        # 设置每个图像块的大小
        self.patch_size = patch_size
        # 输入图像的通道数
        self.num_channels = num_channels
        # 输出图像的通道数,默认与输入通道数相同
        self.num_channels_out = num_channels if num_channels_out is None else num_channels_out
        # 嵌入维度
        self.embed_dim = embed_dim
        # 注意力层的深度列表
        self.depths = depths
        # 注意力层的数量,即深度列表的长度
        self.num_layers = len(depths)
        # 头部的数量
        self.num_heads = num_heads
        # 窗口大小
        self.window_size = window_size
        # MLP(多层感知机)扩展比例
        self.mlp_ratio = mlp_ratio
        # 是否使用查询、键、值的偏置
        self.qkv_bias = qkv_bias
        # 隐藏层的dropout率
        self.hidden_dropout_prob = hidden_dropout_prob
        # 注意力概率的dropout率
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        # 路径丢弃率
        self.drop_path_rate = drop_path_rate
        # 隐藏层的激活函数类型
        self.hidden_act = hidden_act
        # 是否使用绝对位置嵌入
        self.use_absolute_embeddings = use_absolute_embeddings
        # 层归一化的epsilon值
        self.layer_norm_eps = layer_norm_eps
        # 初始化范围
        self.initializer_range = initializer_range
        # 上采样(超分辨率)的放大倍数
        self.upscale = upscale
        # 图像的像素范围
        self.img_range = img_range
        # 残差连接的类型(如 "1conv")
        self.resi_connection = resi_connection
        # 上采样器
        self.upsampler = upsampler

.\models\swin2sr\convert_swin2sr_original_to_pytorch.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Swin2SR checkpoints from the original repository. URL: https://github.com/mv-lab/swin2sr"""

import argparse  # 导入解析命令行参数的模块

import requests  # 导入发送 HTTP 请求的模块
import torch  # 导入 PyTorch 深度学习框架
from PIL import Image  # 导入处理图像的模块
from torchvision.transforms import Compose, Normalize, Resize, ToTensor  # 导入图像转换相关模块

from transformers import Swin2SRConfig, Swin2SRForImageSuperResolution, Swin2SRImageProcessor  # 导入转换器相关模块


def get_config(checkpoint_url):
    config = Swin2SRConfig()  # 创建 Swin2SRConfig 的实例

    if "Swin2SR_ClassicalSR_X4_64" in checkpoint_url:
        config.upscale = 4  # 设置放大倍数为4
    elif "Swin2SR_CompressedSR_X4_48" in checkpoint_url:
        config.upscale = 4  # 设置放大倍数为4
        config.image_size = 48  # 设置图像尺寸为48
        config.upsampler = "pixelshuffle_aux"  # 设置上采样方法为 pixelshuffle_aux
    elif "Swin2SR_Lightweight_X2_64" in checkpoint_url:
        config.depths = [6, 6, 6, 6]  # 设置深度参数列表
        config.embed_dim = 60  # 设置嵌入维度为60
        config.num_heads = [6, 6, 6, 6]  # 设置注意力头数列表
        config.upsampler = "pixelshuffledirect"  # 设置上采样方法为 pixelshuffledirect
    elif "Swin2SR_RealworldSR_X4_64_BSRGAN_PSNR" in checkpoint_url:
        config.upscale = 4  # 设置放大倍数为4
        config.upsampler = "nearest+conv"  # 设置上采样方法为 nearest+conv
    elif "Swin2SR_Jpeg_dynamic" in checkpoint_url:
        config.num_channels = 1  # 设置通道数为1
        config.upscale = 1  # 设置放大倍数为1
        config.image_size = 126  # 设置图像尺寸为126
        config.window_size = 7  # 设置窗口大小为7
        config.img_range = 255.0  # 设置图像像素范围为255.0
        config.upsampler = ""  # 设置上采样方法为空字符串

    return config  # 返回配置对象


def rename_key(name, config):
    if "patch_embed.proj" in name and "layers" not in name:
        name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
    if "patch_embed.norm" in name:
        name = name.replace("patch_embed.norm", "embeddings.patch_embeddings.layernorm")
    if "layers" in name:
        name = name.replace("layers", "encoder.stages")
    if "residual_group.blocks" in name:
        name = name.replace("residual_group.blocks", "layers")
    if "attn.proj" in name:
        name = name.replace("attn.proj", "attention.output.dense")
    if "attn" in name:
        name = name.replace("attn", "attention.self")
    if "norm1" in name:
        name = name.replace("norm1", "layernorm_before")
    if "norm2" in name:
        name = name.replace("norm2", "layernorm_after")
    if "mlp.fc1" in name:
        name = name.replace("mlp.fc1", "intermediate.dense")
    if "mlp.fc2" in name:
        name = name.replace("mlp.fc2", "output.dense")
    if "q_bias" in name:
        name = name.replace("q_bias", "query.bias")
    if "k_bias" in name:
        name = name.replace("k_bias", "key.bias")
    # 如果变量名中包含 "v_bias",则替换为 "value.bias"
    if "v_bias" in name:
        name = name.replace("v_bias", "value.bias")
    
    # 如果变量名中包含 "cpb_mlp",则替换为 "continuous_position_bias_mlp"
    if "cpb_mlp" in name:
        name = name.replace("cpb_mlp", "continuous_position_bias_mlp")
    
    # 如果变量名中包含 "patch_embed.proj",则替换为 "patch_embed.projection"
    if "patch_embed.proj" in name:
        name = name.replace("patch_embed.proj", "patch_embed.projection")
    
    # 如果变量名为 "norm.weight",则替换为 "layernorm.weight"
    if name == "norm.weight":
        name = "layernorm.weight"
    
    # 如果变量名为 "norm.bias",则替换为 "layernorm.bias"
    if name == "norm.bias":
        name = "layernorm.bias"
    
    # 如果变量名中包含 "conv_first",则替换为 "first_convolution"
    if "conv_first" in name:
        name = name.replace("conv_first", "first_convolution")
    
    # 如果变量名中包含以下任意一个字符串,将其替换为相应的名称或前缀
    if (
        "upsample" in name
        or "conv_before_upsample" in name
        or "conv_bicubic" in name
        or "conv_up" in name
        or "conv_hr" in name
        or "conv_last" in name
        or "aux" in name
    ):
        # 对于特定的字符串替换规则
        if "conv_last" in name:
            name = name.replace("conv_last", "final_convolution")
    
        # 根据 config.upsampler 的不同取值进行不同的替换
        if config.upsampler in ["pixelshuffle", "pixelshuffle_aux", "nearest+conv"]:
            if "conv_before_upsample.0" in name:
                name = name.replace("conv_before_upsample.0", "conv_before_upsample")
            if "upsample.0" in name:
                name = name.replace("upsample.0", "upsample.convolution_0")
            if "upsample.2" in name:
                name = name.replace("upsample.2", "upsample.convolution_1")
            # 统一添加前缀 "upsample."
            name = "upsample." + name
        elif config.upsampler == "pixelshuffledirect":
            # 特定替换规则
            name = name.replace("upsample.0.weight", "upsample.conv.weight")
            name = name.replace("upsample.0.bias", "upsample.conv.bias")
        else:
            pass
    else:
        # 如果不符合以上任何替换条件,则添加前缀 "swin2sr."
        name = "swin2sr." + name
    
    # 返回处理后的变量名
    return name
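
为直观理解这些重命名规则,下面用一个独立的小例子(并非源码的一部分)跟踪一个典型键名的转换过程,使用文中出现的 Swin2SR_ClassicalSR_X2_64 检查点对应的默认配置:

# 演示示例(假设 get_config、rename_key 已如上定义)
config = get_config("https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_ClassicalSR_X2_64.pth")

old_key = "layers.0.residual_group.blocks.1.attn.proj.weight"
print(rename_key(old_key, config))
# 依次应用 "layers"->"encoder.stages"、"residual_group.blocks"->"layers"、"attn.proj"->"attention.output.dense",
# 再加上 "swin2sr." 前缀,得到:
# swin2sr.encoder.stages.0.layers.1.attention.output.dense.weight
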
# 转换给定的原始状态字典,根据配置更新键名
def convert_state_dict(orig_state_dict, config):
    # 遍历原始状态字典的复制键列表
    for key in orig_state_dict.copy().keys():
        # 弹出当前键对应的值
        val = orig_state_dict.pop(key)

        # 如果键名包含"qkv"
        if "qkv" in key:
            # 拆分键名以获取阶段号、块号和维度
            key_split = key.split(".")
            stage_num = int(key_split[1])
            block_num = int(key_split[4])
            dim = config.embed_dim

            # 如果键名中包含"weight"
            if "weight" in key:
                # 更新查询权重、键权重和值权重的新键名和对应的值
                orig_state_dict[
                    f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.weight"
                ] = val[:dim, :]
                orig_state_dict[
                    f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.weight"
                ] = val[dim : dim * 2, :]
                orig_state_dict[
                    f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.weight"
                ] = val[-dim:, :]
            else:
                # 更新查询偏置、键偏置和值偏置的新键名和对应的值
                orig_state_dict[
                    f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.bias"
                ] = val[:dim]
                orig_state_dict[
                    f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.bias"
                ] = val[dim : dim * 2]
                orig_state_dict[
                    f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.bias"
                ] = val[-dim:]
            pass
        else:
            # 对于其他键,使用配置中的重命名函数处理键名,并更新原始状态字典
            orig_state_dict[rename_key(key, config)] = val

    # 返回更新后的原始状态字典
    return orig_state_dict
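
上面对 qkv 键的处理,本质上是把原始检查点中融合在一起的 query/key/value 权重按行均分为三份;下面的小示例(并非源码的一部分,以默认 embed_dim=180 为例)演示这一切分:

# 演示示例:融合的 qkv 权重按行切成三份
import torch

dim = 180                               # 对应默认 config.embed_dim
qkv_weight = torch.randn(3 * dim, dim)  # 原始检查点中 attn.qkv.weight 的形状

query_w = qkv_weight[:dim, :]           # 前 1/3 行 -> query.weight
key_w = qkv_weight[dim : dim * 2, :]    # 中间 1/3 行 -> key.weight
value_w = qkv_weight[-dim:, :]          # 后 1/3 行 -> value.weight
print(query_w.shape, key_w.shape, value_w.shape)  # 三个都是 (180, 180)
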


def convert_swin2sr_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub):
    # 获取模型配置
    config = get_config(checkpoint_url)
    # 根据配置创建模型实例
    model = Swin2SRForImageSuperResolution(config)
    # 将模型设置为评估模式
    model.eval()

    # 从给定的 URL 加载模型状态字典到本地
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
    # 使用转换函数将状态字典转换为适用于当前模型的新状态字典
    new_state_dict = convert_state_dict(state_dict, config)
    # 加载新的状态字典到模型中,并获取缺失键和意外键
    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)

    # 如果存在缺失键,抛出值错误
    if len(missing_keys) > 0:
        raise ValueError("Missing keys when converting: {}".format(missing_keys))
    # 对于每个意外的键,如果不包含指定的子字符串,则抛出值错误
    for key in unexpected_keys:
        if not ("relative_position_index" in key or "relative_coords_table" in key or "self_mask" in key):
            raise ValueError(f"Unexpected key {key} in state_dict")

    # 验证加载的图像 URL
    url = "https://github.com/mv-lab/swin2sr/blob/main/testsets/real-inputs/shanghai.jpg?raw=true"
    # 使用请求获取并打开图像,并将其转换为 RGB 模式
    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
    # 创建图像处理器实例
    processor = Swin2SRImageProcessor()
    
    # 根据模型类型设置图像大小
    image_size = 126 if "Jpeg" in checkpoint_url else 256
    # 定义图像转换步骤,包括调整大小、转换为张量和归一化处理
    transforms = Compose(
        [
            Resize((image_size, image_size)),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )
    # 对图像应用转换步骤,并扩展维度以匹配模型输入
    pixel_values = transforms(image).unsqueeze(0)

    # 如果配置中指定的通道数为 1,只保留第一个通道的像素值
    if config.num_channels == 1:
        pixel_values = pixel_values[:, 0, :, :].unsqueeze(1)
    # 使用模型对输入像素值进行推理,得到输出结果
    outputs = model(pixel_values)

    # 根据不同的 checkpoint_url 设置预期的输出形状和切片
    if "Swin2SR_ClassicalSR_X2_64" in checkpoint_url:
        expected_shape = torch.Size([1, 3, 512, 512])
        expected_slice = torch.tensor(
            [[-0.7087, -0.7138, -0.6721], [-0.8340, -0.8095, -0.7298], [-0.9149, -0.8414, -0.7940]]
        )
    elif "Swin2SR_ClassicalSR_X4_64" in checkpoint_url:
        expected_shape = torch.Size([1, 3, 1024, 1024])
        expected_slice = torch.tensor(
            [[-0.7775, -0.8105, -0.8933], [-0.7764, -0.8356, -0.9225], [-0.7976, -0.8686, -0.9579]]
        )
    elif "Swin2SR_CompressedSR_X4_48" in checkpoint_url:
        # TODO 值在这里并不完全匹配
        expected_shape = torch.Size([1, 3, 1024, 1024])
        expected_slice = torch.tensor(
            [[-0.8035, -0.7504, -0.7491], [-0.8538, -0.8124, -0.7782], [-0.8804, -0.8651, -0.8493]]
        )
    elif "Swin2SR_Lightweight_X2_64" in checkpoint_url:
        expected_shape = torch.Size([1, 3, 512, 512])
        expected_slice = torch.tensor(
            [[-0.7669, -0.8662, -0.8767], [-0.8810, -0.9962, -0.9820], [-0.9340, -1.0322, -1.1149]]
        )
    elif "Swin2SR_RealworldSR_X4_64_BSRGAN_PSNR" in checkpoint_url:
        expected_shape = torch.Size([1, 3, 1024, 1024])
        expected_slice = torch.tensor(
            [[-0.5238, -0.5557, -0.6321], [-0.6016, -0.5903, -0.6391], [-0.6244, -0.6334, -0.6889]]
        )

    # 断言输出重建的形状是否与预期一致
    assert (
        outputs.reconstruction.shape == expected_shape
    ), f"Shape of reconstruction should be {expected_shape}, but is {outputs.reconstruction.shape}"

    # 断言输出重建的部分数据是否与预期一致,容差为 1e-3
    assert torch.allclose(outputs.reconstruction[0, 0, :3, :3], expected_slice, atol=1e-3)
    
    # 打印提示信息,表明检查通过
    print("Looks ok!")

    # 将 checkpoint_url 映射到模型名称的字典
    url_to_name = {
        "https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_ClassicalSR_X2_64.pth": (
            "swin2SR-classical-sr-x2-64"
        ),
        "https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_ClassicalSR_X4_64.pth": (
            "swin2SR-classical-sr-x4-64"
        ),
        "https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_CompressedSR_X4_48.pth": (
            "swin2SR-compressed-sr-x4-48"
        ),
        "https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_Lightweight_X2_64.pth": (
            "swin2SR-lightweight-x2-64"
        ),
        "https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_RealworldSR_X4_64_BSRGAN_PSNR.pth": (
            "swin2SR-realworld-sr-x4-64-bsrgan-psnr"
        ),
    }
    
    # 根据 checkpoint_url 获取模型名称
    model_name = url_to_name[checkpoint_url]

    # 如果指定了 pytorch_dump_folder_path,保存模型和处理器到该路径
    if pytorch_dump_folder_path is not None:
        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        print(f"Saving image processor to {pytorch_dump_folder_path}")
        processor.save_pretrained(pytorch_dump_folder_path)

    # 如果设置了 push_to_hub 标志,将模型和处理器推送到 Hub
    if push_to_hub:
        model.push_to_hub(f"caidas/{model_name}")
        processor.push_to_hub(f"caidas/{model_name}")
if __name__ == "__main__":
    # 如果当前脚本作为主程序运行,则执行以下代码块

    parser = argparse.ArgumentParser()
    # 创建参数解析器对象

    # Required parameters
    # 必需的参数设定
    parser.add_argument(
        "--checkpoint_url",
        default="https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_ClassicalSR_X2_64.pth",
        type=str,
        help="URL of the original Swin2SR checkpoint you'd like to convert.",
    )
    # 添加名为 "checkpoint_url" 的参数,设定默认值为 Swin2SR 模型的下载地址,类型为字符串,帮助信息指定用途

    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    # 添加名为 "pytorch_dump_folder_path" 的参数,设定默认值为 None,类型为字符串,帮助信息指定输出 PyTorch 模型的目录路径

    parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the converted model to the hub.")
    # 添加名为 "push_to_hub" 的参数,设定为布尔类型,表示是否将转换后的模型推送到模型中心(hub)

    args = parser.parse_args()
    # 解析命令行参数,并将结果存储在 args 对象中

    convert_swin2sr_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub)
    # 调用函数 convert_swin2sr_checkpoint,传递解析后的参数对象 args 的相应属性作为函数参数

.\models\swin2sr\image_processing_swin2sr.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Swin2SR."""

from typing import Optional, Union

import numpy as np

# 导入基础的图像处理工具和转换函数
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import get_image_size, pad, to_channel_dimension_format
from ...image_utils import (
    ChannelDimension,
    ImageInput,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
# 导入日志记录工具
from ...utils import TensorType, logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义图像处理器类 Swin2SRImageProcessor,继承自 BaseImageProcessor
class Swin2SRImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Swin2SR image processor.

    Args:
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
    """

    # 模型输入名称列表
    model_input_names = ["pixel_values"]

    # 初始化方法
    def __init__(
        self,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_pad: bool = True,
        pad_size: int = 8,
        **kwargs,
    ) -> None:
        # 调用父类的初始化方法
        super().__init__(**kwargs)

        # 初始化各参数
        self.do_rescale = do_rescale            # 是否进行图像缩放
        self.rescale_factor = rescale_factor    # 缩放因子,默认为 1/255
        self.do_pad = do_pad                    # 是否进行填充
        self.pad_size = pad_size                # 填充尺寸
        self._valid_processor_keys = [          # 可接受的处理器关键字列表
            "images",
            "do_rescale",
            "rescale_factor",
            "do_pad",
            "pad_size",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    # 图像填充方法
    # 实现图像填充操作,接受以下参数:
    #   - image: 待填充的图像数组
    #   - size: 使填充后的高度和宽度能被其整除的尺寸
    #   - data_format: 输出数据格式(通道维度格式),可选
    #   - input_data_format: 输入数据格式(通道维度格式),可选
    def pad(
        self,
        image: np.ndarray,
        size: int,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Pad an image to make the height and width divisible by `size`.

        Args:
            image (`np.ndarray`):
                Image to pad.
            size (`int`):
                The size to make the height and width divisible by.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The padded image.
        """
        # 获取输入图像的原始高度和宽度
        old_height, old_width = get_image_size(image, input_data_format)
        # 计算需要填充的高度和宽度
        pad_height = (old_height // size + 1) * size - old_height
        pad_width = (old_width // size + 1) * size - old_width

        # 调用 pad 函数进行填充操作
        return pad(
            image,
            ((0, pad_height), (0, pad_width)),  # 在高度和宽度两个维度上进行填充
            mode="symmetric",  # 使用对称模式进行填充
            data_format=data_format,  # 指定输出图像的通道维度格式
            input_data_format=input_data_format,  # 指定输入图像的通道维度格式
        )
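
填充尺寸的计算可以用一个独立的小算例(并非源码的一部分)来核对:

# 演示示例(假设 Swin2SRImageProcessor 已如上定义)
import numpy as np

processor = Swin2SRImageProcessor(pad_size=8)
image = np.zeros((3, 53, 60), dtype=np.float32)  # channels_first,高 53、宽 60
padded = processor.pad(image, size=8)
print(padded.shape)
# 53 -> (53 // 8 + 1) * 8 = 56,60 -> (60 // 8 + 1) * 8 = 64,因此输出为 (3, 56, 64)
# 注意:按该公式,即使原尺寸已能被 size 整除,也会再补上一个完整的 size
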

    def preprocess(
        self,
        images: ImageInput,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_pad: Optional[bool] = None,
        pad_size: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,

.\models\swin2sr\modeling_swin2sr.py

# 设置文件编码为 UTF-8
# 版权声明,版权归 Microsoft Research 和 HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证 2.0 版本使用本文件
# 除非符合许可证规定,否则不得使用本文件
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,本软件是基于“原样”提供的,不提供任何明示或暗示的保证或条件
# 请参阅许可证获取更多信息
""" PyTorch Swin2SR Transformer model."""

# 导入必要的库和模块
import collections.abc  # 导入 collections.abc 模块
import math  # 导入 math 模块
from dataclasses import dataclass  # 从 dataclasses 模块导入 dataclass 装饰器
from typing import Optional, Tuple, Union  # 导入类型提示的相关类和类型

import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 的 checkpoint 工具
from torch import nn  # 从 PyTorch 导入 nn 模块

# 导入模型相关的子模块和函数
from ...activations import ACT2FN  # 从 ...activations 模块导入 ACT2FN 函数
from ...modeling_outputs import BaseModelOutput, ImageSuperResolutionOutput  # 从 ...modeling_outputs 模块导入输出类
from ...modeling_utils import PreTrainedModel  # 从 ...modeling_utils 模块导入预训练模型相关类
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer  # 从 ...pytorch_utils 导入相关工具函数
from ...utils import (
    ModelOutput,  # 从 ...utils 模块导入 ModelOutput 类
    add_code_sample_docstrings,  # 从 ...utils 模块导入相关函数和类
    add_start_docstrings,  # 从 ...utils 模块导入相关函数和类
    add_start_docstrings_to_model_forward,  # 从 ...utils 模块导入相关函数和类
    logging,  # 从 ...utils 模块导入 logging 模块
    replace_return_docstrings,  # 从 ...utils 模块导入相关函数
)

# 导入 Swin2SR 的配置类
from .configuration_swin2sr import Swin2SRConfig  # 从当前目录下的 configuration_swin2sr 模块导入 Swin2SRConfig 类

# 获取日志记录器
logger = logging.get_logger(__name__)

# 用于文档的一般信息
_CONFIG_FOR_DOC = "Swin2SRConfig"

# 用于文档的基本检查点信息
_CHECKPOINT_FOR_DOC = "caidas/swin2SR-classical-sr-x2-64"

# 预期的输出形状
_EXPECTED_OUTPUT_SHAPE = [1, 180, 488, 648]

# Swin2SR 预训练模型存档列表
SWIN2SR_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "caidas/swin2SR-classical-sr-x2-64",
    # 查看所有 Swin2SR 模型,请访问 https://huggingface.co/models?filter=swin2sr
]

@dataclass
class Swin2SREncoderOutput(ModelOutput):
    """
    Swin2SR 编码器的输出,可能包含隐藏状态和注意力权重。

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的隐藏状态序列输出。
        hidden_states (`tuple(torch.FloatTensor)`, *可选*, 当 `output_hidden_states=True` 传递或当 `config.output_hidden_states=True` 时返回):
            模型每层的隐藏状态的元组,包括初始嵌入的输出。

            模型每层的隐藏状态以及初始嵌入的输出。
        attentions (`tuple(torch.FloatTensor)`, *可选*, 当 `output_attentions=True` 传递或当 `config.output_attentions=True` 时返回):
            模型每阶段的注意力权重的元组。

            注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
    """
    # 声明一个变量 last_hidden_state,类型为 torch.FloatTensor,初始值为 None
    last_hidden_state: torch.FloatTensor = None
    # 声明一个变量 hidden_states,类型为可选的元组,元素类型为 torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # 声明一个变量 attentions,类型为可选的元组,元素类型为 torch.FloatTensor
    attentions: Optional[Tuple[torch.FloatTensor]] = None
# Copied from transformers.models.swin.modeling_swin.window_partition
def window_partition(input_feature, window_size):
    """
    Partitions the given input into windows.
    """
    # 获取输入特征的尺寸信息:批量大小、高度、宽度、通道数
    batch_size, height, width, num_channels = input_feature.shape
    # 将输入特征按窗口大小进行划分,重塑为新的形状
    input_feature = input_feature.view(
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    # 对划分后的窗口进行重新排序,以便后续处理
    windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows


# Copied from transformers.models.swin.modeling_swin.window_reverse
def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
    """
    # 确定窗口的通道数
    num_channels = windows.shape[-1]
    # 将窗口合并为更高分辨率的特征
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    # 对合并后的特征进行重新排序,以符合原始输入的形状
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    return windows
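
窗口划分与合并互为逆操作,可以用一个独立的小示例(并非源码的一部分)验证:

# 演示示例(假设 window_partition、window_reverse 已如上定义)
import torch

feature = torch.randn(1, 14, 14, 96)                 # (batch, height, width, channels)
windows = window_partition(feature, window_size=7)   # 划分成 (14/7) * (14/7) = 4 个窗口
print(windows.shape)                                  # torch.Size([4, 7, 7, 96])

restored = window_reverse(windows, window_size=7, height=14, width=14)
print(torch.equal(restored, feature))                 # True,两个操作互为逆变换
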


# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    # 如果 drop_prob 为 0 或不处于训练模式,则直接返回输入
    if drop_prob == 0.0 or not training:
        return input
    # 计算保留的概率
    keep_prob = 1 - drop_prob
    # 创建一个与输入形状相同的随机张量
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # 将随机张量二值化
    # 应用 drop path 操作,并返回处理后的输出
    output = input.div(keep_prob) * random_tensor
    return output
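
drop_path 的关键点是按样本整体丢弃,并对保留下来的样本乘以 1/keep_prob 进行缩放,使输出的期望与输入一致;下面的小示例(并非源码的一部分)演示了这一行为:

# 演示示例(假设 drop_path 已如上定义)
import torch

x = torch.ones(4, 3, 8)                       # 4 个样本
out = drop_path(x, drop_prob=0.5, training=True)
# 每个样本以 0.5 的概率被整体置零,保留的样本被放大 1/keep_prob 倍(即乘以 2)
print(out[:, 0, 0])                           # 每个样本要么是 0.,要么是 2.
print(torch.equal(drop_path(x, drop_prob=0.5, training=False), x))  # True,推理阶段原样返回
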


# Copied from transformers.models.swin.modeling_swin.SwinDropPath with Swin->Swin2SR
class Swin2SRDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 调用 drop_path 函数来执行 drop path 操作
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class Swin2SREmbeddings(nn.Module):
    """
    Construct the patch and optional position embeddings.
    """
    # 初始化函数,接受一个配置参数config
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()

        # 使用配置参数初始化Swin2SRPatchEmbeddings对象,赋值给self.patch_embeddings
        self.patch_embeddings = Swin2SRPatchEmbeddings(config)
        
        # 获取patch数目,用于后续位置编码的初始化
        num_patches = self.patch_embeddings.num_patches

        # 根据配置决定是否创建位置编码的参数
        if config.use_absolute_embeddings:
            # 创建一个形状为(1, num_patches + 1, config.embed_dim)的可学习参数,初始值为全零
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
        else:
            # 如果不使用绝对位置编码,则置为None
            self.position_embeddings = None

        # 初始化一个dropout层,使用给定的dropout概率
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
        # 保存配置中的窗口大小参数
        self.window_size = config.window_size

    # 前向传播函数,接受一个可选的torch.FloatTensor类型的像素值作为输入,返回一个torch.Tensor类型的元组
    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]:
        # 调用patch_embeddings对象处理输入像素值,返回嵌入张量和输出维度信息
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)

        # 如果位置编码参数不为None,则将嵌入张量和位置编码相加
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings

        # 对嵌入张量应用dropout操作
        embeddings = self.dropout(embeddings)

        # 返回处理后的嵌入张量和输出维度信息的元组
        return embeddings, output_dimensions
class Swin2SRPatchEmbeddings(nn.Module):
    # Swin2SRPatchEmbeddings 类的定义,继承自 nn.Module
    def __init__(self, config, normalize_patches=True):
        super().__init__()
        # 初始化函数,接收配置参数和是否标准化补丁的标志

        num_channels = config.embed_dim
        # 从配置中获取嵌入维度

        image_size, patch_size = config.image_size, config.patch_size
        # 从配置中获取图像尺寸和补丁尺寸

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        # 确保图像尺寸和补丁尺寸是可迭代对象,如果不是,则转换为元组形式

        patches_resolution = [image_size[0] // patch_size[0], image_size[1] // patch_size[1]]
        # 计算补丁的分辨率,即图像被划分成的补丁数目

        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]
        # 设置补丁的分辨率和补丁的总数

        self.projection = nn.Conv2d(num_channels, config.embed_dim, kernel_size=patch_size, stride=patch_size)
        # 使用卷积层进行投影,将输入的通道数转换为嵌入维度,卷积核大小为补丁大小,步长为补丁大小

        self.layernorm = nn.LayerNorm(config.embed_dim) if normalize_patches else None
        # 如果需要对补丁进行标准化,则使用 LayerNorm 进行处理,否则设为 None

    def forward(self, embeddings: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        # 前向传播函数,接收嵌入向量作为输入,返回嵌入后的张量和输出维度的元组

        embeddings = self.projection(embeddings)
        # 使用定义的投影层对输入的嵌入向量进行投影变换

        _, _, height, width = embeddings.shape
        # 获取投影后张量的高度和宽度信息

        output_dimensions = (height, width)
        # 记录输出的高度和宽度信息

        embeddings = embeddings.flatten(2).transpose(1, 2)
        # 将投影后的张量按照第三维度展平,然后进行转置操作

        if self.layernorm is not None:
            embeddings = self.layernorm(embeddings)
        # 如果定义了 LayerNorm 层,则对嵌入向量进行标准化处理

        return embeddings, output_dimensions
        # 返回处理后的嵌入向量和输出的尺寸信息


class Swin2SRPatchUnEmbeddings(nn.Module):
    # Swin2SRPatchUnEmbeddings 类的定义,继承自 nn.Module
    r"""Image to Patch Unembedding"""

    def __init__(self, config):
        super().__init__()
        # 初始化函数,接收配置参数

        self.embed_dim = config.embed_dim
        # 设置嵌入维度为配置中指定的值

    def forward(self, embeddings, x_size):
        # 前向传播函数,接收嵌入向量和图像尺寸作为输入

        batch_size, height_width, num_channels = embeddings.shape
        # 获取输入嵌入向量的批量大小、高度宽度乘积以及通道数

        embeddings = embeddings.transpose(1, 2).view(batch_size, self.embed_dim, x_size[0], x_size[1])  # B Ph*Pw C
        # 将嵌入向量进行转置和视图变换,以重构原始图像尺寸

        return embeddings
        # 返回重构后的嵌入向量
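
下面用纯张量操作示意 Swin2SRPatchEmbeddings 中"展平 + 转置"与 Swin2SRPatchUnEmbeddings 中逆向 reshape 的对应关系(仅为形状演示,不依赖上述类的完整构造):

```python
import torch

batch_size, embed_dim, height, width = 1, 96, 16, 16
feature_map = torch.randn(batch_size, embed_dim, height, width)

# 对应 Swin2SRPatchEmbeddings.forward:flatten(2) + transpose(1, 2)
embeddings = feature_map.flatten(2).transpose(1, 2)  # (1, 16*16, 96)
print(embeddings.shape)  # torch.Size([1, 256, 96])

# 对应 Swin2SRPatchUnEmbeddings.forward:transpose(1, 2) + view 回空间形状
restored = embeddings.transpose(1, 2).view(batch_size, embed_dim, height, width)
print(torch.equal(restored, feature_map))  # True
```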


# Copied from transformers.models.swinv2.modeling_swinv2.Swinv2PatchMerging with Swinv2->Swin2SR
class Swin2SRPatchMerging(nn.Module):
    # Swin2SRPatchMerging 类的定义,继承自 nn.Module
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        # 初始化函数,接收输入特征的分辨率、输入通道数和可选的标准化层

        self.input_resolution = input_resolution
        # 设置输入特征的分辨率属性

        self.dim = dim
        # 设置输入通道数属性

        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        # 使用线性层进行维度减少,从 4*dim 到 2*dim,无偏置项

        self.norm = norm_layer(2 * dim)
        # 使用指定的标准化层对输出进行标准化处理

    def maybe_pad(self, input_feature, height, width):
        # 辅助函数,可能对输入特征进行填充,使得其高度和宽度为偶数

        should_pad = (height % 2 == 1) or (width % 2 == 1)
        # 检查输入特征的高度或宽度是否为奇数

        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            # 计算需要填充的值

            input_feature = nn.functional.pad(input_feature, pad_values)
            # 使用 PyTorch 的函数进行填充操作

        return input_feature
        # 返回填充后的输入特征
    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        # 解包输入维度元组
        height, width = input_dimensions
        # `dim` 是输入特征的维度,即 height * width
        batch_size, dim, num_channels = input_feature.shape

        # 将输入特征重新视图化为四维张量 [batch_size, height, width, num_channels]
        input_feature = input_feature.view(batch_size, height, width, num_channels)
        
        # 如果需要,对输入进行填充使其可以被 width 和 height 整除
        input_feature = self.maybe_pad(input_feature, height, width)
        
        # 提取四个子块,每个块大小为 [batch_size, height/2, width/2, num_channels]
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        
        # 将四个子块沿最后一个维度拼接,形成新的特征张量 [batch_size, height/2, width/2, 4*num_channels]
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        
        # 将特征张量重新视图化为 [batch_size, height/2 * width/2, 4*num_channels]
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)

        # 使用 reduction 方法对特征张量进行降维处理
        input_feature = self.reduction(input_feature)
        
        # 使用 norm 方法对降维后的特征张量进行归一化处理
        input_feature = self.norm(input_feature)

        # 返回处理后的特征张量作为输出
        return input_feature
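
下面是一个形状层面的示意(纯张量操作,不依赖上面的类),展示 Patch Merging 如何把 2×2 邻域的四个位置拼接到通道维:空间分辨率减半、通道数先变为 4 倍,随后再由 nn.Linear(4*dim, 2*dim) 降到 2 倍:

```python
import torch

batch_size, height, width, num_channels = 1, 4, 4, 8
x = torch.randn(batch_size, height, width, num_channels)

# 提取 2x2 邻域中的四个位置
x0 = x[:, 0::2, 0::2, :]
x1 = x[:, 1::2, 0::2, :]
x2 = x[:, 0::2, 1::2, :]
x3 = x[:, 1::2, 1::2, :]

merged = torch.cat([x0, x1, x2, x3], dim=-1)            # (1, 2, 2, 32)
merged = merged.view(batch_size, -1, 4 * num_channels)  # (1, 4, 32)
print(merged.shape)  # torch.Size([1, 4, 32]),再经线性层降为 2 * num_channels = 16
```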
# 从transformers.models.swinv2.modeling_swinv2.Swinv2SelfAttention复制而来,将Swinv2改为Swin2SR
class Swin2SRSelfAttention(nn.Module):
    # 将输入张量x重新形状以用于注意力分数计算
    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    # 自注意力机制的前向传播
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:  # 函数声明,返回类型为包含单个张量的元组
        batch_size, dim, num_channels = hidden_states.shape  # 获取隐藏状态张量的形状信息
        mixed_query_layer = self.query(hidden_states)  # 使用 query 网络对隐藏状态进行处理得到混合查询层

        key_layer = self.transpose_for_scores(self.key(hidden_states))  # 使用 key 网络对隐藏状态进行处理,然后转置以用于注意力计算
        value_layer = self.transpose_for_scores(self.value(hidden_states))  # 使用 value 网络对隐藏状态进行处理,然后转置以用于注意力计算
        query_layer = self.transpose_for_scores(mixed_query_layer)  # 对混合查询层进行转置以用于注意力计算

        # cosine attention
        attention_scores = nn.functional.normalize(query_layer, dim=-1) @ nn.functional.normalize(
            key_layer, dim=-1
        ).transpose(-2, -1)  # 计算注意力分数,使用余弦相似度进行归一化,然后进行乘积计算

        logit_scale = torch.clamp(self.logit_scale, max=math.log(1.0 / 0.01)).exp()  # 限制并指数化对数缩放参数
        attention_scores = attention_scores * logit_scale  # 缩放注意力分数

        relative_position_bias_table = self.continuous_position_bias_mlp(self.relative_coords_table).view(
            -1, self.num_attention_heads
        )  # 使用位置偏置 MLP 计算连续位置偏置表,并进行形状重塑

        # [window_height*window_width,window_height*window_width,num_attention_heads]
        relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )  # 根据相对位置索引选择相对位置偏置表中的偏置,并进行形状调整

        # [num_attention_heads,window_height*window_width,window_height*window_width]
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # 调整相对位置偏置的维度顺序

        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)  # 对相对位置偏置进行 sigmoid 处理并乘以常数 16
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)  # 添加相对位置偏置到注意力分数中

        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in Swin2SRModel forward() function)
            mask_shape = attention_mask.shape[0]  # 获取注意力掩码的形状信息
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )  # 将注意力分数调整为与掩码相匹配的形状

            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)  # 应用注意力掩码(只加一次)

            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)  # 调整注意力分数的形状

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)  # 对注意力分数进行 softmax 归一化

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)  # 使用 dropout 对注意力概率进行处理

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask  # 如果有头部掩码,则将其应用到注意力概率上

        context_layer = torch.matmul(attention_probs, value_layer)  # 使用注意力概率与值层进行加权求和得到上下文层
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # 调整上下文层的维度顺序

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)  # 计算新的上下文层形状
        context_layer = context_layer.view(new_context_layer_shape)  # 根据计算的形状调整上下文层的形状

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)  # 根据是否需要输出注意力分数来选择输出内容

        return outputs  # 返回上下文层和(如果需要)注意力分数
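
下面是对"余弦注意力 + 可学习 logit 缩放"这一步的最小示意(张量形状为假设值,省略相对位置偏置与注意力掩码),计算方式与上面的 forward 保持一致:

```python
import math
import torch
import torch.nn.functional as F

num_heads, seq_len, head_dim = 2, 4, 8
query = torch.randn(1, num_heads, seq_len, head_dim)
key = torch.randn(1, num_heads, seq_len, head_dim)

# 余弦相似度:先对 query/key 做 L2 归一化,再做点积
attention_scores = F.normalize(query, dim=-1) @ F.normalize(key, dim=-1).transpose(-2, -1)

# 可学习的 logit_scale(此处用常量代替),上限被 clamp 到 log(1/0.01)
logit_scale = torch.clamp(torch.tensor(math.log(10.0)), max=math.log(1.0 / 0.01)).exp()
attention_scores = attention_scores * logit_scale

attention_probs = F.softmax(attention_scores, dim=-1)
print(attention_probs.shape)  # torch.Size([1, 2, 4, 4])
```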
# 从 transformers.models.swin.modeling_swin.SwinSelfOutput 复制并修改为 Swin2SRSelfOutput 类
class Swin2SRSelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 创建一个线性层,输入和输出维度均为 dim
        self.dense = nn.Linear(dim, dim)
        # 创建一个 dropout 层,使用 config 中指定的 dropout 概率
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将输入的 hidden_states 传入 dense 线性层
        hidden_states = self.dense(hidden_states)
        # 将经过线性层的 hidden_states 应用 dropout
        hidden_states = self.dropout(hidden_states)

        return hidden_states


# 从 transformers.models.swinv2.modeling_swinv2.Swinv2Attention 复制并修改为 Swin2SRAttention 类
class Swin2SRAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size, pretrained_window_size=0):
        super().__init__()
        # 初始化 self 层,即 Swin2SRSelfAttention 对象
        self.self = Swin2SRSelfAttention(
            config=config,
            dim=dim,
            num_heads=num_heads,
            window_size=window_size,
            pretrained_window_size=pretrained_window_size
            if isinstance(pretrained_window_size, collections.abc.Iterable)
            else (pretrained_window_size, pretrained_window_size),
        )
        # 初始化 output 层,即 Swin2SRSelfOutput 对象
        self.output = Swin2SRSelfOutput(config, dim)
        # 初始化一个空集合,用于存储剪枝的注意力头信息
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 查找可剪枝的注意力头和其索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 对线性层进行剪枝
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并存储已剪枝的头信息
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 执行自注意力机制,并获取 self_outputs
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        # 将 self_outputs[0] 作为输入,hidden_states 作为辅助输入,传入 output 层
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果输出注意力信息,则将 attentions 添加到 outputs 中
        outputs = (attention_output,) + self_outputs[1:]  # 如果需要输出 attentions,则添加到 outputs 中
        return outputs


# 从 transformers.models.swin.modeling_swin.SwinIntermediate 复制并修改为 Swin2SRIntermediate 类
class Swin2SRIntermediate(nn.Module):
    # 初始化函数,用于创建一个新的神经网络层
    def __init__(self, config, dim):
        # 调用父类的初始化函数
        super().__init__()
        # 创建一个全连接层,将输入维度 dim 映射到 int(config.mlp_ratio * dim) 的输出维度
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        
        # 根据配置选择隐藏层激活函数,如果配置中隐藏层激活函数是字符串,则从预定义的映射中选择对应的函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            # 否则直接使用配置中指定的隐藏层激活函数
            self.intermediate_act_fn = config.hidden_act

    # 前向传播函数,处理输入的隐藏状态张量并返回处理后的张量
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入的隐藏状态张量通过全连接层进行线性变换
        hidden_states = self.dense(hidden_states)
        # 将线性变换后的张量输入到中间激活函数中进行非线性变换
        hidden_states = self.intermediate_act_fn(hidden_states)
        # 返回变换后的张量作为输出
        return hidden_states
# 从transformers.models.swin.modeling_swin.SwinOutput复制并将Swin改为Swin2SR
class Swin2SROutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 创建一个线性层,将输入维度乘以config.mlp_ratio,输出维度为dim
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        # 创建一个Dropout层,以config.hidden_dropout_prob的概率丢弃神经元
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 前向传播函数,首先通过线性层处理隐藏状态
        hidden_states = self.dense(hidden_states)
        # 然后对处理后的状态应用Dropout操作
        hidden_states = self.dropout(hidden_states)
        # 返回处理后的隐藏状态
        return hidden_states


# 从transformers.models.swinv2.modeling_swinv2.Swinv2Layer复制并将Swinv2改为Swin2SR
class Swin2SRLayer(nn.Module):
    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0):
        super().__init__()
        # 设置输入分辨率
        self.input_resolution = input_resolution
        # 计算窗口大小和移动尺寸
        window_size, shift_size = self._compute_window_shift(
            (config.window_size, config.window_size), (shift_size, shift_size)
        )
        # 选择第一个维度的窗口大小和移动尺寸
        self.window_size = window_size[0]
        self.shift_size = shift_size[0]
        # 创建Swin2SRAttention层,传入config、dim、num_heads、window_size和pretrained_window_size参数
        self.attention = Swin2SRAttention(
            config=config,
            dim=dim,
            num_heads=num_heads,
            window_size=self.window_size,
            pretrained_window_size=pretrained_window_size
            if isinstance(pretrained_window_size, collections.abc.Iterable)
            else (pretrained_window_size, pretrained_window_size),
        )
        # 创建LayerNorm层,归一化dim维度的输入,eps为config.layer_norm_eps
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        # 创建Swin2SRDropPath层,如果config.drop_path_rate大于0.0则应用DropPath,否则为恒等映射
        self.drop_path = Swin2SRDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
        # 创建Swin2SRIntermediate层,处理输入为config和dim的中间层
        self.intermediate = Swin2SRIntermediate(config, dim)
        # 创建Swin2SROutput层,处理输入为config和dim的输出层
        self.output = Swin2SROutput(config, dim)
        # 创建LayerNorm层,归一化dim维度的输出,eps为config.layer_norm_eps
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)

    def _compute_window_shift(self, target_window_size, target_shift_size) -> Tuple[Tuple[int, int], Tuple[int, int]]:
        # 计算窗口大小和移动尺寸的函数,返回目标窗口大小和目标移动尺寸
        window_size = [r if r <= w else w for r, w in zip(self.input_resolution, target_window_size)]
        shift_size = [0 if r <= w else s for r, w, s in zip(self.input_resolution, window_size, target_shift_size)]
        return window_size, shift_size
    # 根据窗口移动大小生成注意力掩码,用于移位窗口的多头自注意力
    def get_attn_mask(self, height, width, dtype):
        if self.shift_size > 0:
            # 创建一个全零的张量作为图像的注意力掩码
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
            # 定义高度和宽度的切片,用于生成多个窗口
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            # 在图像的每个窗口位置设置对应的编号
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            # 将图像分块,每个块的大小为窗口大小乘以窗口大小
            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            # 创建注意力掩码,表示不同窗口之间的相对位置
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            # 使用特定值填充掩码,0位置用0.0填充,非0位置用-100.0填充
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            # 如果不需要移位,返回空的注意力掩码
            attn_mask = None
        return attn_mask

    # 在需要时对隐藏状态进行填充,以适应窗口大小的整数倍
    def maybe_pad(self, hidden_states, height, width):
        # 计算需要右侧和底部填充的像素数,使其可以被窗口大小整除
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        # 填充值元组按 (通道前, 通道后, 宽度左, 宽度右, 高度上, 高度下) 的顺序排列
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        # 对隐藏状态进行填充操作
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        # 返回填充后的隐藏状态以及填充值,供前向传播中裁剪时使用
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 定义函数签名,指定输入和输出类型为 torch.Tensor 的元组
        height, width = input_dimensions
        # 解包输入维度
        batch_size, _, channels = hidden_states.size()
        # 获取隐藏状态的批量大小和通道数(中间维度是 height * width,此处用 _ 忽略)
        shortcut = hidden_states
        # 保存隐藏状态的快捷方式

        # pad hidden_states to multiples of window size
        # 将隐藏状态填充到窗口大小的倍数
        hidden_states = hidden_states.view(batch_size, height, width, channels)
        # 调整隐藏状态的形状为 [batch_size, height, width, channels]
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
        # 调用 maybe_pad 方法,可能对隐藏状态进行填充,同时获取填充值
        _, height_pad, width_pad, _ = hidden_states.shape
        # 解包填充后的隐藏状态的形状

        # cyclic shift
        # 循环移位操作
        if self.shift_size > 0:
            # 如果 shift_size 大于 0
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
            # 在维度 (1, 2) 上对隐藏状态进行负向移位操作
        else:
            shifted_hidden_states = hidden_states
            # 否则,不进行移位操作,保持隐藏状态不变

        # partition windows
        # 划分窗口
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        # 调用 window_partition 方法,将移位后的隐藏状态划分为窗口
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
        # 将划分后的窗口重新视图为 [batch_size * num_windows, window_size * window_size, channels]
        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
        # 调用 get_attn_mask 方法,获取注意力掩码
        if attn_mask is not None:
            attn_mask = attn_mask.to(hidden_states_windows.device)
            # 如果注意力掩码不为空,则将其移到与 hidden_states_windows 相同的设备上

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )
        # 调用 attention 方法,进行注意力计算

        attention_output = attention_outputs[0]
        # 获取注意力输出的第一个元素

        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
        # 将注意力输出重新视图为 [batch_size * num_windows, window_size, window_size, channels]
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)
        # 调用 window_reverse 方法,逆转注意力窗口

        # reverse cyclic shift
        # 逆转循环移位
        if self.shift_size > 0:
            # 如果 shift_size 大于 0
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
            # 在维度 (1, 2) 上对注意力窗口进行正向移位操作
        else:
            attention_windows = shifted_windows
            # 否则,不进行移位操作,保持注意力窗口不变

        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        # 判断是否进行了填充
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()
            # 如果进行了填充,则截取注意力窗口的有效部分

        attention_windows = attention_windows.view(batch_size, height * width, channels)
        # 将注意力窗口重新视图为 [batch_size, height * width, channels]
        hidden_states = self.layernorm_before(attention_windows)
        # 调用 layernorm_before 方法,对注意力窗口进行层归一化处理
        hidden_states = shortcut + self.drop_path(hidden_states)
        # 将快捷方式与经过 drop_path 处理后的隐藏状态相加

        layer_output = self.intermediate(hidden_states)
        # 调用 intermediate 方法,生成中间层输出
        layer_output = self.output(layer_output)
        # 调用 output 方法,生成输出层输出
        layer_output = hidden_states + self.drop_path(self.layernorm_after(layer_output))
        # 将隐藏状态与经过 drop_path 和层归一化后的输出相加

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        # 如果需要输出注意力,则返回包含注意力输出的元组,否则只返回输出层输出的元组
        return layer_outputs
        # 返回层输出的元组
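
移位窗口中的"循环移位 / 逆循环移位"可以用 torch.roll 单独演示(示意性的小张量,与上面 forward 中 shift_size 的用法一致):

```python
import torch

shift_size = 2
x = torch.arange(16).view(1, 4, 4, 1)  # (batch, height, width, channels)

# 循环移位:窗口划分之前把特征图向左上方滚动
shifted = torch.roll(x, shifts=(-shift_size, -shift_size), dims=(1, 2))

# 逆循环移位:注意力计算之后滚动回原位
restored = torch.roll(shifted, shifts=(shift_size, shift_size), dims=(1, 2))
print(torch.equal(restored, x))  # True
```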
class Swin2SRStage(nn.Module):
    """
    This corresponds to the Residual Swin Transformer Block (RSTB) in the original implementation.
    """

    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, pretrained_window_size=0):
        super().__init__()
        self.config = config  # 初始化模型配置参数
        self.dim = dim  # 初始化模型维度参数

        # 创建包含多个Swin2SRLayer层的ModuleList
        self.layers = nn.ModuleList(
            [
                Swin2SRLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                    pretrained_window_size=pretrained_window_size,
                )
                for i in range(depth)
            ]
        )

        # 根据配置参数选择不同的残差连接方式
        if config.resi_connection == "1conv":
            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
        elif config.resi_connection == "3conv":
            # 采用序列化方式创建多层卷积神经网络
            self.conv = nn.Sequential(
                nn.Conv2d(dim, dim // 4, 3, 1, 1),
                nn.LeakyReLU(negative_slope=0.2, inplace=True),
                nn.Conv2d(dim // 4, dim // 4, 1, 1, 0),
                nn.LeakyReLU(negative_slope=0.2, inplace=True),
                nn.Conv2d(dim // 4, dim, 3, 1, 1),
            )

        # 创建Swin2SRPatchEmbeddings对象
        self.patch_embed = Swin2SRPatchEmbeddings(config, normalize_patches=False)

        # 创建Swin2SRPatchUnEmbeddings对象
        self.patch_unembed = Swin2SRPatchUnEmbeddings(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        residual = hidden_states  # 保存输入的隐藏状态作为残差

        height, width = input_dimensions  # 获取输入图像的高度和宽度
        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            # 调用Swin2SRLayer的forward方法进行前向传播
            layer_outputs = layer_module(hidden_states, input_dimensions, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]  # 更新隐藏状态输出

        output_dimensions = (height, width, height, width)  # 设置输出的图像维度

        hidden_states = self.patch_unembed(hidden_states, input_dimensions)  # 反向解嵌入处理
        hidden_states = self.conv(hidden_states)  # 应用卷积层处理隐藏状态
        hidden_states, _ = self.patch_embed(hidden_states)  # 应用图像嵌入处理

        hidden_states = hidden_states + residual  # 加上残差连接

        stage_outputs = (hidden_states, output_dimensions)  # 定义阶段输出结果

        if output_attentions:
            stage_outputs += layer_outputs[1:]  # 如果需要输出注意力,将其添加到输出结果中
        return stage_outputs  # 返回阶段输出结果
class Swin2SREncoder(nn.Module):
    # 初始化函数,接受配置对象和网格大小作为参数
    def __init__(self, config, grid_size):
        # 调用父类初始化方法
        super().__init__()
        # 计算阶段数量,即深度列表的长度
        self.num_stages = len(config.depths)
        # 保存配置对象
        self.config = config
        # 计算丢弃路径率数组,根据配置的丢弃路径率和各个阶段的深度
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
        # 创建阶段列表,每个阶段是一个Swin2SRStage模块
        self.stages = nn.ModuleList(
            [
                Swin2SRStage(
                    config=config,
                    dim=config.embed_dim,
                    input_resolution=(grid_size[0], grid_size[1]),
                    depth=config.depths[stage_idx],
                    num_heads=config.num_heads[stage_idx],
                    drop_path=dpr[sum(config.depths[:stage_idx]) : sum(config.depths[: stage_idx + 1])],
                    pretrained_window_size=0,
                )
                for stage_idx in range(self.num_stages)
            ]
        )

        # 是否启用梯度检查点,默认为False
        self.gradient_checkpointing = False

    # 前向传播函数
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, Swin2SREncoderOutput]:
        # 初始化所有输入尺寸为空元组
        all_input_dimensions = ()
        # 如果需要输出隐藏状态,则初始化为空元组
        all_hidden_states = () if output_hidden_states else None
        # 如果需要输出注意力权重,则初始化为空元组
        all_self_attentions = () if output_attentions else None

        # 如果需要输出隐藏状态,则添加当前隐藏状态到all_hidden_states中
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        # 遍历所有阶段
        for i, stage_module in enumerate(self.stages):
            # 获取当前阶段的头部掩码
            layer_head_mask = head_mask[i] if head_mask is not None else None

            # 如果启用了梯度检查点并且正在训练阶段,则使用梯度检查点函数
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    stage_module.__call__, hidden_states, input_dimensions, layer_head_mask, output_attentions
                )
            else:
                # 否则,直接调用阶段模块进行前向传播
                layer_outputs = stage_module(hidden_states, input_dimensions, layer_head_mask, output_attentions)

            # 更新隐藏状态为当前层的输出的第一个元素
            hidden_states = layer_outputs[0]
            # 更新输入尺寸为当前层输出的维度
            output_dimensions = layer_outputs[1]
            input_dimensions = (output_dimensions[-2], output_dimensions[-1])
            # 将当前层的输出维度添加到所有输入尺寸中
            all_input_dimensions += (input_dimensions,)

            # 如果需要输出隐藏状态,则添加当前隐藏状态到all_hidden_states中
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            # 如果需要输出注意力权重,则将当前层的注意力权重添加到all_self_attentions中
            if output_attentions:
                all_self_attentions += layer_outputs[2:]

        # 如果不需要返回字典形式的输出,则返回隐藏状态、所有隐藏状态和所有注意力权重
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        # 否则,返回Swin2SREncoderOutput对象,包含最终隐藏状态、所有隐藏状态和所有注意力权重
        return Swin2SREncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
class Swin2SRPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 指定配置类
    config_class = Swin2SRConfig
    # 基础模型前缀
    base_model_prefix = "swin2sr"
    # 主输入名称
    main_input_name = "pixel_values"
    # 支持梯度检查点
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        # 初始化模块的权重
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            torch.nn.init.trunc_normal_(module.weight.data, std=self.config.initializer_range)
            # 如果存在偏置,将其初始化为零
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            # 初始化 LayerNorm 的偏置为零,权重为1
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


# Swin2SRModel 类的文档字符串
SWIN2SR_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Swin2SRConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Swin2SRModel 类的输入文档字符串
SWIN2SR_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`Swin2SRImageProcessor.__call__`] for details.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare Swin2SR Model transformer outputting raw hidden-states without any specific head on top.",
    SWIN2SR_START_DOCSTRING,
)
# Swin2SRModel 类,继承自 Swin2SRPreTrainedModel,用于构建模型
class Swin2SRModel(Swin2SRPreTrainedModel):
    def __init__(self, config):
        # 调用父类构造函数初始化对象
        super().__init__(config)
        # 保存配置信息到对象属性
        self.config = config

        # 根据配置信息设置均值张量
        if config.num_channels == 3 and config.num_channels_out == 3:
            rgb_mean = (0.4488, 0.4371, 0.4040)
            self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
        else:
            self.mean = torch.zeros(1, 1, 1, 1)
        self.img_range = config.img_range

        # 创建第一个卷积层
        self.first_convolution = nn.Conv2d(config.num_channels, config.embed_dim, 3, 1, 1)
        # 创建嵌入层
        self.embeddings = Swin2SREmbeddings(config)
        # 创建编码器
        self.encoder = Swin2SREncoder(config, grid_size=self.embeddings.patch_embeddings.patches_resolution)

        # 创建层归一化层
        self.layernorm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
        # 创建补丁解嵌入层
        self.patch_unembed = Swin2SRPatchUnEmbeddings(config)
        # 创建主体后的卷积层
        self.conv_after_body = nn.Conv2d(config.embed_dim, config.embed_dim, 3, 1, 1)

        # 调用后初始化方法,初始化权重并进行最终处理
        self.post_init()

    def get_input_embeddings(self):
        # 返回嵌入层的补丁嵌入对象
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # 遍历要修剪的头信息,对编码器中对应层的自注意力机制进行修剪
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def pad_and_normalize(self, pixel_values):
        _, _, height, width = pixel_values.size()

        # 1. 执行填充操作
        window_size = self.config.window_size
        modulo_pad_height = (window_size - height % window_size) % window_size
        modulo_pad_width = (window_size - width % window_size) % window_size
        pixel_values = nn.functional.pad(pixel_values, (0, modulo_pad_width, 0, modulo_pad_height), "reflect")

        # 2. 执行归一化操作
        self.mean = self.mean.type_as(pixel_values)
        pixel_values = (pixel_values - self.mean) * self.img_range

        return pixel_values
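
对 pad_and_normalize 中"填充到窗口大小整数倍"的一个小示例(假设 window_size=8,输入尺寸仅作演示),展示 reflect 填充后的尺寸:

```python
import torch
import torch.nn.functional as F

window_size = 8
pixel_values = torch.randn(1, 3, 30, 45)  # 高 30、宽 45,均不是 8 的整数倍

modulo_pad_height = (window_size - 30 % window_size) % window_size  # 2
modulo_pad_width = (window_size - 45 % window_size) % window_size   # 3
padded = F.pad(pixel_values, (0, modulo_pad_width, 0, modulo_pad_height), "reflect")
print(padded.shape)  # torch.Size([1, 3, 32, 48])
```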

    @add_start_docstrings_to_model_forward(SWIN2SR_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        ) -> Union[Tuple, BaseModelOutput]:
        # 如果没有显式指定,根据配置决定是否输出注意力权重
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果没有显式指定,根据配置决定是否输出隐藏状态
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果没有显式指定,根据配置决定是否使用返回字典形式
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 准备头部掩码(如果需要)
        # 在头部掩码中,1.0 表示保留该头部
        # attention_probs 的形状为 bsz x n_heads x N x N
        # 输入的 head_mask 形状为 [num_heads] 或 [num_hidden_layers x num_heads]
        # head_mask 被转换为形状 [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, len(self.config.depths))

        _, _, height, width = pixel_values.shape

        # 一些预处理:填充 + 归一化
        pixel_values = self.pad_and_normalize(pixel_values)

        # 第一个卷积层处理像素值
        embeddings = self.first_convolution(pixel_values)
        # 将卷积后的结果传递给嵌入层处理,同时获取输入维度信息
        embedding_output, input_dimensions = self.embeddings(embeddings)

        # 编码器处理嵌入输出
        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 取出编码器的输出的第一个元素作为序列输出
        sequence_output = encoder_outputs[0]
        # 序列输出经过 LayerNormalization 处理
        sequence_output = self.layernorm(sequence_output)

        # 将序列输出重新映射到原始尺寸上
        sequence_output = self.patch_unembed(sequence_output, (height, width))
        # 经过主体后的卷积操作,加上初始的嵌入值
        sequence_output = self.conv_after_body(sequence_output) + embeddings

        # 如果不使用返回字典形式,则输出为包含序列输出和其他编码器输出的元组
        if not return_dict:
            output = (sequence_output,) + encoder_outputs[1:]
            return output

        # 如果使用返回字典形式,则构造 BaseModelOutput 对象返回
        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
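
下面是一个使用随机权重的最小示例(配置参数取值仅作演示,并非任何预训练检查点),说明 Swin2SRModel 的 last_hidden_state 与输入保持相同的空间分辨率、通道数为 embed_dim:

```python
import torch
from transformers import Swin2SRConfig, Swin2SRModel

# 随机初始化的小模型,仅用于观察输入输出形状
config = Swin2SRConfig(embed_dim=60, depths=[2, 2], num_heads=[6, 6], window_size=8)
model = Swin2SRModel(config)
model.eval()

pixel_values = torch.randn(1, 3, 64, 64)
with torch.no_grad():
    outputs = model(pixel_values)

print(outputs.last_hidden_state.shape)  # 预期为 torch.Size([1, 60, 64, 64])
```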
class PixelShuffleUpsampler(nn.Module):
    """PixelShuffleUpsampler module.

    This module performs upsampling using PixelShuffle.

    Args:
        config (`object`):
            Configuration object containing parameters.
        num_features (`int`):
            Number of intermediate features.

    Attributes:
        conv_before_upsample (`nn.Conv2d`):
            Convolutional layer before upsampling.
        activation (`nn.LeakyReLU`):
            LeakyReLU activation function.
        upsample (`Upsample`):
            Upsample module.
        final_convolution (`nn.Conv2d`):
            Final convolutional layer.

    """

    def __init__(self, config, num_features):
        super().__init__()
        
        # Initialize convolution before upsampling
        self.conv_before_upsample = nn.Conv2d(config.embed_dim, num_features, 3, 1, 1)
        # Initialize activation function
        self.activation = nn.LeakyReLU(inplace=True)
        # Initialize upsampling module
        self.upsample = Upsample(config.upscale, num_features)
        # Initialize final convolutional layer
        self.final_convolution = nn.Conv2d(num_features, config.num_channels_out, 3, 1, 1)

    def forward(self, sequence_output):
        # Apply convolution before upsampling
        x = self.conv_before_upsample(sequence_output)
        # Apply activation function
        x = self.activation(x)
        # Apply upsampling using the Upsample module
        x = self.upsample(x)
        # Apply final convolutional layer
        x = self.final_convolution(x)

        return x
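
PixelShuffleUpsampler 中 Upsample 模块的思路大致是:先用卷积把通道扩为 num_features * upscale**2,再用 nn.PixelShuffle 把这些通道重排为空间分辨率。下面是一个独立的小示例(参数取值仅作示意):

```python
import torch
import torch.nn as nn

upscale = 2
num_features = 64

conv = nn.Conv2d(num_features, num_features * upscale**2, 3, 1, 1)
pixel_shuffle = nn.PixelShuffle(upscale)

x = torch.randn(1, num_features, 16, 16)
out = pixel_shuffle(conv(x))
print(out.shape)  # torch.Size([1, 64, 32, 32]),空间分辨率放大 2 倍
```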


class NearestConvUpsampler(nn.Module):
    """NearestConvUpsampler module.

    This module performs upsampling using nearest-neighbor interpolation followed by convolution.

    Args:
        scale (`int`):
            Scale factor for upsampling.
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`):
            Number of output channels.

    Attributes:
        upsample (`nn.Upsample`):
            Upsampling layer.
        conv (`nn.Conv2d`):
            Convolutional layer.

    """
    def __init__(self, config, num_features):
        super().__init__()
        # 检查是否需要进行4倍上采样,否则抛出数值错误异常
        if config.upscale != 4:
            raise ValueError("The nearest+conv upsampler only supports an upscale factor of 4 at the moment.")

        # 第一层卷积,将输入特征维度转换为num_features,卷积核大小为3x3,填充为1
        self.conv_before_upsample = nn.Conv2d(config.embed_dim, num_features, 3, 1, 1)
        # 激活函数,使用LeakyReLU
        self.activation = nn.LeakyReLU(inplace=True)
        # 上采样卷积层1,输入和输出特征维度都为num_features,卷积核大小为3x3,填充为1
        self.conv_up1 = nn.Conv2d(num_features, num_features, 3, 1, 1)
        # 上采样卷积层2,输入和输出特征维度都为num_features,卷积核大小为3x3,填充为1
        self.conv_up2 = nn.Conv2d(num_features, num_features, 3, 1, 1)
        # 高分辨率恢复卷积层,输入和输出特征维度都为num_features,卷积核大小为3x3,填充为1
        self.conv_hr = nn.Conv2d(num_features, num_features, 3, 1, 1)
        # 最终卷积层,将特征维度转换为config.num_channels_out,卷积核大小为3x3,填充为1
        self.final_convolution = nn.Conv2d(num_features, config.num_channels_out, 3, 1, 1)
        # LeakyReLU激活函数,斜率为0.2,inplace操作
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, sequence_output):
        # 序列输出先经过第一层卷积
        sequence_output = self.conv_before_upsample(sequence_output)
        # 经过激活函数
        sequence_output = self.activation(sequence_output)
        # 上采样至原始大小的两倍,并经过LeakyReLU激活函数
        sequence_output = self.lrelu(
            self.conv_up1(torch.nn.functional.interpolate(sequence_output, scale_factor=2, mode="nearest"))
        )
        # 再次上采样至原始大小的四倍,并经过LeakyReLU激活函数
        sequence_output = self.lrelu(
            self.conv_up2(torch.nn.functional.interpolate(sequence_output, scale_factor=2, mode="nearest"))
        )
        # 最终的重建,经过高分辨率恢复卷积层和LeakyReLU激活函数
        reconstruction = self.final_convolution(self.lrelu(self.conv_hr(sequence_output)))
        # 返回重建的结果
        return reconstruction
# 定义像素混洗辅助上采样器模块的类,用于图像超分辨率和恢复任务
class PixelShuffleAuxUpsampler(nn.Module):
    def __init__(self, config, num_features):
        super().__init__()

        # 从配置中获取上采样比例
        self.upscale = config.upscale
        # 定义使用三通道卷积进行双三次插值的卷积层
        self.conv_bicubic = nn.Conv2d(config.num_channels, num_features, 3, 1, 1)
        # 定义用于上采样前的卷积层
        self.conv_before_upsample = nn.Conv2d(config.embed_dim, num_features, 3, 1, 1)
        # 定义激活函数为LeakyReLU
        self.activation = nn.LeakyReLU(inplace=True)
        # 定义用于辅助任务的卷积层,将序列输出映射到通道数为config.num_channels的张量
        self.conv_aux = nn.Conv2d(num_features, config.num_channels, 3, 1, 1)
        # 定义用于辅助任务后续处理的序列卷积和LeakyReLU激活函数的顺序层
        self.conv_after_aux = nn.Sequential(nn.Conv2d(3, num_features, 3, 1, 1), nn.LeakyReLU(inplace=True))
        # 定义上采样模块
        self.upsample = Upsample(config.upscale, num_features)
        # 定义最终的卷积层,将上采样后的特征映射到config.num_channels_out的输出通道
        self.final_convolution = nn.Conv2d(num_features, config.num_channels_out, 3, 1, 1)

    def forward(self, sequence_output, bicubic, height, width):
        # 对双三次插值结果进行卷积操作
        bicubic = self.conv_bicubic(bicubic)
        # 对序列输出进行上采样前的卷积操作
        sequence_output = self.conv_before_upsample(sequence_output)
        # 序列输出经过激活函数处理
        sequence_output = self.activation(sequence_output)
        # 对序列输出进行辅助任务的卷积操作
        aux = self.conv_aux(sequence_output)
        # 经过辅助任务卷积后的序列输出再次进行卷积和激活函数处理
        sequence_output = self.conv_after_aux(aux)
        # 序列输出经过上采样模块,根据指定的高度和宽度进行裁剪
        sequence_output = (
            self.upsample(sequence_output)[:, :, : height * self.upscale, : width * self.upscale]
            + bicubic[:, :, : height * self.upscale, : width * self.upscale]
        )
        # 最终将上采样后的序列输出进行最终卷积操作,生成重建图像
        reconstruction = self.final_convolution(sequence_output)

        return reconstruction, aux


# 使用添加文档字符串装饰器为Swin2SRForImageSuperResolution类添加说明
@add_start_docstrings(
    """
    Swin2SR 模型的 Transformer,顶部带有一个上采样器头部,用于图像超分辨率和图像恢复。
    """,
    SWIN2SR_START_DOCSTRING,
)
class Swin2SRForImageSuperResolution(Swin2SRPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # 初始化Swin2SR模型
        self.swin2sr = Swin2SRModel(config)
        # 获取配置中的上采样器类型和上采样比例
        self.upsampler = config.upsampler
        self.upscale = config.upscale

        # 根据上采样器类型选择对应的上采样器模块
        num_features = 64
        if self.upsampler == "pixelshuffle":
            self.upsample = PixelShuffleUpsampler(config, num_features)
        elif self.upsampler == "pixelshuffle_aux":
            self.upsample = PixelShuffleAuxUpsampler(config, num_features)
        elif self.upsampler == "pixelshuffledirect":
            # 轻量级超分辨率模型,只进行一步上采样
            self.upsample = UpsampleOneStep(config.upscale, config.embed_dim, config.num_channels_out)
        elif self.upsampler == "nearest+conv":
            # 适用于真实世界超分辨率,减少伪影的最近邻插值加卷积上采样器
            self.upsample = NearestConvUpsampler(config, num_features)
        else:
            # 用于图像去噪和JPEG压缩伪影减少的最终卷积层
            self.final_convolution = nn.Conv2d(config.embed_dim, config.num_channels_out, 3, 1, 1)

        # 初始化权重并应用最终处理
        self.post_init()

    # 使用添加文档字符串装饰器为forward方法添加输入说明
    @add_start_docstrings_to_model_forward(SWIN2SR_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ImageSuperResolutionOutput, config_class=_CONFIG_FOR_DOC)
    # 定义一个方法 `forward`,用于模型的前向传播
    # 
    # 参数说明:
    # - pixel_values: 可选的 torch.FloatTensor,表示输入的像素值
    # - head_mask: 可选的 torch.FloatTensor,表示注意力头部的掩码
    # - labels: 可选的 torch.LongTensor,表示标签数据
    # - output_attentions: 可选的 bool 值,控制是否输出注意力权重
    # - output_hidden_states: 可选的 bool 值,控制是否输出隐藏状态
    # - return_dict: 可选的 bool 值,控制是否以字典形式返回结果
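
一个端到端的推理示例(检查点名称 "caidas/swin2SR-classical-sr-x2-64" 与示例图片 URL 仅作示意,请以 Hub 上实际可用的 Swin2SR 检查点为准):

```python
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, Swin2SRForImageSuperResolution

checkpoint = "caidas/swin2SR-classical-sr-x2-64"  # 假设的检查点名称
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = Swin2SRForImageSuperResolution.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# reconstruction 的空间分辨率约为输入的 upscale 倍
print(outputs.reconstruction.shape)
```
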

.\models\swin2sr\__init__.py

# 版权声明和许可证信息,声明代码版权归 HuggingFace 团队所有,使用 Apache License 2.0 许可证发布
# 可以在符合许可证的情况下使用此文件。许可证详细信息可在 http://www.apache.org/licenses/LICENSE-2.0 获取
#
# 如果不符合适用法律或未经书面同意,则根据"AS IS"基础分发软件,无任何明示或暗示的担保或条件
from typing import TYPE_CHECKING

# 导入 OptionalDependencyNotAvailable 异常类、_LazyModule 类以及检查 torch 和 vision 是否可用的函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# 定义导入结构的字典,包含模块到需要导入的类、函数的映射
_import_structure = {
    "configuration_swin2sr": ["SWIN2SR_PRETRAINED_CONFIG_ARCHIVE_MAP", "Swin2SRConfig"],
}

# 检查是否可以导入 torch
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()  # 如果 torch 不可用则抛出 OptionalDependencyNotAvailable 异常
except OptionalDependencyNotAvailable:
    pass  # 如果出现 OptionalDependencyNotAvailable 异常则不执行后续代码
else:
    # 如果 torch 可用,则添加 modeling_swin2sr 模块到导入结构中
    _import_structure["modeling_swin2sr"] = [
        "SWIN2SR_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Swin2SRForImageSuperResolution",
        "Swin2SRModel",
        "Swin2SRPreTrainedModel",
    ]

# 检查是否可以导入 vision
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()  # 如果 vision 不可用则抛出 OptionalDependencyNotAvailable 异常
except OptionalDependencyNotAvailable:
    pass  # 如果出现 OptionalDependencyNotAvailable 异常则不执行后续代码
else:
    # 如果 vision 可用,则添加 image_processing_swin2sr 模块到导入结构中
    _import_structure["image_processing_swin2sr"] = ["Swin2SRImageProcessor"]

# 如果在类型检查模式下
if TYPE_CHECKING:
    # 导入 configuration_swin2sr 模块中的特定类和变量
    from .configuration_swin2sr import SWIN2SR_PRETRAINED_CONFIG_ARCHIVE_MAP, Swin2SRConfig

    # 检查是否可以导入 torch
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()  # 如果 torch 不可用则抛出 OptionalDependencyNotAvailable 异常
    except OptionalDependencyNotAvailable:
        pass  # 如果出现 OptionalDependencyNotAvailable 异常则不执行后续代码
    else:
        # 导入 modeling_swin2sr 模块中的特定类和变量
        from .modeling_swin2sr import (
            SWIN2SR_PRETRAINED_MODEL_ARCHIVE_LIST,
            Swin2SRForImageSuperResolution,
            Swin2SRModel,
            Swin2SRPreTrainedModel,
        )

    # 检查是否可以导入 vision
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()  # 如果 vision 不可用则抛出 OptionalDependencyNotAvailable 异常
    except OptionalDependencyNotAvailable:
        pass  # 如果出现 OptionalDependencyNotAvailable 异常则不执行后续代码
    else:
        # 导入 image_processing_swin2sr 模块中的特定类
        from .image_processing_swin2sr import Swin2SRImageProcessor

# 如果不在类型检查模式下,则将当前模块映射到 _LazyModule,延迟导入模块,以及动态导入 _import_structure 中定义的模块
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
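
这种 _LazyModule 结构的效果可以用一个示意性示例说明:顶层导入不会立刻加载 modeling_swin2sr,只有在真正访问相应属性时才会触发子模块的导入,从而避免在未安装 torch/vision 时直接报错。

```python
from transformers.models import swin2sr

config_cls = swin2sr.Swin2SRConfig  # 仅需配置类,不要求安装 torch
model_cls = swin2sr.Swin2SRForImageSuperResolution  # 访问时才导入 modeling_swin2sr(需要 torch)
print(config_cls.__name__, model_cls.__name__)
```
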

.\models\swinv2\configuration_swinv2.py

# 设置编码格式为 UTF-8

# 版权声明和许可证,声明代码版权归 HuggingFace Inc. 团队所有,遵循 Apache License 2.0 版本
# 只有在遵守许可证的情况下才能使用此文件。您可以在以下网址获取许可证的副本:
# http://www.apache.org/licenses/LICENSE-2.0

# 如果适用法律要求或书面同意,本软件按"原样"分发,不提供任何明示或暗示的担保或条件。
# 有关详细信息,请参阅许可证。

""" Swinv2 Transformer model configuration"""

# 导入必要的模块和类
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices

# 获取日志记录器
logger = logging.get_logger(__name__)

# Swinv2 模型预训练配置文件映射,指定模型的预训练配置文件位置
SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/swinv2-tiny-patch4-window8-256": (
        "https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256/resolve/main/config.json"
    ),
}

# Swinv2Config 类,用于存储 Swinv2 模型的配置信息
class Swinv2Config(BackboneConfigMixin, PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Swinv2Model`]. It is used to instantiate a Swin
    Transformer v2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Swin Transformer v2
    [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import Swinv2Config, Swinv2Model

    >>> # Initializing a Swinv2 microsoft/swinv2-tiny-patch4-window8-256 style configuration
    >>> configuration = Swinv2Config()

    >>> # Initializing a model (with random weights) from the microsoft/swinv2-tiny-patch4-window8-256 style configuration
    >>> model = Swinv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    
    """

    # 模型类型为 Swinv2
    model_type = "swinv2"

    # 属性映射表,将一些属性名映射为另一些属性名
    attribute_map = {
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
    }
    # 初始化函数,用于初始化一个Swing Transformer模型的参数
    def __init__(
        self,
        image_size=224,  # 图像尺寸,默认为224
        patch_size=4,  # 每个patch的大小,默认为4
        num_channels=3,  # 输入图像的通道数,默认为3(RGB图像)
        embed_dim=96,  # 嵌入维度,默认为96
        depths=[2, 2, 6, 2],  # 各个阶段的深度列表,默认为[2, 2, 6, 2]
        num_heads=[3, 6, 12, 24],  # 各个阶段的注意力头数列表,默认为[3, 6, 12, 24]
        window_size=7,  # 窗口大小,默认为7
        pretrained_window_sizes=[0, 0, 0, 0],  # 预训练窗口大小列表,默认为[0, 0, 0, 0]
        mlp_ratio=4.0,  # MLP放大比例,默认为4.0
        qkv_bias=True,  # 是否使用注意力的查询、键、值偏置,默认为True
        hidden_dropout_prob=0.0,  # 隐藏层dropout概率,默认为0.0(无dropout)
        attention_probs_dropout_prob=0.0,  # 注意力概率dropout概率,默认为0.0(无dropout)
        drop_path_rate=0.1,  # drop path的概率,默认为0.1
        hidden_act="gelu",  # 隐藏层激活函数,默认为gelu
        use_absolute_embeddings=False,  # 是否使用绝对位置嵌入,默认为False
        initializer_range=0.02,  # 初始化范围,默认为0.02
        layer_norm_eps=1e-5,  # LayerNorm的epsilon,默认为1e-5
        encoder_stride=32,  # 编码器步长,默认为32
        out_features=None,  # 输出特征列表,默认为None
        out_indices=None,  # 输出索引列表,默认为None
        **kwargs,  # 其他关键字参数
    ):
        super().__init__(**kwargs)
    
        # 设置各种参数到对象的属性中
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.embed_dim = embed_dim
        self.depths = depths
        self.num_layers = len(depths)  # 设置阶段的数量为depths列表的长度
        self.num_heads = num_heads
        self.window_size = window_size
        self.pretrained_window_sizes = pretrained_window_sizes
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.drop_path_rate = drop_path_rate
        self.hidden_act = hidden_act
        self.use_absolute_embeddings = use_absolute_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.encoder_stride = encoder_stride
        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
        # 获取对齐的输出特征和输出索引,以便与VisionEncoderDecoderModel兼容
        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
        )
        # 设置hidden_size属性,表示模型最后一个阶段之后的通道维度
        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
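
一个小示例,验证上述 __init__ 中派生属性的计算方式:num_layers = len(depths),hidden_size = embed_dim * 2**(len(depths) - 1):

```python
from transformers import Swinv2Config

config = Swinv2Config(embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24])
print(config.num_layers)   # 4
print(config.hidden_size)  # 96 * 2**3 = 768
print(config.stage_names)  # ['stem', 'stage1', 'stage2', 'stage3', 'stage4']
```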

.\models\swinv2\convert_swinv2_timm_to_pytorch.py

# 设置编码格式为UTF-8

# 版权声明和许可信息,声明本代码版权归HuggingFace Inc.团队所有,并遵循Apache License 2.0许可
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Convert Swinv2 checkpoints from the timm library."""

import argparse  # 导入命令行参数解析模块
import json  # 导入JSON处理模块
from pathlib import Path  # 导入路径操作模块

import requests  # 导入HTTP请求库
import timm  # 导入模型库timm
import torch  # 导入PyTorch深度学习库
from huggingface_hub import hf_hub_download  # 导入Hugging Face模型中心下载函数
from PIL import Image  # 导入PIL图像处理库

from transformers import AutoImageProcessor, Swinv2Config, Swinv2ForImageClassification  # 导入transformers库中相关模块


def get_swinv2_config(swinv2_name):
    config = Swinv2Config()  # 创建一个Swinv2Config配置对象
    name_split = swinv2_name.split("_")  # 使用下划线分割模型名称

    model_size = name_split[1]  # 提取模型尺寸信息
    if "to" in name_split[3]:
        img_size = int(name_split[3][-3:])  # 提取图像尺寸信息
    else:
        img_size = int(name_split[3])
    if "to" in name_split[2]:
        window_size = int(name_split[2][-2:])  # 提取窗口大小信息
    else:
        window_size = int(name_split[2][6:])

    # 根据模型尺寸选择对应的嵌入维度、深度和头数配置
    if model_size == "tiny":
        embed_dim = 96
        depths = (2, 2, 6, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "small":
        embed_dim = 96
        depths = (2, 2, 18, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "base":
        embed_dim = 128
        depths = (2, 2, 18, 2)
        num_heads = (4, 8, 16, 32)
    else:
        embed_dim = 192
        depths = (2, 2, 18, 2)
        num_heads = (6, 12, 24, 48)

    # 如果模型名称中包含'to',设置预训练窗口大小配置
    if "to" in swinv2_name:
        config.pretrained_window_sizes = (12, 12, 12, 6)

    # 根据模型名称和数据集情况设置相应的类别数和标签映射
    if ("22k" in swinv2_name) and ("to" not in swinv2_name):
        num_classes = 21841
        repo_id = "huggingface/label-files"
        filename = "imagenet-22k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    else:
        num_classes = 1000
        repo_id = "huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    # 设置配置对象的图像大小、类别数、嵌入维度、深度、头数和窗口大小
    config.image_size = img_size
    config.num_labels = num_classes
    config.embed_dim = embed_dim
    config.depths = depths
    config.num_heads = num_heads
    config.window_size = window_size

    return config


def rename_key(name):
    # 如果键名中包含 "patch_embed.proj",替换为 "embeddings.patch_embeddings.projection"
    if "patch_embed.proj" in name:
        name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")

    # 如果键名中包含 "patch_embed.norm",替换为 "embeddings.norm"
    if "patch_embed.norm" in name:
        name = name.replace("patch_embed.norm", "embeddings.norm")

    # 如果键名中包含 "layers",在前面加上 "encoder."
    if "layers" in name:
        name = "encoder." + name

    # 如果键名中包含 "attn.proj",替换为 "attention.output.dense"
    if "attn.proj" in name:
        name = name.replace("attn.proj", "attention.output.dense")

    # 如果键名中包含 "attn",替换为 "attention.self"
    if "attn" in name:
        name = name.replace("attn", "attention.self")

    # 如果键名中包含 "norm1",替换为 "layernorm_before"
    if "norm1" in name:
        name = name.replace("norm1", "layernorm_before")

    # 如果键名中包含 "norm2",替换为 "layernorm_after"
    if "norm2" in name:
        name = name.replace("norm2", "layernorm_after")

    # 如果键名中包含 "mlp.fc1",替换为 "intermediate.dense"
    if "mlp.fc1" in name:
        name = name.replace("mlp.fc1", "intermediate.dense")

    # 如果键名中包含 "mlp.fc2",替换为 "output.dense"
    if "mlp.fc2" in name:
        name = name.replace("mlp.fc2", "output.dense")

    # 如果键名中包含 "q_bias",替换为 "query.bias"
    if "q_bias" in name:
        name = name.replace("q_bias", "query.bias")

    # 如果键名中包含 "k_bias",替换为 "key.bias"
    if "k_bias" in name:
        name = name.replace("k_bias", "key.bias")

    # 如果键名中包含 "v_bias",替换为 "value.bias"
    if "v_bias" in name:
        name = name.replace("v_bias", "value.bias")

    # 如果键名中包含 "cpb_mlp",替换为 "continuous_position_bias_mlp"
    if "cpb_mlp" in name:
        name = name.replace("cpb_mlp", "continuous_position_bias_mlp")

    # 如果键名为 "norm.weight",替换为 "layernorm.weight"
    if name == "norm.weight":
        name = "layernorm.weight"

    # 如果键名为 "norm.bias",替换为 "layernorm.bias"
    if name == "norm.bias":
        name = "layernorm.bias"

    # 如果键名中包含 "head",替换为 "classifier";否则在键名前面加上 "swinv2."
    if "head" in name:
        name = name.replace("head", "classifier")
    else:
        name = "swinv2." + name

    # 返回处理后的键名
    return name
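
# ——— 补充示例(非原始脚本内容):rename_key 的几个典型映射(按上面的替换顺序逐步推得)———
#   "patch_embed.proj.weight"            -> "swinv2.embeddings.patch_embeddings.projection.weight"
#   "layers.0.blocks.0.attn.proj.weight" -> "swinv2.encoder.layers.0.blocks.0.attention.output.dense.weight"
#   "layers.0.blocks.0.norm1.weight"     -> "swinv2.encoder.layers.0.blocks.0.layernorm_before.weight"
#   "head.bias"                          -> "classifier.bias"(包含 "head" 的键不会加 "swinv2." 前缀)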
# 定义一个函数,用于转换模型的状态字典,以适配特定模型结构
def convert_state_dict(orig_state_dict, model):
    # 遍历原始状态字典的键(复制的列表),逐一处理
    for key in orig_state_dict.copy().keys():
        # 弹出当前键对应的值
        val = orig_state_dict.pop(key)

        # 如果键名中包含 "mask",则跳过当前循环
        if "mask" in key:
            continue
        # 如果键名中包含 "qkv"
        elif "qkv" in key:
            # 拆分键名为列表
            key_split = key.split(".")
            # 获取层号和块号
            layer_num = int(key_split[1])
            block_num = int(key_split[3])
            # 获取注意力机制的维度
            dim = model.swinv2.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size

            # 如果键名中包含 "weight"
            if "weight" in key:
                # 更新状态字典,设置查询权重
                orig_state_dict[
                    f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"
                ] = val[:dim, :]
                # 更新状态字典,设置键权重
                orig_state_dict[
                    f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"
                ] = val[dim : dim * 2, :]
                # 更新状态字典,设置值权重
                orig_state_dict[
                    f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"
                ] = val[-dim:, :]
            else:
                # 更新状态字典,设置查询偏置
                orig_state_dict[
                    f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"
                ] = val[:dim]
                # 更新状态字典,设置键偏置
                orig_state_dict[
                    f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"
                ] = val[dim : dim * 2]
                # 更新状态字典,设置值偏置
                orig_state_dict[
                    f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"
                ] = val[-dim:]
        else:
            # 对于其余键,通过 rename_key 函数重命名后存储
            orig_state_dict[rename_key(key)] = val

    # 返回更新后的原始状态字典
    return orig_state_dict
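
# ——— 补充示例(非原始脚本内容):qkv 融合权重的拆分方式 ———
# 以第 0 层第 0 个 block 为例,tiny 模型该层的 all_head_size(即代码中的 dim)为 96:
#   timm 键 "layers.0.blocks.0.attn.qkv.weight" 对应的 val 形状为 (3*96, 96) = (288, 96)
#   val[:96, :]     -> ...attention.self.query.weight,形状 (96, 96)
#   val[96:192, :]  -> ...attention.self.key.weight,  形状 (96, 96)
#   val[-96:, :]    -> ...attention.self.value.weight,形状 (96, 96)
# 若存在融合的 "qkv.bias"(形状 (288,)),也会按同样的区间切成 query/key/value 三段偏置。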


# 定义一个函数,用于将 timm 模型的状态字典转换为 swinv2 模型的状态字典
def convert_swinv2_checkpoint(swinv2_name, pytorch_dump_folder_path):
    # 使用 timm 库创建指定预训练模型的模型对象
    timm_model = timm.create_model(swinv2_name, pretrained=True)
    # 将模型设置为评估模式
    timm_model.eval()

    # 获取 swinv2 模型的配置
    config = get_swinv2_config(swinv2_name)
    # 创建 swinv2 模型对象
    model = Swinv2ForImageClassification(config)
    # 将 swinv2 模型设置为评估模式
    model.eval()

    # 转换 timm 模型的状态字典为适应 swinv2 模型的新状态字典
    new_state_dict = convert_state_dict(timm_model.state_dict(), model)
    # 加载新的状态字典到 swinv2 模型中
    model.load_state_dict(new_state_dict)

    # 定义要使用的示例图像的 URL
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"

    # 使用 AutoImageProcessor 从预训练模型加载图像处理器
    image_processor = AutoImageProcessor.from_pretrained("microsoft/{}".format(swinv2_name.replace("_", "-")))
    # 打开图像并转换为 PIL 图像对象
    image = Image.open(requests.get(url, stream=True).raw)
    # 使用图像处理器将图像转换为模型输入的张量表示
    inputs = image_processor(images=image, return_tensors="pt")

    # 使用 timm 模型对输入图像进行推理
    timm_outs = timm_model(inputs["pixel_values"])
    # 使用 swinv2 模型对输入图像进行推理,获取分类 logits
    hf_outs = model(**inputs).logits

    # 断言两个模型输出的值在给定的误差范围内接近
    assert torch.allclose(timm_outs, hf_outs, atol=1e-3)

    # 打印保存模型的信息
    print(f"Saving model {swinv2_name} to {pytorch_dump_folder_path}")
    # 将 swinv2 模型保存到指定路径
    model.save_pretrained(pytorch_dump_folder_path)

    # 打印保存图像处理器的信息
    print(f"Saving image processor to {pytorch_dump_folder_path}")
    # 将图像处理器保存到指定路径
    image_processor.save_pretrained(pytorch_dump_folder_path)

    # 将模型推送到指定的 Hub 仓库
    model.push_to_hub(
        repo_path_or_name=Path(pytorch_dump_folder_path, swinv2_name),
        organization="nandwalritik",
        commit_message="Add model",
    )


# 如果当前脚本作为主程序运行,则执行以下代码
if __name__ == "__main__":
    # 创建解析器对象
    parser = argparse.ArgumentParser()
    # 添加必需的命令行参数
    parser.add_argument(
        "--swinv2_name",  # 定义一个命令行参数,用于指定要转换的Swinv2模型的名称
        default="swinv2_tiny_patch4_window8_256",  # 默认参数值为"swinv2_tiny_patch4_window8_256"
        type=str,  # 参数类型为字符串
        help="Name of the Swinv2 timm model you'd like to convert.",  # 参数的帮助信息,解释了该参数的作用
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",  # 定义另一个命令行参数,用于指定输出PyTorch模型的目录路径
        default=None,  # 默认值为None
        type=str,  # 参数类型为字符串
        help="Path to the output PyTorch model directory."  # 参数的帮助信息,解释了该参数的作用
    )

    args = parser.parse_args()  # 解析命令行参数,将参数存储在args对象中
    convert_swinv2_checkpoint(args.swinv2_name, args.pytorch_dump_folder_path)
    # 调用函数convert_swinv2_checkpoint,传入解析后的参数args中的swinv2_name和pytorch_dump_folder_path作为参数
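
# ——— 补充用法示例(非原始脚本内容)———
# 在命令行中可按如下方式运行本脚本(参数名以上面 argparse 的定义为准):
#   python convert_swinv2_timm_to_pytorch.py \
#       --swinv2_name swinv2_tiny_patch4_window8_256 \
#       --pytorch_dump_folder_path ./swinv2-tiny-patch4-window8-256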

.\models\swinv2\modeling_swinv2.py

# 设置文件编码为 UTF-8
# 版权声明:2022 年由 Microsoft Research 和 The HuggingFace Inc. 团队保留所有权利
#
# 根据 Apache 许可证 2.0 版本授权,除非符合许可证规定,否则不得使用此文件
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则依据“原样”提供,不提供任何明示或暗示的保证或条件
# 请参阅许可证获取详细信息
""" PyTorch Swinv2 Transformer model."""

import collections.abc  # 导入集合抽象基类,用于类型检查
import math  # 导入数学库,用于数学计算
import warnings  # 导入警告模块,用于处理警告
from dataclasses import dataclass  # 导入 dataclass 装饰器,用于创建数据类
from typing import Optional, Tuple, Union  # 导入类型提示相关的模块

import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 的 checkpoint 模块,用于实现模型的内存优化
from torch import Tensor, nn  # 导入 PyTorch 的张量和神经网络模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 导入损失函数

from ...activations import ACT2FN  # 导入激活函数映射
from ...modeling_outputs import BackboneOutput  # 导入模型输出类
from ...modeling_utils import PreTrainedModel  # 导入预训练模型基类
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer  # 导入模型工具函数
from ...utils import (
    ModelOutput,  # 导入模型输出基类
    add_code_sample_docstrings,  # 导入用于添加代码示例文档字符串的函数
    add_start_docstrings,  # 导入用于添加起始文档字符串的函数
    add_start_docstrings_to_model_forward,  # 导入用于模型前向方法的起始文档字符串函数
    logging,  # 导入日志模块
    replace_return_docstrings,  # 导入用于替换返回文档字符串的函数
)
from ...utils.backbone_utils import BackboneMixin  # 导入骨干网络相关的工具函数
from .configuration_swinv2 import Swinv2Config  # 导入 Swinv2 模型的配置类

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

# 用于文档的配置文件名
_CONFIG_FOR_DOC = "Swinv2Config"

# 用于文档的检查点信息
_CHECKPOINT_FOR_DOC = "microsoft/swinv2-tiny-patch4-window8-256"

# 预期输出形状的说明
_EXPECTED_OUTPUT_SHAPE = [1, 64, 768]

# 图像分类检查点信息
_IMAGE_CLASS_CHECKPOINT = "microsoft/swinv2-tiny-patch4-window8-256"

# 图像分类预期输出的示例
_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"

# Swinv2 预训练模型的存档列表
SWINV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/swinv2-tiny-patch4-window8-256",
    # 可在 https://huggingface.co/models?filter=swinv2 查看所有 Swinv2 模型
]

# 以下定义部分来自 https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/swin_transformer_v2.py.

@dataclass
# 从 transformers.models.swin.modeling_swin.SwinEncoderOutput 复制并将 Swin->Swinv2
class Swinv2EncoderOutput(ModelOutput):
    """
    Swinv2 编码器的输出,可能包含隐藏状态和注意力权重。
    """

    # 最后一层模型的隐藏状态,形状为(batch_size, sequence_length, hidden_size)
    last_hidden_state: torch.FloatTensor = None
    # 模型每一层的隐藏状态的元组,形状为(batch_size, sequence_length, hidden_size),可选项,当`output_hidden_states=True`时返回
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # 注意力权重的元组,形状为(batch_size, num_heads, sequence_length, sequence_length),可选项,当`output_attentions=True`时返回
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    # 模型每一层的隐藏状态的元组,形状为(batch_size, hidden_size, height, width),包括空间维度,可选项,当`output_hidden_states=True`且输出被重塑时返回
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 使用 dataclass 装饰器定义 Swinv2ModelOutput 类,它继承自 ModelOutput 类
# ModelOutput 是一个基础类,可能在 transformers 库中定义
@dataclass
# 从 transformers.models.swin.modeling_swin.SwinModelOutput 复制的类定义,将 Swin 替换为 Swinv2
class Swinv2ModelOutput(ModelOutput):
    """
    Swinv2 模型的输出,同时包含最后隐藏状态的池化结果。

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的隐藏状态序列输出。
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, 当 `add_pooling_layer=True` 时返回):
            最后一层隐藏状态的平均池化结果。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
            包含模型每一层隐藏状态的元组,以及初始嵌入输出。
            形状为 `(batch_size, sequence_length, hidden_size)`。
        attentions (`tuple(torch.FloatTensor)`, *optional*, 当 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
            自注意力机制 softmax 后的注意力权重,用于计算自注意力头的加权平均值。
            形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
            包含模型每一层隐藏状态的元组,以及初始嵌入输出,重塑为包含空间维度的形状。
            形状为 `(batch_size, hidden_size, height, width)`。

"""


@dataclass
# 从 transformers.models.swin.modeling_swin.SwinMaskedImageModelingOutput 复制的类定义,将 Swin 替换为 Swinv2
class Swinv2MaskedImageModelingOutput(ModelOutput):
    """
    Swinv2 掩码图像模型的输出。

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
            图像模型的掩码损失(MLM损失)。
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
            重建的像素数值。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            模型在每一层输出的隐藏状态,包括初始嵌入输出。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
            注意力权重经过注意力softmax后的结果,用于计算自注意力头中的加权平均。
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
            模型在每一层输出的隐藏状态,包括重塑以包括空间维度的初始嵌入输出。

    """

    # 定义属性loss,类型为Optional[torch.FloatTensor],默认值为None
    loss: Optional[torch.FloatTensor] = None
    # 定义属性reconstruction,类型为torch.FloatTensor,默认值为None
    reconstruction: torch.FloatTensor = None
    # 定义属性hidden_states,类型为Optional[Tuple[torch.FloatTensor, ...]],默认值为None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # 定义属性attentions,类型为Optional[Tuple[torch.FloatTensor, ...]],默认值为None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    # 定义属性reshaped_hidden_states,类型为Optional[Tuple[torch.FloatTensor, ...]],默认值为None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None

    @property
    def logits(self):
        # 警告信息,提醒logits属性在Transformers的版本5中将被移除,建议使用reconstruction属性获取最终输出。
        warnings.warn(
            "logits attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the reconstruction attribute to retrieve the final output instead.",
            FutureWarning,
        )
        # 返回属性reconstruction的值作为logits属性的输出
        return self.reconstruction
@dataclass
# 从transformers.models.swin.modeling_swin.SwinImageClassifierOutput复制到Swinv2ImageClassifierOutput
class Swinv2ImageClassifierOutput(ModelOutput):
    """
    Swinv2图像分类的输出。

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, 当提供`labels`时返回):
            分类(如果config.num_labels==1则是回归)损失。
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            分类(如果config.num_labels==1则是回归)得分(SoftMax之前)。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当`output_hidden_states=True`时返回或者当`config.output_hidden_states=True`时返回):
            包含每层输出的`torch.FloatTensor`元组,形状为`(batch_size, sequence_length, hidden_size)`。

            每个层的模型隐藏状态加上初始嵌入输出。
        attentions (`tuple(torch.FloatTensor)`, *optional*, 当`output_attentions=True`时返回或者当`config.output_attentions=True`时返回):
            包含每个阶段`torch.FloatTensor`元组,形状为`(batch_size, num_heads, sequence_length, sequence_length)`。

            注意力softmax后的注意力权重,用于计算自注意力头的加权平均值。
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当`output_hidden_states=True`时返回或者当`config.output_hidden_states=True`时返回):
            包含每层输出的`torch.FloatTensor`元组,形状为`(batch_size, hidden_size, height, width)`。

            每个层的模型隐藏状态加上初始嵌入输出,重塑以包含空间维度。
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


# 从transformers.models.swin.modeling_swin.window_partition复制
def window_partition(input_feature, window_size):
    """
    将给定输入分区为窗口。
    """
    batch_size, height, width, num_channels = input_feature.shape
    input_feature = input_feature.view(
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows


# 从transformers.models.swin.modeling_swin.window_reverse复制
def window_reverse(windows, window_size, height, width):
    """
    合并窗口以产生更高分辨率的特征。
    """
    # 获取窗口数组的通道数量
    num_channels = windows.shape[-1]
    # 将窗口数组重塑为指定窗口大小的网格结构
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    # 对重塑后的窗口数组进行维度置换,以便重新排列窗口的顺序
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous()
    # 再次将重排后的窗口数组展平为原始形状
    windows = windows.view(-1, height, width, num_channels)
    # 返回重新排列和重塑后的窗口数组
    return windows
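
# ——— 补充示例(非原始源码):window_partition 与 window_reverse 互为逆操作 ———
# 假设 batch_size=1、特征图大小 8x8、通道数 96、window_size=4:
#   x = torch.randn(1, 8, 8, 96)
#   windows = window_partition(x, 4)             # 形状 (4, 4, 4, 96),共 (8//4)*(8//4)=4 个窗口
#   restored = window_reverse(windows, 4, 8, 8)  # 形状 (1, 8, 8, 96)
#   torch.equal(x, restored)                     # True:在尺寸能被窗口整除时两者互逆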
# Copied from transformers.models.swin.modeling_swin.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    # 如果 drop_prob 为 0 或者不处于训练模式,则直接返回输入
    if drop_prob == 0.0 or not training:
        return input
    # 计算保留概率
    keep_prob = 1 - drop_prob
    # 确定随机张量的形状
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    # 生成均匀分布的随机张量,并进行二值化处理
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    # 对输入进行按元素除法,并应用二值化的随机张量
    output = input.div(keep_prob) * random_tensor
    return output
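
# ——— 补充说明(非原始源码):drop_path 的数值行为 ———
# 以 drop_prob=0.1(keep_prob=0.9)为例:
#   random_tensor = floor(0.9 + U[0, 1)) 对每个样本取 0 或 1,取 1 的概率为 0.9
#   output = input / 0.9 * random_tensor
# 即约 10% 的样本整个残差分支被置零,其余样本按 1/0.9 放大,使输出的期望保持不变;
# 推理时(training=False)直接返回 input,等价于恒等映射。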


# Copied from transformers.models.swin.modeling_swin.SwinDropPath with Swin->Swinv2
class Swinv2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 调用 drop_path 函数,传递当前实例的 drop_prob 属性和训练模式
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->Swinv2
class Swinv2Embeddings(nn.Module):
    """
    Construct the patch and position embeddings. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        # 初始化 Swinv2PatchEmbeddings 实例
        self.patch_embeddings = Swinv2PatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.patch_grid = self.patch_embeddings.grid_size
        # 如果 use_mask_token 为真,则初始化一个用于掩码的张量参数
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        # 根据配置决定是否初始化位置编码张量参数
        if config.use_absolute_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
        else:
            self.position_embeddings = None

        # 初始化 LayerNorm 层和 Dropout 层
        self.norm = nn.LayerNorm(config.embed_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor]:
        # 获取图像块的嵌入表示和输出维度信息
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        # 对嵌入表示进行归一化处理
        embeddings = self.norm(embeddings)
        # 获取批处理大小、序列长度以及嵌入表示的最后一个维度大小
        batch_size, seq_len, _ = embeddings.size()

        # 如果存在掩码位置信息
        if bool_masked_pos is not None:
            # 使用mask_token在整个批次上扩展以替换掩码的视觉标记
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # 创建掩码,使其类型与mask_tokens一致,并在嵌入表示中应用
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # 如果存在位置嵌入,则将其加到嵌入表示中
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings

        # 对嵌入表示进行dropout处理
        embeddings = self.dropout(embeddings)

        # 返回处理后的嵌入表示和输出维度信息
        return embeddings, output_dimensions
# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings with Swin->Swinv2
class Swinv2PatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        # Extract configuration parameters
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.embed_dim
        # Ensure image_size and patch_size are iterable
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        # Calculate number of patches
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        
        # Initialize instance variables
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        # Projection layer: Conv2d for patch embedding
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def maybe_pad(self, pixel_values, height, width):
        # Pad pixel_values if height or width is not divisible by patch_size
        if width % self.patch_size[1] != 0:
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        if height % self.patch_size[0] != 0:
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        # Retrieve dimensions of pixel_values
        _, num_channels, height, width = pixel_values.shape
        # Check if number of channels matches self.num_channels
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # Pad input pixel_values to ensure divisibility by patch_size
        pixel_values = self.maybe_pad(pixel_values, height, width)
        # Project pixel_values into patch embeddings
        embeddings = self.projection(pixel_values)
        # Retrieve dimensions of embeddings after projection
        _, _, height, width = embeddings.shape
        # Flatten embeddings and transpose dimensions for further processing
        embeddings = embeddings.flatten(2).transpose(1, 2)

        # Return embeddings and output dimensions
        return embeddings, (height, width)
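
# ——— 补充示例(非原始源码):Swinv2PatchEmbeddings 的形状变化 ———
# 以 swinv2-tiny(image_size=256, patch_size=4, embed_dim=96)为例:
#   pixel_values: (1, 3, 256, 256)
#   projection(Conv2d, stride=4)输出: (1, 96, 64, 64)
#   flatten(2).transpose(1, 2) 后 embeddings: (1, 64*64, 96) = (1, 4096, 96)
#   返回的 output_dimensions 为 (64, 64)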


class Swinv2PatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """
    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution  # 设置输入分辨率
        self.dim = dim  # 设置维度
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)  # 初始化线性变换层,减少维度
        self.norm = norm_layer(2 * dim)  # 初始化规范化层

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)  # 检查是否需要填充
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)  # 计算填充值
            input_feature = nn.functional.pad(input_feature, pad_values)  # 执行填充操作

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions  # 解析输入尺寸
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape  # 获取输入特征的形状信息

        input_feature = input_feature.view(batch_size, height, width, num_channels)  # 重新组织输入特征的形状
        # pad input to be divisible by width and height, if needed
        input_feature = self.maybe_pad(input_feature, height, width)  # 调用填充函数,确保特征是宽高可整除的
        # [batch_size, height/2, width/2, num_channels]
        input_feature_0 = input_feature[:, 0::2, 0::2, :]  # 提取特征的子块1
        # [batch_size, height/2, width/2, num_channels]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]  # 提取特征的子块2
        # [batch_size, height/2, width/2, num_channels]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]  # 提取特征的子块3
        # [batch_size, height/2, width/2, num_channels]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]  # 提取特征的子块4
        # [batch_size, height/2 * width/2, 4*num_channels]
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)  # 将四个子块合并
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)  # 重新组织合并后的特征形状

        input_feature = self.reduction(input_feature)  # 执行线性变换
        input_feature = self.norm(input_feature)  # 执行规范化操作

        return input_feature  # 返回处理后的特征
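
# ——— 补充示例(非原始源码):Swinv2PatchMerging 的形状变化 ———
# 以第一阶段输出 (batch_size=1, height=64, width=64, dim=96) 为例:
#   输入 input_feature: (1, 64*64, 96) = (1, 4096, 96)
#   取 2x2 邻域的 4 个子块拼接后: (1, 32*32, 4*96) = (1, 1024, 384)
#   经过 reduction(Linear: 4*dim -> 2*dim)并归一化后: (1, 1024, 192)
# 即空间分辨率减半、通道数翻倍,对应 Swin 的层级式下采样。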
# 定义一个名为Swinv2SelfAttention的自定义神经网络模块类
class Swinv2SelfAttention(nn.Module):
    # 定义一个用于将输入张量x转换为注意力分数形状的方法
    def transpose_for_scores(self, x):
        # 计算新的张量形状,保留除了最后一维外的所有维度,并增加注意力头数和每个头的大小
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        # 重新调整张量的形状
        x = x.view(new_x_shape)
        # 调整维度顺序:permute(0, 2, 1, 3) 交换第 1 和第 2 维,把注意力头维度移到序列维度之前
        return x.permute(0, 2, 1, 3)

    # 定义前向传播方法,接受隐藏状态张量、注意力掩码、头部掩码和输出注意力的可选参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 获取输入张量的维度信息
        batch_size, dim, num_channels = hidden_states.shape
        # 使用 self.query 对隐藏状态进行查询操作,生成混合的查询层
        mixed_query_layer = self.query(hidden_states)

        # 使用 self.key 对隐藏状态进行键操作,并转置以便计算注意力分数
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        # 使用 self.value 对隐藏状态进行值操作,并转置以便后续计算上下文层
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        # 对混合的查询层也进行转置以便计算注意力分数
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # 使用余弦相似度计算注意力分数
        attention_scores = nn.functional.normalize(query_layer, dim=-1) @ nn.functional.normalize(
            key_layer, dim=-1
        ).transpose(-2, -1)

        # 对注意力分数进行缩放,使用 torch.clamp 限制缩放因子的最大值
        logit_scale = torch.clamp(self.logit_scale, max=math.log(1.0 / 0.01)).exp()
        attention_scores = attention_scores * logit_scale

        # 使用 MLP 模块计算相对位置偏置,并重新组织形状以匹配注意力分数
        relative_position_bias_table = self.continuous_position_bias_mlp(self.relative_coords_table).view(
            -1, self.num_attention_heads
        )
        relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        # 如果存在注意力遮罩,则将其应用于注意力分数
        if attention_mask is not None:
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            ) + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # 将注意力分数归一化为注意力概率
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # 对注意力概率进行 dropout
        attention_probs = self.dropout(attention_probs)

        # 如果存在头部掩码,则将其应用于注意力概率
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # 计算最终的上下文层,将注意力概率与值层相乘
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()

        # 调整上下文层的形状以符合输出的预期形状
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        # 根据输出设置,构造最终输出结果
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
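
# ——— 补充说明(非原始源码):Swinv2 余弦注意力的计算流程与形状 ———
# 设每个窗口含 N = window_size * window_size 个 token(如 window_size=8 时 N=64):
#   query/key/value 形状均为 (num_windows*batch_size, num_heads, N, head_dim)
#   attention_scores = cos(q, k) * exp(clamp(logit_scale, max=log(100)))   # 余弦相似度 + 可学习缩放
#   再加上 log-CPB 偏置:relative_position_bias = 16 * sigmoid(MLP(relative_coords_table)),形状 (num_heads, N, N)
# 相比 v1 的 (q @ k^T) / sqrt(head_dim) + 查表式相对位置偏置,这是 v2 在注意力部分的主要改动。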
# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->Swinv2
class Swinv2SelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 定义一个线性层,输入和输出维度都为 dim
        self.dense = nn.Linear(dim, dim)
        # 定义一个 Dropout 层,使用配置中的注意力概率作为丢弃概率
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 通过线性层进行变换
        hidden_states = self.dense(hidden_states)
        # 应用 Dropout 进行随机丢弃
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class Swinv2Attention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size, pretrained_window_size=0):
        super().__init__()
        # 初始化自注意力层对象,传入配置、维度、头数、窗口大小等参数
        self.self = Swinv2SelfAttention(
            config=config,
            dim=dim,
            num_heads=num_heads,
            window_size=window_size,
            pretrained_window_size=pretrained_window_size
            if isinstance(pretrained_window_size, collections.abc.Iterable)
            else (pretrained_window_size, pretrained_window_size),
        )
        # 初始化自注意力输出层对象,传入配置和维度参数
        self.output = Swinv2SelfOutput(config, dim)
        # 初始化被修剪的注意力头集合为空集合
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 寻找可修剪的注意力头及其索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 修剪线性层
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并存储修剪后的注意力头
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 调用自注意力层的前向传播函数
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        # 将自注意力输出作为输入,通过自注意力输出层进行变换
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果输出注意力权重,将它们添加到输出元组中
        outputs = (attention_output,) + self_outputs[1:]
        return outputs


# Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->Swinv2
class Swinv2Intermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 定义一个线性层,输入维度为 dim,输出维度为 config.mlp_ratio * dim
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        # 如果配置中的隐藏层激活函数是字符串,使用对应的激活函数;否则使用配置中的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act
    # 定义神经网络的前向传播函数,接受隐藏状态作为输入张量,返回处理后的张量
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 使用全连接层对隐藏状态进行线性变换
        hidden_states = self.dense(hidden_states)
        # 对变换后的隐藏状态应用激活函数
        hidden_states = self.intermediate_act_fn(hidden_states)
        # 返回处理后的隐藏状态张量作为输出
        return hidden_states
# 从 transformers.models.swin.modeling_swin.SwinOutput 复制代码,并将类名中的 Swin 改为 Swinv2
class Swinv2Output(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 创建一个全连接层,将中间层的 int(config.mlp_ratio * dim) 维特征投影回 dim 维
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        # 定义一个 dropout 层,用于随机丢弃神经元,防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 全连接层计算
        hidden_states = self.dense(hidden_states)
        # dropout 操作
        hidden_states = self.dropout(hidden_states)
        return hidden_states


# Swinv2Layer 类定义
class Swinv2Layer(nn.Module):
    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0):
        super().__init__()
        self.input_resolution = input_resolution  # 保存输入分辨率,供 _compute_window_shift 使用
        # 计算窗口大小和位移大小,确保它们不超过输入分辨率
        window_size, shift_size = self._compute_window_shift(
            (config.window_size, config.window_size), (shift_size, shift_size)
        )
        # 设置当前层的窗口大小和位移大小
        self.window_size = window_size[0]
        self.shift_size = shift_size[0]
        
        # 创建 Swinv2Attention 层,用于执行注意力机制
        self.attention = Swinv2Attention(
            config=config,
            dim=dim,
            num_heads=num_heads,
            window_size=self.window_size,
            pretrained_window_size=pretrained_window_size
                if isinstance(pretrained_window_size, collections.abc.Iterable)
                else (pretrained_window_size, pretrained_window_size),
        )
        
        # LayerNorm 层,用于归一化输入数据
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        
        # 如果设置了 drop path rate,则创建 Swinv2DropPath 层;否则创建一个恒等映射(Identity)层
        self.drop_path = Swinv2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
        
        # Swinv2Intermediate 类,用于中间层的计算
        self.intermediate = Swinv2Intermediate(config, dim)
        
        # Swinv2Output 类,用于最终输出的全连接层
        self.output = Swinv2Output(config, dim)
        
        # LayerNorm 层,用于归一化输出数据
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)

    def _compute_window_shift(self, target_window_size, target_shift_size) -> Tuple[Tuple[int, int], Tuple[int, int]]:
        # 计算适应于输入分辨率的窗口大小和位移大小
        window_size = [r if r <= w else w for r, w in zip(self.input_resolution, target_window_size)]
        shift_size = [0 if r <= w else s for r, w, s in zip(self.input_resolution, window_size, target_shift_size)]
        return window_size, shift_size
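
    # ——— 补充示例(非原始源码):_compute_window_shift 的效果 ———
    # 设 config.window_size=8、目标 shift=4:
    #   input_resolution=(64, 64):64 > 8,窗口保持 8,shift 保持 4(正常的移位窗口注意力)
    #   input_resolution=(8, 8):  8 <= 8,窗口取 8(等于分辨率),shift 被置为 0
    # 即当特征图不大于窗口时,窗口收缩到特征图大小并关闭移位。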
    # 返回注意力掩码
    def get_attn_mask(self, height, width, dtype):
        if self.shift_size > 0:
            # 为了实现窗口移位的多头自注意力机制,计算注意力掩码
            # 创建一个全零张量作为初始掩码
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
            # 定义高度和宽度的切片,用于生成窗口之外的掩码
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            # 遍历高度和宽度切片,为每个窗口分配唯一的计数值
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            # 将整个图像分成窗口,并展平为二维数组
            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            # 创建注意力掩码,基于窗口计数之间的差异
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            # 将非零值的位置设为-100.0,零值位置设为0.0
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            # 如果不需要窗口移位,返回空的注意力掩码
            attn_mask = None
        return attn_mask
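
    # ——— 补充示例(非原始源码):get_attn_mask 生成的掩码结构 ———
    # 设填充后的特征图为 8x8、window_size=4、shift_size=2:
    #   img_mask 被高、宽各 3 段切片划分成 9 个区域,编号 0~8
    #   window_partition 后得到 4 个窗口,attn_mask 形状为 (4, 16, 16)
    #   同一区域内的 token 对取 0.0,跨区域的 token 对取 -100.0,
    #   加到注意力分数上后,softmax 会把跨区域的注意力权重压到接近 0。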

    # 对隐藏状态进行可能的填充,使其高度和宽度可被窗口大小整除
    def maybe_pad(self, hidden_states, height, width):
        # 计算需要在右侧和底部填充的像素数,以确保整数倍窗口大小
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        # 定义填充的数值(左、右、上、下)
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        # 使用 PyTorch 的函数进行填充操作
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    # 前向传播函数,接受隐藏状态张量和输入维度,可选的头部掩码和输出注意力
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 解析输入维度
        height, width = input_dimensions
        # 获取隐藏状态的批处理大小和通道数(中间的维度是序列长度 height*width,这里不使用)
        batch_size, _, channels = hidden_states.size()
        # 保存隐藏状态的快捷方式
        shortcut = hidden_states

        # 将隐藏状态重新形状为(batch_size, height, width, channels)
        hidden_states = hidden_states.view(batch_size, height, width, channels)
        # 可能对隐藏状态进行填充以使其成为窗口大小的倍数,并获取填充的值
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
        # 获取填充后的高度和宽度
        _, height_pad, width_pad, _ = hidden_states.shape

        # 如果设定了shift_size,则进行循环移位操作
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # 将移位后的隐藏状态分割成窗口
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        # 将窗口重新形状为(-1, self.window_size * self.window_size, channels)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)

        # 获取注意力掩码
        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
        # 如果注意力掩码存在,则将其移动到hidden_states_windows的设备上
        if attn_mask is not None:
            attn_mask = attn_mask.to(hidden_states_windows.device)

        # 应用注意力机制
        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )

        # 获取注意力输出
        attention_output = attention_outputs[0]

        # 将注意力输出重新形状为(-1, self.window_size, self.window_size, channels)
        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)

        # 将窗口反转恢复为原始形状
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # 如果设定了shift_size,则反转循环移位操作
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        # 检查是否进行了填充,如果是,则截取有效部分
        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        # 将窗口形状重新调整为(batch_size, height * width, channels)
        attention_windows = attention_windows.view(batch_size, height * width, channels)

        # Swinv2 采用 res-post-norm:先对注意力分支的输出做 layernorm,再与 shortcut 相加
        hidden_states = self.layernorm_before(attention_windows)
        # 添加shortcut并进行drop_path操作
        hidden_states = shortcut + self.drop_path(hidden_states)

        # 应用中间层操作
        layer_output = self.intermediate(hidden_states)
        # 应用输出层操作
        layer_output = self.output(layer_output)
        # 应用layernorm后处理并添加drop_path
        layer_output = hidden_states + self.drop_path(self.layernorm_after(layer_output))

        # 如果需要输出注意力信息,则返回注意力输出
        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        # 返回层输出
        return layer_outputs
# 定义 Swinv2Stage 类,作为 Swin Transformer V2 模型的一个阶段
class Swinv2Stage(nn.Module):
    # 初始化方法
    def __init__(
        self, config, dim, input_resolution, depth, num_heads, drop_path, downsample, pretrained_window_size=0
    ):
        super().__init__()
        self.config = config  # 保存配置参数
        self.dim = dim  # 特征维度
        blocks = []
        # 循环创建指定数量的 Swinv2Layer 块
        for i in range(depth):
            # 创建 Swinv2Layer 块并添加到 blocks 列表中
            block = Swinv2Layer(
                config=config,
                dim=dim,
                input_resolution=input_resolution,
                num_heads=num_heads,
                shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                pretrained_window_size=pretrained_window_size,
            )
            blocks.append(block)
        self.blocks = nn.ModuleList(blocks)  # 将 blocks 转为 nn.ModuleList

        # 如果有下采样层,则初始化下采样方法
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False  # 初始化指向状态为 False

    # 前向传播方法
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        height, width = input_dimensions  # 获取输入图片的高度和宽度
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None  # 获取当前层的注意力掩码

            # 调用每个 Swinv2Layer 块的 forward 方法进行前向传播
            layer_outputs = layer_module(
                hidden_states,
                input_dimensions,
                layer_head_mask,
                output_attentions,
            )

            hidden_states = layer_outputs[0]  # 更新隐藏状态为当前层的输出

        hidden_states_before_downsampling = hidden_states  # 保存下采样前的隐藏状态
        if self.downsample is not None:
            # 计算下采样后的图片尺寸
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)  # 输出尺寸信息
            # 调用下采样方法对隐藏状态进行下采样处理
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)  # 如果没有下采样,输出尺寸不变

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)  # 阶段输出信息

        if output_attentions:
            stage_outputs += layer_outputs[1:]  # 如果需要输出注意力信息,则将其添加到输出中
        return stage_outputs  # 返回阶段的输出结果
# Swinv2Encoder:由多个 Swinv2Stage 堆叠而成的编码器(下方 Swinv2Model 通过 self.encoder 使用)
class Swinv2Encoder(nn.Module):
    # 初始化函数,用于创建一个 Swinv2 编码器
    def __init__(self, config, grid_size, pretrained_window_sizes=(0, 0, 0, 0)):
        # 调用父类的初始化方法
        super().__init__()
        # 计算模型的层数
        self.num_layers = len(config.depths)
        # 将配置信息保存到对象中
        self.config = config
        # 如果配置中指定了预训练窗口大小,则使用配置中的值
        if self.config.pretrained_window_sizes is not None:
            pretrained_window_sizes = config.pretrained_window_sizes
        # 生成一个按照 config.drop_path_rate 线性分布的列表,并转换成 Python 列表
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]

        # 初始化一个空列表用于保存每个阶段的 Swin Transformer 层
        layers = []
        # 遍历每个层
        for i_layer in range(self.num_layers):
            # 创建一个 Swin Transformer 的阶段(stage)
            stage = Swinv2Stage(
                config=config,
                # 设置当前层的维度大小为 config.embed_dim * 2^i_layer
                dim=int(config.embed_dim * 2**i_layer),
                # 设置输入分辨率为原始网格大小除以 2^i_layer
                input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                # 设置当前层的深度为 config.depths[i_layer]
                depth=config.depths[i_layer],
                # 设置当前层的注意力头数为 config.num_heads[i_layer]
                num_heads=config.num_heads[i_layer],
                # 设置当前层的 drop path 策略
                drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                # 如果不是最后一层,则进行下采样
                downsample=Swinv2PatchMerging if (i_layer < self.num_layers - 1) else None,
                # 设置当前层的预训练窗口大小
                pretrained_window_size=pretrained_window_sizes[i_layer],
            )
            # 将当前创建的阶段加入到层列表中
            layers.append(stage)
        # 将所有的阶段组成的层列表转换为 nn.ModuleList 类型,并保存到对象的 layers 属性中
        self.layers = nn.ModuleList(layers)

        # 默认关闭梯度检查点
        self.gradient_checkpointing = False

    # 前向传播函数,定义了模型的前向计算逻辑
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        return_dict: Optional[bool] = True,
# 从transformers.models.swin.modeling_swin.SwinPreTrainedModel复制的代码,并将Swin->Swinv2,swin->swinv2
class Swinv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 指定该模型使用的配置类
    config_class = Swinv2Config
    # 基础模型的前缀名称
    base_model_prefix = "swinv2"
    # 主要输入的名称
    main_input_name = "pixel_values"
    # 支持梯度检查点的标志
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        # 如果模块是线性层或卷积层
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # 使用正态分布初始化权重,平均值为0,标准差为self.config.initializer_range
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果存在偏置项,则初始化为零
            if module.bias is not None:
                module.bias.data.zero_()
        # 如果模块是LayerNorm层
        elif isinstance(module, nn.LayerNorm):
            # 初始化偏置项为零
            module.bias.data.zero_()
            # 初始化权重为1
            module.weight.data.fill_(1.0)


# SWINV2_START_DOCSTRING文档字符串
SWINV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Swinv2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# SWINV2_INPUTS_DOCSTRING文档字符串
SWINV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

@add_start_docstrings(
    "The bare Swinv2 Model transformer outputting raw hidden-states without any specific head on top.",
    SWINV2_START_DOCSTRING,
)
# 从transformers.models.swin.modeling_swin.SwinModel复制并修改为Swinv2Model,SWIN->SWINV2,Swin->Swinv2
class Swinv2Model(Swinv2PreTrainedModel):
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        super().__init__(config)
        self.config = config
        # 计算模型的层数和特征维度
        self.num_layers = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        # 初始化嵌入层和编码器
        self.embeddings = Swinv2Embeddings(config, use_mask_token=use_mask_token)
        self.encoder = Swinv2Encoder(config, self.embeddings.patch_grid)

        # 初始化层归一化和池化层(如果指定添加池化层)
        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # 初始化权重并进行最终处理
        self.post_init()

    def get_input_embeddings(self):
        # 返回输入嵌入的Patch嵌入层
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        对模型的注意力头进行剪枝。
        heads_to_prune: {layer_num: 需要在该层剪枝的头列表} 参见基类PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Swinv2ModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # 输入参数详见SWINV2_INPUTS_DOCSTRING,传入像素值、布尔掩码位置、头掩码等信息
        # 返回Swinv2ModelOutput类型的预期输出
        ) -> Union[Tuple, Swinv2ModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        # 根据需要设定是否输出注意力权重
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 根据需要设定是否输出隐藏状态
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 根据需要设定是否使用返回字典形式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            # 如果未提供像素值,则抛出数值错误
            raise ValueError("You have to specify pixel_values")

        # 准备头部遮罩(如果需要)
        # head_mask 中的 1.0 表示保留该头部
        # attention_probs 的形状为 bsz x n_heads x N x N
        # 输入的 head_mask 形状为 [num_heads] 或 [num_hidden_layers x num_heads]
        # 将 head_mask 转换为形状 [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, len(self.config.depths))

        # 嵌入层输出和输入尺寸
        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        # 编码器输出
        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 序列输出
        sequence_output = encoder_outputs[0]
        # 序列输出进行 LayerNorm 处理
        sequence_output = self.layernorm(sequence_output)

        pooled_output = None
        if self.pooler is not None:
            # 如果存在池化器,则计算池化输出
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            # 如果不使用返回字典形式,则返回元组形式的输出
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        # 使用 Swinv2ModelOutput 类构建返回字典形式的输出
        return Swinv2ModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )
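
# ——— 补充用法示例(非原始源码)———
# from transformers import AutoImageProcessor, Swinv2Model
# import torch
#
# processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
# model = Swinv2Model.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
# inputs = processor(images=image, return_tensors="pt")   # image 为一张 PIL.Image 图像
# with torch.no_grad():
#     outputs = model(**inputs)
# print(outputs.last_hidden_state.shape)  # torch.Size([1, 64, 768]),与上面的 _EXPECTED_OUTPUT_SHAPE 一致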
@add_start_docstrings(
    """
    Swinv2 Model with a decoder on top for masked image modeling, as proposed in
    [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """,
    SWINV2_START_DOCSTRING,
)
# 定义 Swinv2ForMaskedImageModeling 类,用于进行面向掩膜图像建模的解码器模型
# 该类基于 Swinv2PreTrainedModel,并包含了 Swinv2 模型和一个解码器
class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # 初始化 Swinv2 模型,设置不添加池化层并使用掩膜令牌
        self.swinv2 = Swinv2Model(config, add_pooling_layer=False, use_mask_token=True)

        # 计算特征数量用于解码器
        num_features = int(config.embed_dim * 2 ** (config.num_layers - 1))
        
        # 定义解码器的结构
        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # 初始化权重并应用最终处理
        self.post_init()
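
    # ——— 补充说明(非原始源码):解码器的上采样形状(以 swinv2-tiny、输入 256x256 为例)———
    #   编码器最后一层特征: (batch_size, 768, 8, 8),其中 num_features = 96 * 2**3 = 768
    #   1x1 Conv 输出:      (batch_size, encoder_stride**2 * 3, 8, 8) = (batch_size, 3072, 8, 8)(假设 encoder_stride=32)
    #   PixelShuffle(32) 后: (batch_size, 3, 256, 256),即重建回原图分辨率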

    @add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Swinv2MaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
    # 重写 forward 方法,接收输入并返回 Swinv2MaskedImageModelingOutput 类型的输出
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # 输入参数详见 SWINV2_INPUTS_DOCSTRING



@add_start_docstrings(
    """
    Swinv2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    """,
    SWINV2_START_DOCSTRING,
)
# 定义 Swinv2ForImageClassification 类,用于图像分类的 Swinv2 模型
# 该类基于 Swinv2PreTrainedModel,并包含了 Swinv2 模型和分类器头部
class Swinv2ForImageClassification(Swinv2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # Number of target classes
        self.num_labels = config.num_labels

        # Instantiate the Swinv2 backbone
        self.swinv2 = Swinv2Model(config)

        # Classification head
        self.classifier = (
            nn.Linear(self.swinv2.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=Swinv2ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    # forward takes the inputs below and returns a Swinv2ImageClassifierOutput
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,  # input pixel values, optional
        head_mask: Optional[torch.FloatTensor] = None,  # mask to nullify selected attention heads, optional
        labels: Optional[torch.LongTensor] = None,  # labels for classification/regression, optional
        output_attentions: Optional[bool] = None,  # whether to return attention tensors, optional
        output_hidden_states: Optional[bool] = None,  # whether to return hidden states, optional
        return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple, optional
    ) -> Union[Tuple, Swinv2ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Decide whether to return a dict-style output
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Forward pass through the Swinv2 backbone
        outputs = self.swinv2(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Pooled output (second element of the model outputs)
        pooled_output = outputs[1]

        # Apply the classifier head to the pooled output to obtain logits
        logits = self.classifier(pooled_output)

        # Loss defaults to None
        loss = None

        # Compute the loss if labels are provided
        if labels is not None:
            # If the problem type is not set, infer it from the number of labels and the label dtype
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            # Select the loss function according to the problem type
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        # If return_dict is disabled, return the logits together with the remaining outputs as a tuple
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # Wrap everything in a Swinv2ImageClassifierOutput
        return Swinv2ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )
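
# --- Usage sketch (added, not part of the original file): image classification with Swinv2. ---
# Assumes the "microsoft/swinv2-tiny-patch4-window8-256" checkpoint (which ships with an
# ImageNet-1k classification head) and a working internet connection.

def _demo_swinv2_image_classification():
    import requests
    import torch
    from PIL import Image
    from transformers import AutoImageProcessor, Swinv2ForImageClassification

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
    model = Swinv2ForImageClassification.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

    with torch.no_grad():
        logits = model(**processor(image, return_tensors="pt")).logits
    print(model.config.id2label[logits.argmax(-1).item()])
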
@add_start_docstrings(
    """
    Swinv2 backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    SWINV2_START_DOCSTRING,
)
class Swinv2Backbone(Swinv2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        # Initialize the base PreTrainedModel with the config
        super().__init__(config)
        # Initialize backbone-specific attributes (stage names, out_features, ...)
        super()._init_backbone(config)

        # Feature dimensions per stage: the embedding dim followed by embed_dim * 2**i for each stage
        self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
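        # For example, with the swinv2-tiny defaults (embed_dim=96, depths=[2, 2, 6, 2]) this
        # gives num_features = [96, 96, 192, 384, 768]: one entry for the stem plus one per stage.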
        
        # Swinv2 embedding layer
        self.embeddings = Swinv2Embeddings(config)
        
        # Swinv2 encoder, built from the embedding layer's patch grid
        self.encoder = Swinv2Encoder(config, self.embeddings.patch_grid)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Return the patch embeddings as the input embeddings
        return self.embeddings.patch_embeddings

    @add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns a `BackboneOutput` for the given inputs.

        Args:
            return_dict (`bool`, *optional*): Whether to return a `ModelOutput` instead of a plain tuple; defaults to the config setting.
            output_hidden_states (`bool`, *optional*): Whether to return the hidden states of all layers; defaults to the config setting.
            output_attentions (`bool`, *optional*): Whether to return the attention weights of all stages; defaults to the config setting.

        Returns:
            `BackboneOutput`: An object containing the feature maps, hidden states, and attention weights.

        Examples:

        ```
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/swinv2-tiny-patch4-window8-256", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 2048, 7, 7]
        ```
        """
        # Fall back to the config setting if return_dict is None
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Fall back to the config setting if output_hidden_states is None
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # Fall back to the config setting if output_attentions is None
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        # Embed the input pixel values and get the resulting input dimensions
        embedding_output, input_dimensions = self.embeddings(pixel_values)

        # Run the encoder, keeping hidden states before downsampling so each stage can be exposed
        outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=None,
            output_attentions=output_attentions,
            output_hidden_states=True,
            output_hidden_states_before_downsampling=True,
            return_dict=return_dict,
        )

        # Pick the reshaped hidden states, depending on whether a dict or a tuple was returned
        hidden_states = outputs.reshaped_hidden_states if return_dict else outputs[-1]

        # Collect the feature maps for the requested stages
        feature_maps = ()
        # Keep only the hidden states whose stage name appears in out_features
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                feature_maps += (hidden_state,)

        # If return_dict is disabled, assemble the outputs as a tuple in the documented order
        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs[1],)
            if output_attentions:
                output += (outputs[2],)
            return output

        # Return a BackboneOutput with feature maps, hidden states, and attention weights
        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )