Transformers 源码解析（四十三）

`.\models\efficientformer\image_processing_efficientformer.py`

# 导入所需模块和类，包括类型提示和必要的功能函数
from typing import Dict, List, Optional, Union

import numpy as np  # 导入numpy库，用于数值计算

# 导入图像处理所需的工具函数和类
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
    get_resize_output_image_size,  # 导入获取调整后图像尺寸的函数
    resize,  # 导入图像调整大小的函数
    to_channel_dimension_format,  # 导入将图像转换为通道维度格式的函数
)
from ...image_utils import (
    IMAGENET_DEFAULT_MEAN,  # 导入图像处理中的默认均值
    IMAGENET_DEFAULT_STD,   # 导入图像处理中的默认标准差
    ChannelDimension,       # 导入通道维度相关的枚举
    ImageInput,             # 导入图像输入的相关类型
    PILImageResampling,     # 导入PIL图像重采样相关的枚举
    infer_channel_dimension_format,  # 导入推断图像通道维度格式的函数
    is_batched,             # 导入判断图像是否批处理的函数
    is_scaled_image,        # 导入判断图像是否已经缩放的函数
    to_numpy_array,         # 导入将图像转换为numpy数组的函数
    valid_images,           # 导入验证图像是否有效的函数
    validate_kwargs,        # 导入验证关键字参数的函数
    validate_preprocess_arguments,  # 导入验证预处理参数的函数
)
from ...utils import TensorType, logging  # 导入TensorType和日志记录相关的工具函数

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器对象


class EfficientFormerImageProcessor(BaseImageProcessor):
    r"""
    Constructs an EfficientFormer image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
            size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
            `preprocess` method.
        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
            method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_center_crop: bool = True,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        crop_size: Optional[Dict[str, int]] = None,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ) -> None:
        # Extra keyword arguments are forwarded untouched to the base image processor.
        super().__init__(**kwargs)

        # Fall back to 224x224 for both the resize target and the crop window, then let
        # `get_size_dict` normalize whatever the caller passed into a size dictionary.
        size = size if size is not None else {"height": 224, "width": 224}
        size = get_size_dict(size)
        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")

        # Switches for the individual preprocessing steps.
        self.do_resize = do_resize
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.do_center_crop = do_center_crop

        # Parameters of those steps.
        self.crop_size = crop_size
        self.size = size
        self.resample = resample
        self.rescale_factor = rescale_factor
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD

        # Keyword names recorded as valid for this processor.
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image to `(size["height"], size["width"])`, or so that its shortest edge matches
        `size["shortest_edge"]` when that key is given.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` or `{"shortest_edge": int}` specifying the
                size of the output image.
            resample:
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.

        Returns:
            `np.ndarray`: The resized image.

        Raises:
            ValueError: If `size` contains neither `"shortest_edge"` nor both `"height"` and `"width"`.
        """
        size = get_size_dict(size)

        if "shortest_edge" in size:
            # Derive the concrete output size from the requested shortest edge.
            size = get_resize_output_image_size(
                image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
            )
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
            raise ValueError(f"Size must contain 'height' and 'width' keys or 'shortest_edge' key. Got {size.keys()}")
        return resize(
            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
        )

`.\models\efficientformer\modeling_efficientformer.py`

# coding=utf-8
# 版权 2022 年 Snapchat 研究团队和 HuggingFace Inc. 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本（“许可证”）许可;
# 除非符合许可证的规定，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则软件按“原样”分发，
# 不提供任何明示或暗示的担保或条件。
# 请参阅许可证了解特定语言下的权限和限制。

""" PyTorch EfficientFormer 模型。"""

import itertools
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_efficientformer import EfficientFormerConfig


logger = logging.get_logger(__name__)

# General docstring
# Name of the configuration class as a string.
_CONFIG_FOR_DOC = "EfficientFormerConfig"

# Base docstring
# Checkpoint identifier and expected output shape for the base model.
# NOTE(review): presumably consumed by the docstring decorators imported above — confirm at the use sites.
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]

# Image classification docstring
# Checkpoint identifier and expected label for the image-classification example.
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"


# Identifiers of the pretrained EfficientFormer checkpoints listed by this module.
EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "snap-research/efficientformer-l1-300",
    # See all EfficientFormer models at https://huggingface.co/models?filter=efficientformer
]


class EfficientFormerPatchEmbeddings(nn.Module):
    """
    Downsampling layer placed between two stages.

    A convolution configured by `config.downsample_patch_size`, `config.downsample_stride` and
    `config.downsample_pad` projects a `[batch_size, num_channels, height, width]` input to `embed_dim` channels.
    The result is batch-normalized unless `apply_norm` is `False`.
    """

    def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True):
        super().__init__()
        # Remembered so `forward` can validate the channel dimension of its input.
        self.num_channels = num_channels

        self.projection = nn.Conv2d(
            in_channels=num_channels,
            out_channels=embed_dim,
            kernel_size=config.downsample_patch_size,
            stride=config.downsample_stride,
            padding=config.downsample_pad,
        )
        if apply_norm:
            self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps)
        else:
            self.norm = nn.Identity()

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Project `pixel_values` with the convolution and apply the (optional) normalization.

        Raises:
            ValueError: If the channel dimension of `pixel_values` differs from `num_channels`.
        """
        # Unpacking exactly four values also rejects inputs that are not 4-dimensional.
        _, channels, _, _ = pixel_values.shape
        if channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        return self.norm(self.projection(pixel_values))
class EfficientFormerSelfAttention(nn.Module):
    """
    Multi-head self-attention with a learned, per-head additive bias for every pair of positions on a
    `resolution x resolution` grid.

    Biases are shared between position pairs that have the same absolute (row, column) offset. In evaluation mode
    the gathered bias tensor is cached on the module as `ab`; in training mode it is gathered on every call.

    Args:
        dim (`int`): Size of the last dimension of the input and of the output.
        key_dim (`int`): Per-head size of queries and keys.
        num_heads (`int`): Number of attention heads.
        attention_ratio (`int`): Per-head value size is `attention_ratio * key_dim`.
        resolution (`int`): Side length of the position grid; the sequence length must be `resolution ** 2`.
    """

    def __init__(self, dim: int, key_dim: int, num_heads: int, attention_ratio: int, resolution: int):
        super().__init__()

        self.num_heads = num_heads
        self.key_dim = key_dim
        self.attention_ratio = attention_ratio
        self.scale = key_dim**-0.5
        self.total_key_dim = key_dim * num_heads
        self.expanded_key_dim = int(attention_ratio * key_dim)
        self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)

        # One fused projection produces queries and keys (total_key_dim each) plus values.
        hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2
        self.qkv = nn.Linear(dim, hidden_size)
        self.projection = nn.Linear(self.total_expanded_key_dim, dim)

        # Enumerate every grid position and give each distinct absolute offset its own bias slot.
        points = list(itertools.product(range(resolution), range(resolution)))
        num_points = len(points)
        attention_offsets = {}
        idxs = []
        for point_1 in points:
            for point_2 in points:
                offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])

        self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets)))
        # Lookup table mapping a (position, position) pair to its bias slot.
        self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(num_points, num_points))

    @torch.no_grad()
    def train(self, mode=True):
        """Switch between training and evaluation mode and keep the `ab` cache in sync.

        Entering training mode drops the cache; entering evaluation mode rebuilds it. Returns `self`, like
        `nn.Module.train`, so that `module.train()` / `module.eval()` can be chained.
        """
        super().train(mode)

        if mode:
            # The biases are being learned, so a cached copy would go stale.
            if hasattr(self, "ab"):
                del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]
        return self

    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
        """Apply self-attention to `hidden_states` of shape `(batch_size, sequence_length, dim)`.

        Returns:
            A tuple `(context_layer,)`, or `(context_layer, attention_probs)` when `output_attentions` is set.
        """
        batch_size, sequence_length, num_channels = hidden_states.shape

        qkv = self.qkv(hidden_states)
        query_layer, key_layer, value_layer = qkv.reshape(batch_size, sequence_length, self.num_heads, -1).split(
            [self.key_dim, self.key_dim, self.expanded_key_dim], dim=3
        )
        # (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim)
        query_layer = query_layer.permute(0, 2, 1, 3)
        key_layer = key_layer.permute(0, 2, 1, 3)
        value_layer = value_layer.permute(0, 2, 1, 3)

        if self.training:
            attention_bias = self.attention_biases[:, self.attention_bias_idxs]
        else:
            # The cache is normally built by `train(False)`; build it here if the flag was toggled directly.
            if not hasattr(self, "ab"):
                with torch.no_grad():
                    self.ab = self.attention_biases[:, self.attention_bias_idxs]
            # The module may have been moved to another device after the cache was built.
            self.ab = self.ab.to(self.attention_biases.device)
            attention_bias = self.ab

        attention_probs = (torch.matmul(query_layer, key_layer.transpose(-2, -1))) * self.scale + attention_bias
        attention_probs = attention_probs.softmax(dim=-1)

        # (batch, heads, seq, value_dim) -> (batch, seq, heads * value_dim)
        context_layer = torch.matmul(attention_probs, value_layer).transpose(1, 2)
        context_layer = context_layer.reshape(batch_size, sequence_length, self.total_expanded_key_dim)
        context_layer = self.projection(context_layer)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
# 定义一个 EfficientFormerConvStem 类，继承自 nn.Module 类
class EfficientFormerConvStem(nn.Module):
    """
    Convolutional stem: two 3x3, stride-2 convolutions, each followed by batch normalization and ReLU.

    The first convolution maps `config.num_channels` to `out_channels // 2` channels, the second maps those to
    `out_channels`.
    """

    def __init__(self, config: EfficientFormerConfig, out_channels: int):
        super().__init__()
        mid_channels = out_channels // 2
        norm_eps = config.batch_norm_eps

        self.convolution1 = nn.Conv2d(config.num_channels, mid_channels, kernel_size=3, stride=2, padding=1)
        self.batchnorm_before = nn.BatchNorm2d(mid_channels, eps=norm_eps)

        self.convolution2 = nn.Conv2d(mid_channels, out_channels, kernel_size=3, stride=2, padding=1)
        self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=norm_eps)

        # A single ReLU instance is shared by both conv/norm pairs.
        self.activation = nn.ReLU()

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Run `pixel_values` through both conv -> batch norm -> ReLU pairs and return the features."""
        features = pixel_values
        for convolution, batchnorm in (
            (self.convolution1, self.batchnorm_before),
            (self.convolution2, self.batchnorm_after),
        ):
            features = self.activation(batchnorm(convolution(features)))
        return features


# 定义一个 EfficientFormerPooling 类，继承自 nn.Module 类
class EfficientFormerPooling(nn.Module):
    """
    Token mixer based on average pooling.

    Uses a stride-1 average pool with `pool_size // 2` padding (padded cells are excluded from the average) and
    returns the pooled tensor minus its input.
    """

    def __init__(self, pool_size: int):
        super().__init__()
        self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `pool(hidden_states) - hidden_states`."""
        pooled = self.pool(hidden_states)
        return pooled - hidden_states


# 定义一个 EfficientFormerDenseMlp 类，继承自 nn.Module 类
class EfficientFormerDenseMlp(nn.Module):
    """
    Two-layer feed-forward network built from linear layers.

    Computes linear -> activation -> dropout -> linear -> dropout. `hidden_features` and `out_features` fall back
    to `in_features` when they are not given (or are falsy).
    """

    def __init__(
        self,
        config: EfficientFormerConfig,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
    ):
        super().__init__()
        hidden_width = hidden_features or in_features
        output_width = out_features or in_features

        self.linear_in = nn.Linear(in_features, hidden_width)
        # Activation looked up by name from the configuration.
        self.activation = ACT2FN[config.hidden_act]
        # The same dropout module is applied after the activation and after the output projection.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.linear_out = nn.Linear(hidden_width, output_width)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the feed-forward network to `hidden_states`."""
        expanded = self.dropout(self.activation(self.linear_in(hidden_states)))
        return self.dropout(self.linear_out(expanded))


# 定义一个 EfficientFormerConvMlp 类，继承自 nn.Module 类
class EfficientFormerConvMlp(nn.Module):
    """
    Two-layer feed-forward network built from 1x1 convolutions.

    Computes conv -> batch norm -> activation -> dropout -> conv -> batch norm -> dropout. `hidden_features` and
    `out_features` fall back to `in_features` when they are not given (or are falsy).

    Args:
        config (`EfficientFormerConfig`): Provides `hidden_act` and `batch_norm_eps`.
        in_features (`int`): Number of input channels.
        hidden_features (`int`, *optional*): Number of channels after the first convolution.
        out_features (`int`, *optional*): Number of output channels.
        drop (`float`, *optional*, defaults to 0.0): Dropout probability.
    """

    def __init__(
        self,
        config: EfficientFormerConfig,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        drop: float = 0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.convolution1 = nn.Conv2d(in_features, hidden_features, 1)
        # Activation looked up by name from the configuration.
        self.activation = ACT2FN[config.hidden_act]
        self.convolution2 = nn.Conv2d(hidden_features, out_features, 1)
        # The same dropout module is applied after the activation and after the second batch norm.
        self.dropout = nn.Dropout(drop)

        self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps)
        self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """Apply the convolutional feed-forward network to `hidden_state`."""
        hidden_state = self.convolution1(hidden_state)
        hidden_state = self.batchnorm_before(hidden_state)

        hidden_state = self.activation(hidden_state)
        hidden_state = self.dropout(hidden_state)
        hidden_state = self.convolution2(hidden_state)

        hidden_state = self.batchnorm_after(hidden_state)
        hidden_state = self.dropout(hidden_state)

        return hidden_state
# Copied from transformers.models.convnext.modeling_convnext.drop_path
def drop_path(input: torch.Tensor, drop_prob: Optional[float] = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.

    Args:
        input (`torch.Tensor`): Tensor whose first dimension indexes the samples.
        drop_prob (`float`, *optional*, defaults to 0.0): Probability of zeroing a sample. `None` is treated like
            `0.0`, because `EfficientFormerDropPath` uses `None` as its default.
        training (`bool`, *optional*, defaults to `False`): Paths are only dropped when this is `True`.

    Returns:
        `torch.Tensor`: `input` itself when nothing is dropped, otherwise a tensor in which each sample is either
        zeroed or scaled by `1 / (1 - drop_prob)`.
    """
    if not training or drop_prob is None or drop_prob == 0.0:
        return input
    keep_prob = 1 - drop_prob
    # One random draw per sample, broadcast over all remaining dimensions.
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize: 1 with probability keep_prob, else 0
    # Rescale the kept samples so the expected value is unchanged.
    output = input.div(keep_prob) * random_tensor
    return output


# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->EfficientFormer
class EfficientFormerDropPath(nn.Module):
    """Module wrapper around `drop_path`: drops paths (Stochastic Depth) per sample while in training mode."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Delegate to the functional implementation, using this module's own training flag.
        return drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)

    def extra_repr(self) -> str:
        # Shown inside the module's `repr`.
        return f"p={self.drop_prob}"


class EfficientFormerFlat(nn.Module):
    """Flattens every dimension from index 2 onwards and swaps the resulting dimension with dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
        # e.g. (batch, channels, height, width) -> (batch, height * width, channels)
        flattened = torch.flatten(hidden_states, start_dim=2)
        return flattened.transpose(1, 2)


class EfficientFormerMeta3D(nn.Module):
    """
    Transformer-style block operating on `(batch_size, sequence_length, dim)` tensors.

    Applies layer norm -> self-attention and layer norm -> MLP, each wrapped in a residual connection with
    optional drop path and optional learnable per-channel layer scale.

    Args:
        config (`EfficientFormerConfig`): Provides the attention, normalization, MLP and layer-scale settings.
        dim (`int`): Size of the last dimension of the hidden states handled by this block.
        drop_path (`float`, *optional*, defaults to 0.0): Drop-path probability; `0.0` disables drop path.
    """

    def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
        super().__init__()

        # The token mixer must use the same width as the layer norms, MLP and layer scales below,
        # so it is built from `dim` rather than from `config.dim`.
        self.token_mixer = EfficientFormerSelfAttention(
            dim=dim,
            key_dim=config.key_dim,
            num_heads=config.num_attention_heads,
            attention_ratio=config.attention_ratio,
            resolution=config.resolution,
        )

        self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)

        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
        self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim)

        self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.use_layer_scale = config.use_layer_scale
        if config.use_layer_scale:
            # Learnable per-channel scales for the attention branch and the MLP branch.
            self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)
            self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)

    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
        """Run the block.

        Returns:
            A tuple `(layer_output,)`, followed by whatever extra outputs the token mixer returned (the attention
            probabilities when `output_attentions` is set).
        """
        self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions)
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # attention probabilities, if they were requested

        if self.use_layer_scale:
            # unsqueeze twice so the (dim,) scale broadcasts over batch and sequence dimensions
            layer_output = hidden_states + self.drop_path(
                self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attention_output
            )
            layer_output = layer_output + self.drop_path(
                self.layer_scale_2.unsqueeze(0).unsqueeze(0) * self.mlp(self.layernorm2(layer_output))
            )
        else:
            layer_output = hidden_states + self.drop_path(attention_output)
            layer_output = layer_output + self.drop_path(self.mlp(self.layernorm2(layer_output)))

        outputs = (layer_output,) + outputs

        return outputs
# 定义一个 EfficientFormerMeta4DLayers 类，继承自 nn.Module 类
class EfficientFormerMeta3DLayers(nn.Module):
    """
    Stack of `EfficientFormerMeta3D` blocks, built with `config.hidden_sizes[-1]` as their width.

    Args:
        config (`EfficientFormerConfig`): Provides `drop_path_rate`, `depths`, `num_meta3d_blocks` and
            `hidden_sizes`.
    """

    def __init__(self, config: EfficientFormerConfig):
        super().__init__()
        # Drop-path value per block; the block index is offset by the depth of all earlier stages.
        drop_paths = [
            config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
            for block_idx in range(config.num_meta3d_blocks)
        ]
        self.blocks = nn.ModuleList(
            [EfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path) for drop_path in drop_paths]
        )

    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
        """Run all blocks in order.

        Returns:
            With `output_attentions`, a tuple of the final hidden states followed by one attention tensor per
            block; otherwise the output of the last block unchanged.
        """
        all_attention_outputs = () if output_attentions else None

        for layer_module in self.blocks:
            # Each block returns a tuple; only its first element is fed to the next block.
            if isinstance(hidden_states, tuple):
                hidden_states = hidden_states[0]

            # The flag must be forwarded, otherwise the block returns no attention tensor to collect.
            hidden_states = layer_module(hidden_states, output_attentions)

            if output_attentions:
                all_attention_outputs = all_attention_outputs + (hidden_states[1],)

        if output_attentions:
            outputs = (hidden_states[0],) + all_attention_outputs
            return outputs

        return hidden_states


class EfficientFormerMeta4DLayers(nn.Module):
    """
    Stack of `EfficientFormerMeta4D` blocks for one stage.

    NOTE(review): `EfficientFormerMeta4D` is not defined in the visible part of this file — confirm it exists.

    Args:
        config (`EfficientFormerConfig`): Provides `depths`, `num_meta3d_blocks`, `drop_path_rate` and
            `hidden_sizes`.
        stage_idx (`int`): Index of the stage; `-1` selects the last stage, whose block count is reduced by
            `config.num_meta3d_blocks`.
    """

    def __init__(self, config: EfficientFormerConfig, stage_idx: int):
        super().__init__()
        num_layers = (
            config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
        )
        # Drop-path value per block; the block index is offset by the depth of all earlier stages.
        drop_paths = [
            config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
        ]

        self.blocks = nn.ModuleList(
            [
                EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path)
                for drop_path in drop_paths
            ]
        )

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
        """Run all blocks in order and return the resulting hidden states."""
        for layer_module in self.blocks:
            hidden_states = layer_module(hidden_states)
        return hidden_states
class EfficientFormerIntermediateStage(nn.Module):
    """Intermediate encoder stage: a thin wrapper around the `EfficientFormerMeta4DLayers` of stage `index`."""

    def __init__(self, config: EfficientFormerConfig, index: int):
        super().__init__()
        self.meta4D_layers = EfficientFormerMeta4DLayers(config, index)

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
        # The stage has no logic of its own; it simply delegates to its 4D layers.
        return self.meta4D_layers(hidden_states)


class EfficientFormerLastStage(nn.Module):
    """
    Final encoder stage: 4D layers, then a flattening step, then 3D layers.

    The 3D layers receive the `output_attentions` flag, and their return value is returned unchanged.
    """

    def __init__(self, config: EfficientFormerConfig):
        super().__init__()
        # `-1` selects the last stage of the configuration.
        self.meta4D_layers = EfficientFormerMeta4DLayers(config, -1)
        self.flat = EfficientFormerFlat()
        self.meta3D_layers = EfficientFormerMeta3DLayers(config)

    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
        flattened = self.flat(self.meta4D_layers(hidden_states))
        return self.meta3D_layers(flattened, output_attentions)


class EfficientFormerEncoder(nn.Module):
    """Encoder built from intermediate stages (optionally followed by downsampling) and one last stage."""

    def __init__(self, config: EfficientFormerConfig):
        super().__init__()
        self.config = config

        # One flag per intermediate stage: a patch-embedding layer is inserted after the stage when the
        # config requests it or when the hidden size changes before the next stage.
        stage_count = len(config.depths) - 1
        needs_downsample = []
        for stage_idx in range(stage_count):
            needs_downsample.append(
                config.downsamples[stage_idx]
                or config.hidden_sizes[stage_idx] != config.hidden_sizes[stage_idx + 1]
            )

        modules = []
        for stage_idx, downsample in enumerate(needs_downsample):
            modules.append(EfficientFormerIntermediateStage(config, stage_idx))
            if not downsample:
                continue
            modules.append(
                EfficientFormerPatchEmbeddings(
                    config, config.hidden_sizes[stage_idx], config.hidden_sizes[stage_idx + 1]
                )
            )

        self.intermediate_stages = nn.ModuleList(modules)
        self.last_stage = EfficientFormerLastStage(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        return_dict: bool = True,
    ) -> BaseModelOutput:
        """Run all intermediate stages and the last stage.

        When `output_hidden_states` is set, the input, the output of every intermediate module and the
        final output are collected. When `output_attentions` is set, everything the last stage returns
        after its first element is collected as attentions. With `return_dict=False` the non-None
        results are returned as a plain tuple instead of a `BaseModelOutput`.
        """
        # The collectors stay None when the corresponding output was not requested.
        all_hidden_states = (hidden_states,) if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for stage in self.intermediate_stages:
            hidden_states = stage(hidden_states)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

        layer_output = self.last_stage(hidden_states, output_attentions=output_attentions)
        last_hidden_state = layer_output[0]

        if output_attentions:
            all_self_attentions = all_self_attentions + layer_output[1:]

        if output_hidden_states:
            all_hidden_states += (last_hidden_state,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=last_hidden_state,
                hidden_states=all_hidden_states,
                attentions=all_self_attentions,
            )

        candidates = (last_hidden_state, all_hidden_states, all_self_attentions)
        return tuple(item for item in candidates if item is not None)
@add_start_docstrings(
    "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
    EFFICIENTFORMER_START_DOCSTRING,
)
class EfficientFormerModel(EfficientFormerPreTrainedModel):
    """
    Backbone without a task head: a convolutional stem, the encoder, and a final layer normalization.

    Args:
        config (EfficientFormerConfig): Configuration used to size every sub-module.

    Attributes:
        patch_embed (EfficientFormerConvStem): Stem applied to the pixel values.
        encoder (EfficientFormerEncoder): Encoder applied to the stem output.
        layernorm (nn.LayerNorm): Normalization applied to the encoder's last hidden state.
    """

    def __init__(self, config: EfficientFormerConfig):
        super().__init__(config)
        self.config = config

        # Stem producing features with `config.hidden_sizes[0]` channels
        self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0])
        self.encoder = EfficientFormerEncoder(config)
        # The final normalization works on the width of the last stage
        self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    # Both documentation decorators are stacked on this single `forward` definition. A second, placeholder
    # `forward` must not be defined in this class: the later definition would replace the earlier one.
    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        # Arguments left as None fall back to the values stored in the model configuration.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.patch_embed(pixel_values)

        # `return_dict` is deliberately not forwarded: the encoder then uses its own default (True), so
        # `encoder_outputs` exposes `.hidden_states` / `.attentions` below.
        encoder_outputs = self.encoder(
            embedding_output, output_attentions=output_attentions, output_hidden_states=output_hidden_states
        )

        # Normalize the encoder's last hidden state
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            # Tuple form: normalized output first, then whatever else the encoder returned
            head_outputs = (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
# EfficientFormer backbone with an image-classification head on top (e.g. for ImageNet).
@add_start_docstrings(
    """
    EfficientFormer Model transformer with an image classification head on top (a linear layer on top of the final
    hidden state of the [CLS] token) e.g. for ImageNet.
    """,
    EFFICIENTFORMER_START_DOCSTRING,
)
class EfficientFormerForImageClassification(EfficientFormerPreTrainedModel):
    def __init__(self, config: EfficientFormerConfig):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.efficientformer = EfficientFormerModel(config)

        # Classifier head; an identity mapping is used when the config declares no labels
        self.classifier = (
            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Fall back to the configuration when the caller does not choose the return format
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.efficientformer(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Average over the second-to-last axis before classifying
        logits = self.classifier(sequence_output.mean(-2))

        loss = None
        if labels is not None:
            # Infer the problem type once and store it on the config for later calls
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple form: (loss,) is prepended only when a loss was computed
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Output container for the image-classification model with a distillation head.
@dataclass
class EfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
    """
    Output type of [`EfficientFormerForImageClassificationWithTeacher`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores as the average of the `cls_logits` and `distillation_logits`.
        cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the classification head (a linear layer applied to the averaged final hidden
            state).
        distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the distillation head (a second linear layer applied to the same averaged final
            hidden state).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or
            when `config.output_hidden_states=True`):
            Hidden states collected by the model: the embedding output plus the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when
            `config.output_attentions=True`):
            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    # The fields must be declared for the keyword construction in the model's `forward` to work.
    logits: torch.FloatTensor = None
    cls_logits: torch.FloatTensor = None
    distillation_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


# EfficientFormer backbone with two linear heads (classification and distillation), e.g. for ImageNet.
@add_start_docstrings(
    """
    `EfficientFormer` 模型变换器，其顶部包含图像分类头部（一个在 [CLS] 标记的最终隐藏状态之上的线性层，一个在蒸馏标记的最终隐藏状态之上的线性层），
    例如用于 ImageNet。

    <Tip warning={true}>

           此模型仅支持推断。目前不支持使用蒸馏进行微调（即带有教师模型）。

    </Tip>
    """,
    EFFICIENTFORMER_START_DOCSTRING,
)
class EfficientFormerForImageClassificationWithTeacher(EfficientFormerPreTrainedModel):
    def __init__(self, config: EfficientFormerConfig):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.efficientformer = EfficientFormerModel(config)

        # Both heads consume the backbone output, whose last dimension is `config.hidden_sizes[-1]` (the width
        # the backbone's final LayerNorm is built with), so that is the input size they must use.
        head_input_size = config.hidden_sizes[-1]

        # Classifier head; an identity mapping is used when the config declares no labels
        self.classifier = nn.Linear(head_input_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        # Distillation head, same shape as the classifier head
        self.distillation_classifier = (
            nn.Linear(head_input_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=EfficientFormerForImageClassificationWithTeacherOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, EfficientFormerForImageClassificationWithTeacherOutput]:
        # Fall back to the configuration when the caller does not choose the return format
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.efficientformer(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Both heads see the same features: the average over the second-to-last axis
        cls_logits = self.classifier(sequence_output.mean(-2))
        distillation_logits = self.distillation_classifier(sequence_output.mean(-2))

        # The final prediction is the average of the two heads
        logits = (cls_logits + distillation_logits) / 2

        if not return_dict:
            output = (logits, cls_logits, distillation_logits) + outputs[1:]
            return output

        return EfficientFormerForImageClassificationWithTeacherOutput(
            logits=logits,
            cls_logits=cls_logits,
            distillation_logits=distillation_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`.\models\efficientformer\modeling_tf_efficientformer.py`

# coding=utf-8
# 版权所有 2023 Snapchat Research 和 HuggingFace Inc. 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本（“许可证”）许可；
# 除非符合许可证，否则您不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按“原样”分发，
# 没有任何明示或暗示的保证或条件。
# 有关详细信息，请参阅许可证。
""" TensorFlow EfficientFormer 模型。"""

import itertools
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import tensorflow as tf

# 导入自定义模块
from ...activations_tf import ACT2FN
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFBaseModelOutputWithPooling,
    TFImageClassifierOutput,
)
from ...modeling_tf_utils import (
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_efficientformer import EfficientFormerConfig

# Module-level logger
logger = logging.get_logger(__name__)

# Configuration class name used for documentation
_CONFIG_FOR_DOC = "EfficientFormerConfig"

# Checkpoint and expected output shape used for documentation
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]

# Checkpoint and expected output used for the image-classification documentation
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_281"

# Archive list of pretrained EfficientFormer models
TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "snap-research/efficientformer-l1-300",
    # See all EfficientFormer models at https://huggingface.co/models?filter=efficientformer
]

# Downsampling layer used between two EfficientFormer stages.
class TFEfficientFormerPatchEmbeddings(keras.layers.Layer):
    """
    Downsamples the feature map with a zero-padded strided convolution, optionally followed by batch
    normalization.

    The input is asserted to carry `num_channels` in its last dimension and the convolution produces `embed_dim`
    output channels. Padding, kernel size and stride are read from `config.downsample_pad`,
    `config.downsample_patch_size` and `config.downsample_stride`.
    """

    def __init__(
        self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True, **kwargs
    ) -> None:
        super().__init__(**kwargs)
        self.num_channels = num_channels

        self.padding = keras.layers.ZeroPadding2D(padding=config.downsample_pad)
        self.projection = keras.layers.Conv2D(
            filters=embed_dim,
            kernel_size=config.downsample_patch_size,
            strides=config.downsample_stride,
            padding="valid",
            name="projection",
        )
        # Epsilon comes from the config; momentum is fixed at 0.9. Without normalization the layer falls back to
        # `tf.identity`.
        self.norm = (
            keras.layers.BatchNormalization(axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="norm")
            if apply_norm
            else tf.identity
        )
        self.embed_dim = embed_dim

    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
        # The last dimension must match the channel count this layer was created with
        tf.debugging.assert_shapes(
            [(pixel_values, (..., None, None, self.num_channels))],
            message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
        )
        embeddings = self.projection(self.padding(pixel_values))
        if isinstance(self.norm, keras.layers.Layer):
            embeddings = self.norm(embeddings, training=training)
        else:
            # `tf.identity` (used when `apply_norm=False`) does not accept a `training` keyword
            embeddings = self.norm(embeddings)
        return embeddings

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Sub-layers are built explicitly, each inside a name scope carrying its own name
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, None, self.num_channels])
        if getattr(self, "norm", None) is not None:
            # Only a real layer has a `name`; the `tf.identity` fallback has nothing to build
            if hasattr(self.norm, "name"):
                with tf.name_scope(self.norm.name):
                    self.norm.build([None, None, None, self.embed_dim])


# Self-attention layer with a learned bias per head and per absolute (row, column) offset between positions.
class TFEfficientFormerSelfAttention(keras.layers.Layer):
    def __init__(
        self,
        dim: int,
        key_dim: int,
        num_heads: int,
        attention_ratio: int,
        resolution: int,
        config: EfficientFormerConfig,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.num_heads = num_heads
        self.key_dim = key_dim
        self.attention_ratio = attention_ratio
        # Factor applied to the raw query/key scores
        self.scale = key_dim**-0.5
        self.total_key_dim = key_dim * num_heads
        # Per-head width of the values is `attention_ratio` times the key width
        self.expanded_key_dim = int(attention_ratio * key_dim)
        self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
        # One fused projection yields queries, keys (key_dim each) and values (expanded_key_dim) for all heads
        hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2

        self.qkv = keras.layers.Dense(
            units=hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
        )
        self.projection = keras.layers.Dense(
            units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
        )
        self.resolution = resolution
        self.dim = dim

    def build(self, input_shape: tf.TensorShape) -> None:
        # Guard first, so a repeated call cannot add the weights a second time
        if self.built:
            return

        # Every position of a `resolution` x `resolution` grid
        points = list(itertools.product(range(self.resolution), range(self.resolution)))
        num_points = len(points)
        attention_offsets = {}

        idxs = []

        # Give every distinct absolute offset its own index, and record that index for each ordered pair
        for point_1 in points:
            for point_2 in points:
                offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])

        # Trainable bias: one value per head and per distinct offset
        self.attention_biases = self.add_weight(
            shape=(self.num_heads, len(attention_offsets)),
            initializer=keras.initializers.zeros(),
            trainable=True,
            name="attention_biases",
        )
        # Non-trainable lookup table mapping each pair of positions to its offset index
        self.attention_bias_idxs = self.add_weight(
            shape=(num_points, num_points),
            trainable=False,
            dtype=tf.int32,
            name="attention_bias_idxs",
        )

        self.attention_bias_idxs.assign(tf.reshape(tf.cast(idxs, dtype=tf.int32), (num_points, num_points)))

        self.built = True
        if getattr(self, "qkv", None) is not None:
            with tf.name_scope(self.qkv.name):
                self.qkv.build([None, None, self.dim])
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, self.total_expanded_key_dim])

    def call(
        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
    ) -> Tuple[tf.Tensor]:
        # The first two dimensions are taken as batch size and sequence length
        batch_size, sequence_length, *_ = shape_list(hidden_states)
        qkv = self.qkv(inputs=hidden_states)

        # Separate the heads, then split each head's slice into query, key and value
        query_layer, key_layer, value_layer = tf.split(
            tf.reshape(tensor=qkv, shape=(batch_size, sequence_length, self.num_heads, -1)),
            num_or_size_splits=[self.key_dim, self.key_dim, self.expanded_key_dim],
            axis=3,
        )

        # (batch, sequence, heads, width) -> (batch, heads, sequence, width)
        query_layer = tf.transpose(query_layer, perm=[0, 2, 1, 3])
        key_layer = tf.transpose(key_layer, perm=[0, 2, 1, 3])
        value_layer = tf.transpose(value_layer, perm=[0, 2, 1, 3])

        # Scaled query/key scores
        attention_probs = tf.matmul(query_layer, tf.transpose(key_layer, perm=[0, 1, 3, 2]))
        scale = tf.cast(self.scale, dtype=attention_probs.dtype)
        attention_probs = tf.multiply(attention_probs, scale)

        # Look up the bias of every position pair and add it before the softmax
        attention_biases = tf.gather(params=self.attention_biases, indices=self.attention_bias_idxs, axis=1)
        attention_probs = attention_probs + attention_biases
        attention_probs = stable_softmax(logits=attention_probs, axis=-1)

        # Weighted sum of the values, then back to (batch, sequence, heads, width)
        context_layer = tf.matmul(attention_probs, value_layer)
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])

        # Merge the heads and project to the output width
        context_layer = tf.reshape(
            tensor=context_layer, shape=(batch_size, sequence_length, self.total_expanded_key_dim)
        )
        context_layer = self.projection(context_layer)

        # The attention probabilities are appended only on request
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
# Convolutional stem: two padded 3x3 stride-2 convolutions, each followed by batch normalization and ReLU.
class TFEfficientFormerConvStem(keras.layers.Layer):
    def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs):
        super().__init__(**kwargs)

        # One-pixel zero padding, shared by both convolutions
        self.padding = keras.layers.ZeroPadding2D(padding=1)

        # First convolution produces half of the final channel count
        self.convolution1 = keras.layers.Conv2D(
            filters=out_channels // 2, kernel_size=3, strides=2, padding="valid", name="convolution1"
        )
        # Applied to the output of the first convolution; epsilon from the config, momentum fixed at 0.9
        self.batchnorm_before = keras.layers.BatchNormalization(
            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
        )

        self.convolution2 = keras.layers.Conv2D(
            filters=out_channels,
            kernel_size=3,
            strides=2,
            padding="valid",
            name="convolution2",
        )
        # Applied to the output of the second convolution; same epsilon and momentum as above
        self.batchnorm_after = keras.layers.BatchNormalization(
            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
        )

        self.activation = keras.layers.Activation(activation=keras.activations.relu, name="activation")

        # Kept for `build`, which needs the channel counts
        self.out_channels = out_channels
        self.config = config

    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
        # pad -> conv1 -> batch norm -> ReLU
        features = self.batchnorm_before(self.convolution1(self.padding(pixel_values)), training=training)
        features = self.activation(features)

        # pad -> conv2 -> batch norm -> ReLU
        features = self.batchnorm_after(self.convolution2(self.padding(features)), training=training)
        features = self.activation(features)

        return features

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Sub-layers are built explicitly, each inside a name scope carrying its own name
        if getattr(self, "convolution1", None) is not None:
            with tf.name_scope(self.convolution1.name):
                self.convolution1.build([None, None, None, self.config.num_channels])
        if getattr(self, "batchnorm_before", None) is not None:
            with tf.name_scope(self.batchnorm_before.name):
                self.batchnorm_before.build([None, None, None, self.out_channels // 2])
        if getattr(self, "convolution2", None) is not None:
            with tf.name_scope(self.convolution2.name):
                self.convolution2.build([None, None, None, self.out_channels // 2])
        if getattr(self, "batchnorm_after", None) is not None:
            with tf.name_scope(self.batchnorm_after.name):
                self.batchnorm_after.build([None, None, None, self.out_channels])
        if getattr(self, "activation", None) is not None:
            with tf.name_scope(self.activation.name):
                self.activation.build(None)


# Pooling layer. It needs its own class: defined inside the conv stem, these methods would replace the
# stem's `__init__` and `call`.
class TFEfficientFormerPooling(keras.layers.Layer):
    def __init__(self, pool_size: int, **kwargs):
        super().__init__(**kwargs)
        # Stride 1 with "same" padding
        self.pool = keras.layers.AveragePooling2D(pool_size=pool_size, strides=1, padding="same")

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        # Return the pooled tensor minus the input
        output = self.pool(hidden_states)
        output = output - hidden_states
        return output
# Two-layer dense MLP: linear -> activation -> dropout -> linear -> dropout.
class TFEfficientFormerDenseMlp(keras.layers.Layer):
    def __init__(
        self,
        config: EfficientFormerConfig,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Widths that are not given (or are falsy) default to the input width
        out_features = out_features if out_features else in_features
        hidden_features = hidden_features if hidden_features else in_features

        kernel_init_range = config.initializer_range
        self.linear_in = keras.layers.Dense(
            units=hidden_features, kernel_initializer=get_initializer(kernel_init_range), name="linear_in"
        )
        # Activation function selected by name from the config
        self.activation = ACT2FN[config.hidden_act]
        # A single dropout layer, applied after the activation and again after the output projection
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

        self.linear_out = keras.layers.Dense(
            units=out_features, kernel_initializer=get_initializer(kernel_init_range), name="linear_out"
        )
        # Kept for `build`, which needs the input widths of both dense layers
        self.hidden_features = hidden_features
        self.in_features = in_features

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply the MLP; `training` is forwarded to both dropout applications."""
        expanded = self.activation(self.linear_in(inputs=hidden_states))
        expanded = self.dropout(inputs=expanded, training=training)
        projected = self.linear_out(inputs=expanded)
        return self.dropout(inputs=projected, training=training)

    def build(self, input_shape=None):
        """Build both dense layers once, each inside a name scope carrying its own name."""
        if self.built:
            return
        self.built = True

        # (attribute name, width of the last input dimension) for each dense layer, in build order
        dense_layers = (("linear_in", self.in_features), ("linear_out", self.hidden_features))
        for attr_name, input_width in dense_layers:
            layer = getattr(self, attr_name, None)
            if layer is None:
                continue
            with tf.name_scope(layer.name):
                layer.build([None, None, input_width])

# 定义一个名为 TFEfficientFormerConvMlp 的自定义 Keras 层
class TFEfficientFormerConvMlp(keras.layers.Layer):
    """MLP built from two 1x1 convolutions, each followed by batch normalization.

    The forward pass is: convolution1 -> batchnorm_before -> activation -> dropout ->
    convolution2 -> batchnorm_after -> dropout.
    """

    def __init__(
        self,
        config: EfficientFormerConfig,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        drop: float = 0.0,
        **kwargs,
    ):
        """Create the sub-layers.

        Args:
            config: Model configuration; `hidden_act` and `batch_norm_eps` are read from it.
            in_features: Number of input channels.
            hidden_features: Number of filters of the first convolution; falls back to
                `in_features` when not given (or falsy).
            out_features: Number of filters of the second convolution; falls back to
                `in_features` when not given (or falsy).
            drop: Rate of the dropout layer.
            **kwargs: Forwarded to `keras.layers.Layer.__init__`.
        """
        super().__init__(**kwargs)
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        # First 1x1 convolution: projects to `hidden_features` channels.
        self.convolution1 = keras.layers.Conv2D(
            filters=hidden_features,
            kernel_size=1,
            name="convolution1",
            padding="valid",
        )

        self.activation = ACT2FN[config.hidden_act]

        # Second 1x1 convolution: projects to `out_features` channels.
        self.convolution2 = keras.layers.Conv2D(
            filters=out_features,
            kernel_size=1,
            name="convolution2",
            padding="valid",
        )

        # A single Dropout layer, applied twice in `call`.
        self.dropout = keras.layers.Dropout(rate=drop)

        # Use same default momentum and epsilon as PyTorch equivalent
        self.batchnorm_before = keras.layers.BatchNormalization(
            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
        )
        # Use same default momentum and epsilon as PyTorch equivalent
        self.batchnorm_after = keras.layers.BatchNormalization(
            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
        )

        # Channel counts are kept so that `build` can derive the sub-layer input shapes.
        self.hidden_features = hidden_features
        self.in_features = in_features
        self.out_features = out_features

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run the convolutional MLP.

        Args:
            hidden_state: Input handed to the first convolution.
            training: Forwarded to the batch-normalization and dropout layers.

        Returns:
            The output of the final dropout application.
        """
        hidden_state = self.convolution1(hidden_state)
        hidden_state = self.batchnorm_before(hidden_state, training=training)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.dropout(hidden_state, training=training)
        hidden_state = self.convolution2(hidden_state)
        hidden_state = self.batchnorm_after(hidden_state, training=training)
        hidden_state = self.dropout(hidden_state, training=training)
        return hidden_state

    def build(self, input_shape=None):
        """Build every sub-layer once, each inside its own name scope.

        `input_shape` is not read; shapes are derived from the stored channel counts.
        """
        if self.built:
            return
        self.built = True
        if getattr(self, "convolution1", None) is not None:
            with tf.name_scope(self.convolution1.name):
                self.convolution1.build([None, None, None, self.in_features])
        if getattr(self, "convolution2", None) is not None:
            with tf.name_scope(self.convolution2.name):
                self.convolution2.build([None, None, None, self.hidden_features])
        # batchnorm_before normalizes the output of convolution1 (hidden_features channels).
        if getattr(self, "batchnorm_before", None) is not None:
            with tf.name_scope(self.batchnorm_before.name):
                self.batchnorm_before.build([None, None, None, self.hidden_features])
        # batchnorm_after normalizes the output of convolution2 (out_features channels).
        if getattr(self, "batchnorm_after", None) is not None:
            with tf.name_scope(self.batchnorm_after.name):
                self.batchnorm_after.build([None, None, None, self.out_features])
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->EfficientFormer
class TFEfficientFormerDropPath(keras.layers.Layer):
    """Stochastic depth: randomly drop the whole input per sample while training.

    Intended for the main path of residual blocks.
    References:
        (1) github.com:rwightman/pytorch-image-models
    """

    def __init__(self, drop_path: float, **kwargs):
        super().__init__(**kwargs)
        # Drop probability; the keep probability used in `call` is `1 - drop_path`.
        self.drop_path = drop_path

    def call(self, x: tf.Tensor, training=None):
        """Return `x` unchanged unless `training` is truthy; otherwise apply drop path."""
        if not training:
            return x

        keep_prob = 1 - self.drop_path
        leading_dim = tf.shape(x)[0]
        rank = len(tf.shape(x))
        # One random draw per entry of the leading axis, broadcast over all other axes.
        mask_shape = (leading_dim,) + (1,) * (rank - 1)
        # floor(keep_prob + U[0, 1)) yields a 0/1 mask.
        keep_mask = tf.floor(keep_prob + tf.random.uniform(mask_shape, 0, 1))
        # Kept entries are rescaled by 1 / keep_prob.
        return (x / keep_prob) * keep_mask


class TFEfficientFormerFlat(keras.layers.Layer):
    """Merge the two middle axes of a 4-axis input into a single axis."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, hidden_states: tf.Tensor) -> Tuple[tf.Tensor]:
        """Reshape `hidden_states` to `[first_dim, -1, last_dim]`.

        The input shape is unpacked into exactly four values, so any other rank raises.
        """
        batch, _rows, _cols, channels = shape_list(hidden_states)
        return tf.reshape(hidden_states, shape=[batch, -1, channels])


class TFEfficientFormerMeta3D(keras.layers.Layer):
    """Residual block made of a self-attention token mixer followed by a dense MLP.

    Each branch output is optionally multiplied by a learned layer-scale vector of
    length `dim` (when `config.use_layer_scale` is set) and passed through
    `drop_path` before being added back to the residual stream.
    """

    def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
        """Create the sub-layers.

        Args:
            config: Model configuration.
            dim: Size used for the layer norms, the MLP and the layer-scale vectors.
            drop_path: Drop-path rate; a value `<= 0.0` selects an identity ("linear") activation.
            **kwargs: Forwarded to `keras.layers.Layer.__init__`.
        """
        super().__init__(**kwargs)

        self.token_mixer = TFEfficientFormerSelfAttention(
            dim=config.dim,
            key_dim=config.key_dim,
            num_heads=config.num_attention_heads,
            attention_ratio=config.attention_ratio,
            resolution=config.resolution,
            name="token_mixer",
            config=config,
        )
        self.dim = dim
        self.config = config

        self.layernorm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm1")
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm2")
        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
        self.mlp = TFEfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim, name="mlp")

        # Using `layers.Activation` instead of `tf.identity` to better control `training` behavior.
        # Both alternatives are named "drop_path" so the layer name does not depend on the rate.
        self.drop_path = (
            TFEfficientFormerDropPath(drop_path, name="drop_path")
            if drop_path > 0.0
            else keras.layers.Activation("linear", name="drop_path")
        )

    def build(self, input_shape=None):
        """Create the layer-scale weights and build the sub-layers exactly once.

        The `built` guard runs before any weight is created, so calling `build`
        again does not register duplicate `layer_scale_*` weights.
        """
        if self.built:
            return
        self.built = True

        self.layer_scale_1 = None
        self.layer_scale_2 = None

        if self.config.use_layer_scale:
            self.layer_scale_1 = self.add_weight(
                shape=(self.dim,),
                initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_1",
            )
            self.layer_scale_2 = self.add_weight(
                shape=(self.dim,),
                initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_2",
            )

        if getattr(self, "token_mixer", None) is not None:
            with tf.name_scope(self.token_mixer.name):
                self.token_mixer.build(None)
        if getattr(self, "layernorm1", None) is not None:
            with tf.name_scope(self.layernorm1.name):
                self.layernorm1.build([None, None, self.dim])
        if getattr(self, "layernorm2", None) is not None:
            with tf.name_scope(self.layernorm2.name):
                self.layernorm2.build([None, None, self.dim])
        if getattr(self, "mlp", None) is not None:
            with tf.name_scope(self.mlp.name):
                self.mlp.build(None)
        if getattr(self, "drop_path", None) is not None:
            with tf.name_scope(self.drop_path.name):
                self.drop_path.build(None)

    def call(
        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
    ) -> Tuple[tf.Tensor]:
        """Apply the attention branch and then the MLP branch, each with a residual connection.

        Args:
            hidden_states: Block input; it is layer-normalized before entering the token mixer.
            output_attentions: Forwarded to the token mixer.
            training: Forwarded to every sub-layer.

        Returns:
            A tuple whose first element is the block output, followed by whatever the
            token mixer returned after its first element.
        """
        self_attention_outputs = self.token_mixer(
            hidden_states=self.layernorm1(hidden_states, training=training),
            output_attentions=output_attentions,
            training=training,
        )

        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        if self.config.use_layer_scale:
            # The scale vectors get two leading singleton axes so they broadcast over the input.
            layer_output = hidden_states + self.drop_path(
                tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * attention_output,
                training=training,
            )
            layer_output = layer_output + self.drop_path(
                tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
                * self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
                training=training,
            )
        else:
            layer_output = hidden_states + self.drop_path(attention_output, training=training)
            layer_output = layer_output + self.drop_path(
                self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
                training=training,
            )

        outputs = (layer_output,) + outputs

        return outputs
# 定义一个名为 TFEfficientFormerMeta3DLayers 的自定义层，继承自 keras.layers.Layer 类
class TFEfficientFormerMeta3DLayers(keras.layers.Layer):
    """Sequence of `config.num_meta3d_blocks` `TFEfficientFormerMeta3D` blocks."""

    def __init__(self, config: EfficientFormerConfig, **kwargs):
        super().__init__(**kwargs)

        # Sum of every depth except the last one; used as an offset in the per-block drop-path rate.
        preceding_depth = sum(config.depths[:-1])
        self.blocks = [
            TFEfficientFormerMeta3D(
                config,
                config.hidden_sizes[-1],
                drop_path=config.drop_path_rate * (block_idx + preceding_depth),
                name=f"blocks.{block_idx}",
            )
            for block_idx in range(config.num_meta3d_blocks)
        ]

    def call(
        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
    ) -> Tuple[tf.Tensor]:
        """Run the blocks in order.

        Returns:
            With `output_attentions`, a tuple of the final block's first output followed by
            the second output of every block. Otherwise, the raw return value of the last block.
        """
        collected_attentions = () if output_attentions else None

        for block in self.blocks:
            # Every block returns a tuple; only its first element feeds the next block.
            block_input = hidden_states[0] if isinstance(hidden_states, tuple) else hidden_states
            hidden_states = block(
                hidden_states=block_input, output_attentions=output_attentions, training=training
            )
            if output_attentions:
                collected_attentions = collected_attentions + (hidden_states[1],)

        if not output_attentions:
            return hidden_states
        return (hidden_states[0],) + collected_attentions

    def build(self, input_shape=None):
        """Build every block once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        for block in getattr(self, "blocks", None) or ():
            with tf.name_scope(block.name):
                block.build(None)


# 定义一个名为 TFEfficientFormerMeta4D 的自定义层，继承自 keras.layers.Layer 类
class TFEfficientFormerMeta4D(keras.layers.Layer):
    """Residual block made of a pooling token mixer followed by a convolutional MLP.

    Each branch output is optionally multiplied by a learned layer-scale vector of
    length `dim` (when `config.use_layer_scale` is set) and passed through
    `drop_path` before being added back to the residual stream.
    """

    def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
        """Create the sub-layers.

        Args:
            config: Model configuration.
            dim: Channel count used for the MLP and the layer-scale vectors.
            drop_path: Drop-path rate; a value `<= 0.0` selects an identity ("linear") activation.
            **kwargs: Forwarded to `keras.layers.Layer.__init__`.
        """
        super().__init__(**kwargs)
        # Fall back to a pool size of 3 when the config does not specify one.
        pool_size = config.pool_size if config.pool_size is not None else 3
        self.token_mixer = TFEfficientFormerPooling(pool_size=pool_size, name="token_mixer")
        self.dim = dim
        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
        self.mlp = TFEfficientFormerConvMlp(
            config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob, name="mlp"
        )

        self.drop_path = (
            TFEfficientFormerDropPath(drop_path, name="drop_path")
            if drop_path > 0.0
            else keras.layers.Activation("linear", name="drop_path")
        )
        self.config = config

    def build(self, input_shape=None):
        """Create the layer-scale weights and build the sub-layers exactly once.

        The `built` guard runs before any weight is created, so calling `build`
        again does not register duplicate `layer_scale_*` weights.
        """
        if self.built:
            return
        self.built = True

        self.layer_scale_1 = None
        self.layer_scale_2 = None

        if self.config.use_layer_scale:
            # `(self.dim,)` is an explicit 1-tuple; `(self.dim)` would just be an int.
            self.layer_scale_1 = self.add_weight(
                shape=(self.dim,),
                initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_1",
            )
            self.layer_scale_2 = self.add_weight(
                shape=(self.dim,),
                initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_2",
            )

        if getattr(self, "token_mixer", None) is not None:
            with tf.name_scope(self.token_mixer.name):
                self.token_mixer.build(None)
        if getattr(self, "mlp", None) is not None:
            with tf.name_scope(self.mlp.name):
                self.mlp.build(None)
        if getattr(self, "drop_path", None) is not None:
            with tf.name_scope(self.drop_path.name):
                self.drop_path.build(None)

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]:
        """Apply the pooling branch and then the MLP branch, each with a residual connection.

        Args:
            hidden_states: Block input.
            training: Forwarded to `drop_path` and the MLP.

        Returns:
            The block output.
        """
        outputs = self.token_mixer(hidden_states)

        if self.config.use_layer_scale:
            # The scale vectors get two leading singleton axes so they broadcast over the input.
            layer_output = hidden_states + self.drop_path(
                tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * outputs,
                training=training,
            )

            layer_output = layer_output + self.drop_path(
                tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
                * self.mlp(hidden_state=layer_output, training=training),
                training=training,
            )

        else:
            layer_output = hidden_states + self.drop_path(outputs, training=training)
            layer_output = layer_output + self.drop_path(
                self.mlp(hidden_state=layer_output, training=training), training=training
            )

        return layer_output
class TFEfficientFormerMeta4DLayers(keras.layers.Layer):
    """Sequence of `TFEfficientFormerMeta4D` blocks belonging to one stage."""

    def __init__(self, config: EfficientFormerConfig, stage_idx: int, **kwargs):
        super().__init__(**kwargs)

        # For stage index -1, `config.num_meta3d_blocks` is subtracted from that stage's depth.
        if stage_idx != -1:
            num_layers = config.depths[stage_idx]
        else:
            num_layers = config.depths[stage_idx] - config.num_meta3d_blocks

        # Sum of the depths before `stage_idx`; used as an offset in the per-block drop-path rate.
        depth_offset = sum(config.depths[:stage_idx])
        self.blocks = [
            TFEfficientFormerMeta4D(
                config=config,
                dim=config.hidden_sizes[stage_idx],
                drop_path=config.drop_path_rate * (block_idx + depth_offset),
                name=f"blocks.{block_idx}",
            )
            for block_idx in range(num_layers)
        ]

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]:
        """Feed `hidden_states` through every block in order and return the result."""
        for block in self.blocks:
            hidden_states = block(hidden_states=hidden_states, training=training)
        return hidden_states

    def build(self, input_shape=None):
        """Build every block once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        for block in getattr(self, "blocks", None) or ():
            with tf.name_scope(block.name):
                block.build(None)


class TFEfficientFormerIntermediateStage(keras.layers.Layer):
    """Wrapper around the `TFEfficientFormerMeta4DLayers` of the stage at `index`."""

    def __init__(self, config: EfficientFormerConfig, index: int, **kwargs):
        super().__init__(**kwargs)
        self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=index, name="meta4D_layers")

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]:
        """Delegate to `meta4D_layers` and return its output."""
        return self.meta4D_layers(hidden_states=hidden_states, training=training)

    def build(self, input_shape=None):
        """Build `meta4D_layers` once inside its own name scope."""
        if self.built:
            return
        self.built = True

        inner = getattr(self, "meta4D_layers", None)
        if inner is None:
            return
        with tf.name_scope(inner.name):
            inner.build(None)


class TFEfficientFormerLastStage(keras.layers.Layer):
    """Final stage: Meta4D blocks, a flattening layer, then Meta3D blocks."""

    def __init__(self, config: EfficientFormerConfig, **kwargs):
        super().__init__(**kwargs)
        # Stage index -1 selects the last entry of the config's per-stage settings.
        self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=-1, name="meta4D_layers")
        self.flat = TFEfficientFormerFlat(name="flat")
        self.meta3D_layers = TFEfficientFormerMeta3DLayers(config, name="meta3D_layers")

    def call(
        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
    ) -> Tuple[tf.Tensor]:
        """Run meta4D_layers -> flat -> meta3D_layers.

        Args:
            hidden_states: Stage input.
            output_attentions: Forwarded to `meta3D_layers`.
            training: Forwarded to `meta4D_layers` and `meta3D_layers`.

        Returns:
            The return value of `meta3D_layers`.
        """
        hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training)
        hidden_states = self.flat(hidden_states=hidden_states)
        hidden_states = self.meta3D_layers(
            hidden_states=hidden_states, output_attentions=output_attentions, training=training
        )

        return hidden_states

    def build(self, input_shape=None):
        """Build the three sub-layers once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "meta4D_layers", None) is not None:
            with tf.name_scope(self.meta4D_layers.name):
                self.meta4D_layers.build(None)
        if getattr(self, "flat", None) is not None:
            with tf.name_scope(self.flat.name):
                self.flat.build(None)
        if getattr(self, "meta3D_layers", None) is not None:
            with tf.name_scope(self.meta3D_layers.name):
                self.meta3D_layers.build(None)
# 定义 TF EfficientFormer 编码器的自定义层
class TFEfficientFormerEncoder(keras.layers.Layer):
    """Encoder: a list of intermediate stages (with optional patch embeddings) and a last stage."""

    def __init__(self, config: EfficientFormerConfig, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        stage_count = len(config.depths) - 1
        # A patch-embedding layer follows stage i when the config requests a downsample
        # there or when the hidden size changes between stage i and stage i + 1.
        needs_downsample = [
            config.downsamples[idx] or config.hidden_sizes[idx] != config.hidden_sizes[idx + 1]
            for idx in range(stage_count)
        ]

        stages = []
        for idx, downsample in enumerate(needs_downsample):
            # Layer names are numbered by position in the flat list, patch embeddings included.
            stages.append(TFEfficientFormerIntermediateStage(config, idx, name=f"intermediate_stages.{len(stages)}"))
            if downsample:
                stages.append(
                    TFEfficientFormerPatchEmbeddings(
                        config,
                        config.hidden_sizes[idx],
                        config.hidden_sizes[idx + 1],
                        name=f"intermediate_stages.{len(stages)}",
                    )
                )
        self.intermediate_stages = stages
        self.last_stage = TFEfficientFormerLastStage(config, name="last_stage")

    def call(
        self,
        hidden_states: tf.Tensor,
        output_hidden_states: bool,
        output_attentions: bool,
        return_dict: bool,
        training: bool = False,
    ) -> TFBaseModelOutput:
        """Run every intermediate stage and then the last stage.

        Args:
            hidden_states: Encoder input.
            output_hidden_states: When truthy, collect the input, each intermediate output
                and the last stage's first output.
            output_attentions: Forwarded to the last stage; when truthy, its outputs after
                the first one are collected as attentions.
            return_dict: When falsy, return a tuple of the non-`None` results instead of
                a `TFBaseModelOutput`.
            training: Forwarded to every stage.
        """
        collected_states = (hidden_states,) if output_hidden_states else None

        for stage in self.intermediate_stages:
            hidden_states = stage(hidden_states, training=training)
            if output_hidden_states:
                collected_states = collected_states + (hidden_states,)

        final_outputs = self.last_stage(hidden_states, output_attentions=output_attentions, training=training)

        collected_attentions = None
        if output_attentions:
            collected_attentions = () + final_outputs[1:]

        if output_hidden_states:
            collected_states = collected_states + (final_outputs[0],)

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=final_outputs[0],
                hidden_states=collected_states,
                attentions=collected_attentions,
            )

        candidates = [final_outputs[0], collected_states, collected_attentions]
        return tuple(item for item in candidates if item is not None)

    def build(self, input_shape=None):
        """Build the last stage and then every intermediate stage, each in its own name scope."""
        if self.built:
            return
        self.built = True

        final_stage = getattr(self, "last_stage", None)
        if final_stage is not None:
            with tf.name_scope(final_stage.name):
                final_stage.build(None)
        for stage in self.intermediate_stages:
            with tf.name_scope(stage.name):
                stage.build(None)


@keras_serializable
class TFEfficientFormerMainLayer(keras.layers.Layer):
    """Main EfficientFormer layer: convolutional stem, encoder and final layer normalization."""

    config_class = EfficientFormerConfig

    def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
        super().__init__(**kwargs)
        self.config = config

        self.patch_embed = TFEfficientFormerConvStem(config, config.hidden_sizes[0], name="patch_embed")
        self.encoder = TFEfficientFormerEncoder(config, name="encoder")
        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")

    @unpack_inputs
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor, ...]]:
        """Embed the pixel values, encode them and layer-normalize the last hidden state.

        Args:
            pixel_values: Input in `(batch_size, num_channels, height, width)` layout.
            output_attentions: Falls back to `config.output_attentions` when `None`.
            output_hidden_states: Falls back to `config.output_hidden_states` when `None`.
            return_dict: Falls back to `config.use_return_dict` when `None`.
            training: Forwarded to the sub-layers.

        Returns:
            A `TFBaseModelOutput`, or a tuple when `return_dict` is falsy. In both forms the
            hidden states (when requested) are transposed to channels-first, except for the
            last one, which is returned as produced by the encoder.

        Raises:
            ValueError: If `pixel_values` is `None`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # When running on CPU, keras.layers.Conv2D and keras.layers.AveragePool2D do not support channels first
        # NCHW format. A number of blocks contain both. So change the input format from
        # (batch_size, num_channels, height, width) to (batch_size, height, width, num_channels) here.
        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        embedding_output = self.patch_embed(pixel_values, training=training)

        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output, training=training)

        # Change the hidden states from (batch_size, height, width, num_channels) to
        # (batch_size, num_channels, height, width). The last hidden state is left untouched.
        if output_hidden_states:
            hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1][:-1]]) + (
                encoder_outputs[1][-1],
            )

        if not return_dict:
            head_outputs = (sequence_output,)
            if output_hidden_states:
                # Return the transposed hidden states so the tuple output matches the
                # dict output; anything after index 1 (the attentions) passes through unchanged.
                return head_outputs + (hidden_states,) + encoder_outputs[2:]
            return head_outputs + encoder_outputs[1:]

        return TFBaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the stem, the encoder and the layer norm once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        if getattr(self, "patch_embed", None) is not None:
            with tf.name_scope(self.patch_embed.name):
                self.patch_embed.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "layernorm", None) is not None:
            with tf.name_scope(self.layernorm.name):
                self.layernorm.build([None, None, self.config.hidden_sizes[-1]])
@add_start_docstrings(
    "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
    EFFICIENTFORMER_START_DOCSTRING,
)
class TFEfficientFormerModel(TFEfficientFormerPreTrainedModel):
    def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
        super().__init__(config, **kwargs)

        # The model is a thin wrapper around the main layer, created under the name "efficientformer".
        self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
        **kwargs,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """
        Run a forward pass by delegating to the wrapped main layer.

        Args:
            pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Pixel values can be obtained using `AutoImageProcessor`. See
                `EfficientFormerImageProcessor.__call__` for details.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
                See `attentions` under returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
                See `hidden_states` under returned tensors for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.
            training (`bool`, *optional*):
                Whether the model is in training mode or evaluation mode.
            **kwargs:
                Extra keyword arguments, forwarded unchanged to the main layer.

        Returns:
            Whatever the main layer returns for these arguments: a model-output object or a plain tuple of
            tensors, depending on `return_dict`.
        """
        # All arguments are passed through untouched; the main layer does the actual work.
        return self.efficientformer(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
            **kwargs,
        )

    def build(self, input_shape=None):
        """Build the wrapped main layer once, inside a name scope carrying its name."""
        # Repeated calls are no-ops.
        if self.built:
            return
        self.built = True
        if getattr(self, "efficientformer", None) is not None:
            with tf.name_scope(self.efficientformer.name):
                # No concrete input shape is supplied to the main layer.
                self.efficientformer.build(None)
@add_start_docstrings(
    """
    EfficientFormer Model transformer with an image classification head on top of pooled last hidden state, e.g. for
    ImageNet.
    """,
    EFFICIENTFORMER_START_DOCSTRING,
)
class TFEfficientFormerForImageClassification(TFEfficientFormerPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: EfficientFormerConfig):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")

        # Classification head: a Dense layer when labels are configured, otherwise a "linear" activation.
        if config.num_labels > 0:
            self.classifier = keras.layers.Dense(config.num_labels, name="classifier")
        else:
            self.classifier = keras.layers.Activation("linear", name="classifier")
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=TFImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        labels: Optional[tf.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[tf.Tensor, TFImageClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # An explicit argument wins; otherwise fall back to the config default.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.efficientformer(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Average the first backbone output over its second-to-last axis, then classify.
        last_hidden_state = backbone_outputs[0]
        pooled = tf.reduce_mean(last_hidden_state, axis=-2)
        logits = self.classifier(pooled)

        # A loss is only produced when labels are supplied.
        loss = None
        if labels is not None:
            loss = self.hf_compute_loss(labels, logits)

        if return_dict:
            return TFImageClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple form: (loss?, logits, *remaining backbone outputs).
        tuple_output = (logits,) + backbone_outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def build(self, input_shape=None):
        """Build the backbone and the classifier head once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        backbone = getattr(self, "efficientformer", None)
        if backbone is not None:
            with tf.name_scope(backbone.name):
                backbone.build(None)

        head = getattr(self, "classifier", None)
        if head is not None and hasattr(head, "name"):
            with tf.name_scope(head.name):
                # The trailing dimension is the last entry of config.hidden_sizes.
                head.build([None, None, self.config.hidden_sizes[-1]])
# Dataclass that stores the outputs of the EfficientFormer classification model with a distillation head.
@dataclass
class TFEfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
    """
    Output type of [`TFEfficientFormerForImageClassificationWithTeacher`].

    Args:
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores as the average of `cls_logits` and `distillation_logits`.
        cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the classification head.
        distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the distillation head.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when
        `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` with the hidden states produced by the model (the exact shapes are determined by
            the backbone that produces them).
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when
        `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one per attention layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average
            in the self-attention heads.
    """

    logits: tf.Tensor = None
    cls_logits: tf.Tensor = None
    distillation_logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[tf.Tensor]] = None


# EfficientFormer with two classification heads (a regular one and a distillation one) whose scores are averaged.
# The class docstring is attached through add_start_docstrings.
# Warning: inference only; calling the model with training=True raises.
@add_start_docstrings(
    """
    EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden
    state and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::
            This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
            supported.
    """,
    EFFICIENTFORMER_START_DOCSTRING,
)
class TFEfficientFormerForImageClassificationWithTeacher(TFEfficientFormerPreTrainedModel):
    def __init__(self, config: EfficientFormerConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")

        # Both heads follow the same rule: a Dense layer when labels are configured,
        # otherwise a "linear" activation layer.
        self.classifier = (
            keras.layers.Dense(config.num_labels, name="classifier")
            if config.num_labels > 0
            else keras.layers.Activation("linear", name="classifier")
        )
        self.distillation_classifier = (
            keras.layers.Dense(config.num_labels, name="distillation_classifier")
            if config.num_labels > 0
            else keras.layers.Activation("linear", name="distillation_classifier")
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=TFEfficientFormerForImageClassificationWithTeacherOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[tuple, TFEfficientFormerForImageClassificationWithTeacherOutput]:
        # An explicit argument wins; otherwise fall back to the config default.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Training is rejected up front: only inference is supported.
        if training:
            raise Exception(
                "This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet supported."
            )

        outputs = self.efficientformer(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = outputs[0]

        # Both heads consume the same pooled features, so the mean over the
        # second-to-last axis is computed once and shared.
        pooled_output = tf.reduce_mean(sequence_output, axis=-2)
        cls_logits = self.classifier(pooled_output)
        distillation_logits = self.distillation_classifier(pooled_output)
        # Final prediction is the average of the two heads.
        logits = (cls_logits + distillation_logits) / 2

        if not return_dict:
            # Tuple form: the three logits followed by the remaining backbone outputs.
            output = (logits, cls_logits, distillation_logits) + outputs[1:]
            return output

        return TFEfficientFormerForImageClassificationWithTeacherOutput(
            logits=logits,
            cls_logits=cls_logits,
            distillation_logits=distillation_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the backbone and both heads once, each under a name scope carrying its own name."""
        # Repeated calls are no-ops.
        if self.built:
            return
        self.built = True

        if getattr(self, "efficientformer", None) is not None:
            with tf.name_scope(self.efficientformer.name):
                # No concrete input shape is supplied to the backbone.
                self.efficientformer.build(None)

        if getattr(self, "classifier", None) is not None:
            if hasattr(self.classifier, "name"):
                with tf.name_scope(self.classifier.name):
                    # The trailing dimension is the last entry of config.hidden_sizes.
                    self.classifier.build([None, None, self.config.hidden_sizes[-1]])

        if getattr(self, "distillation_classifier", None) is not None:
            if hasattr(self.distillation_classifier, "name"):
                with tf.name_scope(self.distillation_classifier.name):
                    # Same input shape as the classification head.
                    self.distillation_classifier.build([None, None, self.config.hidden_sizes[-1]])

`.\models\efficientformer\__init__.py`

# 导入必要的模块和函数
from typing import TYPE_CHECKING
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_torch_available,
    is_vision_available,
)

# Map each submodule to the public names it provides; handed to _LazyModule at the bottom of the file.
_import_structure = {
    "configuration_efficientformer": [
        "EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "EfficientFormerConfig",
    ]
}

# Optional backends: a submodule is registered only when its dependency is available.
if is_vision_available():
    _import_structure["image_processing_efficientformer"] = ["EfficientFormerImageProcessor"]

if is_torch_available():
    _import_structure["modeling_efficientformer"] = [
        "EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "EfficientFormerForImageClassification",
        "EfficientFormerForImageClassificationWithTeacher",
        "EfficientFormerModel",
        "EfficientFormerPreTrainedModel",
    ]

if is_tf_available():
    _import_structure["modeling_tf_efficientformer"] = [
        "TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFEfficientFormerForImageClassification",
        "TFEfficientFormerForImageClassificationWithTeacher",
        "TFEfficientFormerModel",
        "TFEfficientFormerPreTrainedModel",
    ]

if TYPE_CHECKING:
    # Static type checkers get real imports, mirroring the availability checks above.
    from .configuration_efficientformer import EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, EfficientFormerConfig

    if is_vision_available():
        from .image_processing_efficientformer import EfficientFormerImageProcessor

    if is_torch_available():
        from .modeling_efficientformer import (
            EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            EfficientFormerForImageClassification,
            EfficientFormerForImageClassificationWithTeacher,
            EfficientFormerModel,
            EfficientFormerPreTrainedModel,
        )

    if is_tf_available():
        from .modeling_tf_efficientformer import (
            TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFEfficientFormerForImageClassification,
            TFEfficientFormerForImageClassificationWithTeacher,
            TFEfficientFormerModel,
            TFEfficientFormerPreTrainedModel,
        )
else:
    import sys

    # At runtime, replace this module in sys.modules with a _LazyModule built from _import_structure.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\efficientnet\configuration_efficientnet.py`

# coding=utf-8
# Copyright 2023 Google Research, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" EfficientNet model configuration"""

# 导入 OrderedDict 和 Mapping 类型
from collections import OrderedDict
from typing import List, Mapping

# 导入版本控制的模块
from packaging import version

# 导入预训练配置和 ONNX 配置
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
# 导入日志工具
from ...utils import logging

# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Maps a pretrained checkpoint identifier to the URL of its config.json.
EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "google/efficientnet-b7": "https://huggingface.co/google/efficientnet-b7/resolve/main/config.json",
}


# EfficientNet configuration class, derived from PretrainedConfig.
class EfficientNetConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`EfficientNetModel`]. It is used to instantiate an
    EfficientNet model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the EfficientNet
    [google/efficientnet-b7](https://huggingface.co/google/efficientnet-b7) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    List-valued arguments are copied when stored, so mutating a list on one configuration never affects the
    defaults or another configuration.

    Example:
    ```
    >>> from transformers import EfficientNetConfig, EfficientNetModel

    >>> # Initializing a EfficientNet efficientnet-b7 style configuration
    >>> configuration = EfficientNetConfig()

    >>> # Initializing a model (with random weights) from the efficientnet-b7 style configuration
    >>> model = EfficientNetModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # Identifier of this model family.
    model_type = "efficientnet"

    def __init__(
        self,
        num_channels: int = 3,
        image_size: int = 600,
        width_coefficient: float = 2.0,
        depth_coefficient: float = 3.1,
        depth_divisor: int = 8,
        kernel_sizes: List[int] = [3, 3, 5, 3, 5, 5, 3],
        in_channels: List[int] = [32, 16, 24, 40, 80, 112, 192],
        out_channels: List[int] = [16, 24, 40, 80, 112, 192, 320],
        depthwise_padding: List[int] = [],
        strides: List[int] = [1, 2, 2, 2, 1, 2, 1],
        num_block_repeats: List[int] = [1, 2, 2, 3, 3, 4, 1],
        expand_ratios: List[int] = [1, 6, 6, 6, 6, 6, 6],
        squeeze_expansion_ratio: float = 0.25,
        hidden_act: str = "swish",
        hidden_dim: int = 2560,
        pooling_type: str = "mean",
        initializer_range: float = 0.02,
        batch_norm_eps: float = 0.001,
        batch_norm_momentum: float = 0.99,
        dropout_rate: float = 0.5,
        drop_connect_rate: float = 0.2,
        **kwargs,
    ):
        """Store the EfficientNet hyper-parameters; remaining keyword arguments go to the parent class."""
        super().__init__(**kwargs)

        self.num_channels = num_channels  # number of image channels
        self.image_size = image_size  # image size
        self.width_coefficient = width_coefficient  # width scaling coefficient
        self.depth_coefficient = depth_coefficient  # depth scaling coefficient
        self.depth_divisor = depth_divisor  # depth divisor

        # The signature defaults are list objects created once at definition time.
        # Storing them by reference would let a mutation on one config (e.g.
        # `config.depthwise_padding.append(...)`) change the defaults for every later
        # instance, so each list argument is copied before it is stored.
        self.kernel_sizes = list(kernel_sizes)  # kernel size per stage
        self.in_channels = list(in_channels)  # input channels per stage
        self.out_channels = list(out_channels)  # output channels per stage
        self.depthwise_padding = list(depthwise_padding)  # depthwise-convolution padding list
        self.strides = list(strides)  # stride per stage
        self.num_block_repeats = list(num_block_repeats)  # block repetitions per stage
        self.expand_ratios = list(expand_ratios)  # expansion ratio per stage

        self.squeeze_expansion_ratio = squeeze_expansion_ratio  # squeeze expansion ratio
        self.hidden_act = hidden_act  # hidden activation name
        self.hidden_dim = hidden_dim  # hidden dimension
        self.pooling_type = pooling_type  # pooling type
        self.initializer_range = initializer_range  # weight initializer range
        self.batch_norm_eps = batch_norm_eps  # batch-norm epsilon
        self.batch_norm_momentum = batch_norm_momentum  # batch-norm momentum
        self.dropout_rate = dropout_rate  # dropout rate
        self.drop_connect_rate = drop_connect_rate  # drop-connect rate
        # Derived value: four times the total number of block repetitions.
        self.num_hidden_layers = sum(num_block_repeats) * 4
# ONNX export configuration for EfficientNet, derived from OnnxConfig.
class EfficientNetOnnxConfig(OnnxConfig):
    # Parsed version "1.11", exposed under the name torch_onnx_minimum_version.
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return an ordered mapping from input name to its axis-index -> axis-name table."""
        axes_by_input = OrderedDict()
        # pixel_values has four named axes: batch, channels, height, width.
        axes_by_input["pixel_values"] = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        return axes_by_input

    @property
    def atol_for_validation(self) -> float:
        """Return the absolute tolerance value (1e-5) used for validation."""
        return 1e-5

`.\models\efficientnet\convert_efficientnet_to_pytorch.py`

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert EfficientNet checkpoints from the original repository.

URL: https://github.com/keras-team/keras/blob/v2.11.0/keras/applications/efficientnet.py"""

import argparse
import json
import os

import numpy as np
import PIL
import requests
import tensorflow.keras.applications.efficientnet as efficientnet
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from tensorflow.keras.preprocessing import image

from transformers import (
    EfficientNetConfig,
    EfficientNetForImageClassification,
    EfficientNetImageProcessor,
)
from transformers.utils import logging

# Set the logging verbosity to info level.
logging.set_verbosity_info()
# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Maps each variant key ("b0" ... "b7") to the corresponding Keras EfficientNet class.
model_classes = {
    "b0": efficientnet.EfficientNetB0,
    "b1": efficientnet.EfficientNetB1,
    "b2": efficientnet.EfficientNetB2,
    "b3": efficientnet.EfficientNetB3,
    "b4": efficientnet.EfficientNetB4,
    "b5": efficientnet.EfficientNetB5,
    "b6": efficientnet.EfficientNetB6,
    "b7": efficientnet.EfficientNetB7,
}

# Per-variant settings, keyed by variant name ("b0" ... "b7"). Each entry holds, in this order:
# hidden_dim, width_coef, depth_coef, image_size, dropout_rate and dw_padding (depthwise padding list).
CONFIG_MAP = {
    "b0": dict(hidden_dim=1280, width_coef=1.0, depth_coef=1.0, image_size=224, dropout_rate=0.2, dw_padding=[]),
    "b1": dict(hidden_dim=1280, width_coef=1.0, depth_coef=1.1, image_size=240, dropout_rate=0.2, dw_padding=[16]),
    "b2": dict(
        hidden_dim=1408, width_coef=1.1, depth_coef=1.2, image_size=260, dropout_rate=0.3, dw_padding=[5, 8, 16]
    ),
    "b3": dict(hidden_dim=1536, width_coef=1.2, depth_coef=1.4, image_size=300, dropout_rate=0.3, dw_padding=[5, 18]),
    "b4": dict(hidden_dim=1792, width_coef=1.4, depth_coef=1.8, image_size=380, dropout_rate=0.4, dw_padding=[6]),
    "b5": dict(hidden_dim=2048, width_coef=1.6, depth_coef=2.2, image_size=456, dropout_rate=0.4, dw_padding=[13, 27]),
    "b6": dict(hidden_dim=2304, width_coef=1.8, depth_coef=2.6, image_size=528, dropout_rate=0.5, dw_padding=[31]),
    "b7": dict(hidden_dim=2560, width_coef=2.0, depth_coef=3.1, image_size=600, dropout_rate=0.5, dw_padding=[18]),
}


# Build the EfficientNet configuration for a given variant name.
def get_efficientnet_config(model_name):
    """Return an `EfficientNetConfig` populated for the variant `model_name`.

    Args:
        model_name: Key into `CONFIG_MAP` (e.g. "b0").

    Returns:
        The config with the variant's hyper-parameters applied and with
        `id2label` / `label2id` filled from the downloaded label file.

    Raises:
        KeyError: If `model_name` is not a key of `CONFIG_MAP`.
    """
    config = EfficientNetConfig()

    # Copy the per-variant settings onto the config.
    variant = CONFIG_MAP[model_name]
    config.hidden_dim = variant["hidden_dim"]
    config.width_coefficient = variant["width_coef"]
    config.depth_coefficient = variant["depth_coef"]
    config.image_size = variant["image_size"]
    config.dropout_rate = variant["dropout_rate"]
    config.depthwise_padding = variant["dw_padding"]

    # Download the label file from the Hub dataset repository.
    repo_id = "huggingface/label-files"
    filename = "imagenet-1k-id2label.json"
    label_path = hf_hub_download(repo_id, filename, repo_type="dataset")
    # Read inside a context manager so the handle is always closed, and decode as
    # UTF-8 explicitly instead of relying on the platform's locale encoding.
    with open(label_path, "r", encoding="utf-8") as label_file:
        id2label = json.load(label_file)
    # JSON object keys are strings; the config uses integer ids.
    id2label = {int(k): v for k, v in id2label.items()}

    config.id2label = id2label
    # Reverse mapping from label to id.
    config.label2id = {v: k for k, v in id2label.items()}
    return config


# Download the sample image used to check the converted model's results.
def prepare_img():
    """Fetch a fixed image from the COCO val2017 URL and return it as a PIL image."""
    sample_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # Stream the response and let PIL read directly from the raw byte stream.
    response = requests.get(sample_url, stream=True)
    return Image.open(response.raw)


# Create the image processor that matches a given variant name.
def convert_image_processor(model_name):
    """Return an `EfficientNetImageProcessor` sized for the variant `model_name`.

    The output is square: height and width both equal the variant's
    `image_size` from `CONFIG_MAP`. Center cropping is disabled.
    """
    side = CONFIG_MAP[model_name]["image_size"]
    processor_kwargs = {
        "size": {"height": side, "width": side},
        "image_mean": [0.485, 0.456, 0.406],
        "image_std": [0.47853944, 0.4732864, 0.47434163],
        "do_center_crop": False,
    }
    return EfficientNetImageProcessor(**processor_kwargs)


# Build the mapping from original (TensorFlow) parameter names to Hugging Face parameter names.
def rename_keys(original_param_names):
    """Return a dict mapping original parameter names to Hugging Face names.

    Block identifiers (the text between "block" and the first "_" of names that
    start with "block") are collected, de-duplicated and sorted; the i-th
    identifier maps to `encoder.blocks.{i}`. Only candidate names that actually
    occur in `original_param_names` are kept, each prefixed with
    "efficientnet.". The two `predictions/*` classifier entries are always
    included.

    Args:
        original_param_names: Sequence of original parameter name strings.

    Returns:
        Dict of original name -> Hugging Face name, in candidate order
        (stem, blocks in sorted order, top, classifier).
    """
    # Set lookup keeps the membership test cheap for every candidate below.
    available = set(original_param_names)

    block_ids = sorted(
        {name.split("_")[0].split("block")[1] for name in original_param_names if name.startswith("block")}
    )
    hf_block_index = {block_id: str(index) for index, block_id in enumerate(block_ids)}

    stem_pairs = [
        ("stem_conv/kernel:0", "embeddings.convolution.weight"),
        ("stem_bn/gamma:0", "embeddings.batchnorm.weight"),
        ("stem_bn/beta:0", "embeddings.batchnorm.bias"),
        ("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean"),
        ("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var"),
    ]

    # (suffix after "block{id}_", suffix after "encoder.blocks.{index}.") for every per-block parameter.
    block_suffix_pairs = [
        # expansion convolution + batch norm
        ("expand_conv/kernel:0", "expansion.expand_conv.weight"),
        ("expand_bn/gamma:0", "expansion.expand_bn.weight"),
        ("expand_bn/beta:0", "expansion.expand_bn.bias"),
        ("expand_bn/moving_mean:0", "expansion.expand_bn.running_mean"),
        ("expand_bn/moving_variance:0", "expansion.expand_bn.running_var"),
        # depthwise convolution + batch norm
        ("dwconv/depthwise_kernel:0", "depthwise_conv.depthwise_conv.weight"),
        ("bn/gamma:0", "depthwise_conv.depthwise_norm.weight"),
        ("bn/beta:0", "depthwise_conv.depthwise_norm.bias"),
        ("bn/moving_mean:0", "depthwise_conv.depthwise_norm.running_mean"),
        ("bn/moving_variance:0", "depthwise_conv.depthwise_norm.running_var"),
        # squeeze-and-excite
        ("se_reduce/kernel:0", "squeeze_excite.reduce.weight"),
        ("se_reduce/bias:0", "squeeze_excite.reduce.bias"),
        ("se_expand/kernel:0", "squeeze_excite.expand.weight"),
        ("se_expand/bias:0", "squeeze_excite.expand.bias"),
        # projection convolution + batch norm
        ("project_conv/kernel:0", "projection.project_conv.weight"),
        ("project_bn/gamma:0", "projection.project_bn.weight"),
        ("project_bn/beta:0", "projection.project_bn.bias"),
        ("project_bn/moving_mean:0", "projection.project_bn.running_mean"),
        ("project_bn/moving_variance:0", "projection.project_bn.running_var"),
    ]

    top_pairs = [
        ("top_conv/kernel:0", "encoder.top_conv.weight"),
        ("top_bn/gamma:0", "encoder.top_bn.weight"),
        ("top_bn/beta:0", "encoder.top_bn.bias"),
        ("top_bn/moving_mean:0", "encoder.top_bn.running_mean"),
        ("top_bn/moving_variance:0", "encoder.top_bn.running_var"),
    ]

    # Assemble all candidates in a fixed order: stem, every block, then top.
    candidates = list(stem_pairs)
    for block_id in block_ids:
        hf_prefix = f"encoder.blocks.{hf_block_index[block_id]}"
        candidates.extend(
            (f"block{block_id}_{tf_suffix}", f"{hf_prefix}.{hf_suffix}")
            for tf_suffix, hf_suffix in block_suffix_pairs
        )
    candidates.extend(top_pairs)

    # Keep only the parameters that exist in the original model.
    key_mapping = {
        tf_name: "efficientnet." + hf_name for tf_name, hf_name in candidates if tf_name in available
    }

    # The classifier entries are added unconditionally and carry no "efficientnet." prefix.
    key_mapping["predictions/kernel:0"] = "classifier.weight"
    key_mapping["predictions/bias:0"] = "classifier.bias"

    return key_mapping
# 替换模型参数，将 TensorFlow 模型参数转换为 HuggingFace 模型参数
def replace_params(hf_params, tf_params, key_mapping):
    """
    Copy TensorFlow weights into the matching HuggingFace tensors, in place.

    Args:
        hf_params: Mapping from HuggingFace parameter name to a tensor. Every matched tensor is overwritten
            through `copy_`.
        tf_params: Mapping from TensorFlow variable name to a NumPy array.
        key_mapping: Mapping from TensorFlow variable name to HuggingFace parameter name.

    Raises:
        AssertionError: If a converted array does not have the same shape as its target tensor.

    Lookup errors from `key_mapping` or `hf_params` are not caught and propagate to the caller.
    """
    for tf_name, tf_array in tf_params.items():
        # Names mentioning "normalization" are skipped before any mapping lookup happens.
        if "normalization" in tf_name:
            continue

        target_name = key_mapping[tf_name]
        is_kernel = "kernel" in tf_name

        if is_kernel and "_conv" in tf_name:
            # Convolution kernel: reorder axes (0, 1, 2, 3) -> (3, 2, 0, 1).
            converted = torch.from_numpy(tf_array).permute(3, 2, 0, 1)
        elif "depthwise_kernel" in tf_name:
            # Depthwise kernel: reorder axes (0, 1, 2, 3) -> (2, 3, 0, 1).
            converted = torch.from_numpy(tf_array).permute(2, 3, 0, 1)
        elif is_kernel:
            # Any other kernel: reverse the axis order.
            converted = torch.from_numpy(np.transpose(tf_array))
        else:
            # Everything else is copied as is.
            converted = torch.from_numpy(tf_array)

        # Overwrite the HuggingFace tensor only if the shapes agree.
        target = hf_params[target_name]
        assert target.shape == converted.shape
        target.copy_(converted)


@torch.no_grad()
def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_model, push_to_hub):
    """
    Copy/paste/tweak model's weights to our EfficientNet structure.

    Args:
        model_name: Key used to look up the original model constructor in `model_classes` and the input size
            in `CONFIG_MAP`.
        pytorch_dump_folder_path: Directory the converted model and image processor are written to when
            `save_model` is truthy. Missing parent directories are created.
        save_model: Whether to save the converted model and image processor locally.
        push_to_hub: Whether to push both to the Hub under the name `efficientnet-{model_name}`.

    Raises:
        AssertionError: If the logits of the original and converted models are not close
            (`np.allclose` with `atol=1e-3`).
    """
    # Load the original model
    original_model = model_classes[model_name](
        include_top=True,
        weights="imagenet",
        input_tensor=None,
        input_shape=None,
        pooling=None,
        classes=1000,
        classifier_activation="softmax",
    )

    # Gather trainable and non-trainable variables into one name -> array dict
    tf_params = original_model.trainable_variables
    tf_non_train_params = original_model.non_trainable_variables
    tf_params = {param.name: param.numpy() for param in tf_params}
    for param in tf_non_train_params:
        tf_params[param.name] = param.numpy()
    tf_param_names = list(tf_params.keys())

    # Build the HuggingFace model whose state dict receives the weights
    config = get_efficientnet_config(model_name)
    hf_model = EfficientNetForImageClassification(config).eval()
    hf_params = hf_model.state_dict()

    # Create the source-to-target parameter name mapping
    print("Converting parameters...")
    key_mapping = rename_keys(tf_param_names)

    # Copy the TensorFlow values into the HuggingFace tensors in place
    replace_params(hf_params, tf_params, key_mapping)

    # Initialize the image processor and preprocess the test image
    preprocessor = convert_image_processor(model_name)
    inputs = preprocessor(images=prepare_img(), return_tensors="pt")

    # Inference with the converted model
    hf_model.eval()
    with torch.no_grad():
        outputs = hf_model(**inputs)
    hf_logits = outputs.logits.detach().numpy()

    # Inference with the original model on the same image, resized with nearest-neighbour resampling
    original_model.trainable = False
    image_size = CONFIG_MAP[model_name]["image_size"]
    img = prepare_img().resize((image_size, image_size), resample=PIL.Image.NEAREST)
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    original_logits = original_model.predict(x)

    # Check that both models produce matching outputs
    assert np.allclose(original_logits, hf_logits, atol=1e-3), "The predicted logits are not the same."
    print("Model outputs match!")

    if save_model:
        # `makedirs` also creates missing parents and tolerates an existing directory,
        # so a nested output path no longer fails the way `os.mkdir` did.
        os.makedirs(pytorch_dump_folder_path, exist_ok=True)
        # Save the converted model and image processor
        hf_model.save_pretrained(pytorch_dump_folder_path)
        preprocessor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        # Push the image processor and model under the name efficientnet-<model_name>
        print(f"Pushing converted {model_name} to the hub...")
        model_name = f"efficientnet-{model_name}"
        preprocessor.push_to_hub(model_name)
        hf_model.push_to_hub(model_name)
if __name__ == "__main__":
    # Script entry point: read the command-line options, then run the conversion.
    cli = argparse.ArgumentParser()

    # Options that carry a value (each has a default)
    cli.add_argument(
        "--model_name",
        type=str,
        default="b0",
        help="Version name of the EfficientNet model you want to convert, select from [b0, b1, b2, b3, b4, b5, b6, b7].",
    )
    cli.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default="hf_model",
        help="Path to the output PyTorch model directory.",
    )

    # Boolean switches, off unless passed
    for flag, description in (
        ("--save_model", "Save model to local"),
        ("--push_to_hub", "Push model and image processor to the hub"),
    ):
        cli.add_argument(flag, action="store_true", help=description)

    options = cli.parse_args()

    convert_efficientnet_checkpoint(
        model_name=options.model_name,
        pytorch_dump_folder_path=options.pytorch_dump_folder_path,
        save_model=options.save_model,
        push_to_hub=options.push_to_hub,
    )

`.\models\efficientnet\image_processing_efficientnet.py`

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image processor class for EfficientNet.
"""

from typing import Dict, List, Optional, Union  # 导入需要的类型提示

import numpy as np  # 导入 NumPy 库

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict  # 导入图像处理相关的模块和函数
from ...image_transforms import rescale, resize, to_channel_dimension_format  # 导入图像变换相关函数
from ...image_utils import (  # 导入图像处理工具函数
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging  # 导入工具函数和模块

if is_vision_available():  # 如果视觉处理可用
    import PIL  # 导入 PIL 库用于图像处理

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


class EfficientNetImageProcessor(BaseImageProcessor):
    r"""
    Constructs a EfficientNet image processor.
    
    This class inherits from BaseImageProcessor and is specialized for EfficientNet models.
    It provides methods for preprocessing images before feeding them into an EfficientNet model.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in `preprocess`.
        size (`Dict[str, int]` *optional*, defaults to `{"height": 346, "width": 346}`):
            Size of the image after `resize`. Can be overridden by `size` in `preprocess`.
        resample (`PILImageResampling` filter, *optional*, defaults to 0):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
        do_center_crop (`bool`, *optional*, defaults to `False`):
            Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
            is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`.
        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 289, "width": 289}`):
            Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        rescale_offset (`bool`, *optional*, defaults to `False`):
            Whether to rescale the image between [-scale_range, scale_range] instead of [0, scale_range]. Can be
            overridden by the `rescale_offset` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        include_top (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image again. Should be set to True if the inputs are used for image classification.
    """
    # 定义模型输入名称列表，只包含一个元素："pixel_values"
    model_input_names = ["pixel_values"]
    # 初始化函数，设置图像处理器的各项参数
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PIL.Image.NEAREST,
        do_center_crop: bool = False,
        crop_size: Dict[str, int] = None,
        rescale_factor: Union[int, float] = 1 / 255,
        rescale_offset: bool = False,
        do_rescale: bool = True,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        include_top: bool = True,
        **kwargs,
    ) -> None:
        """
        Store the preprocessing defaults on the instance.

        `size` falls back to 346x346 and `crop_size` to 289x289 when omitted; both are passed through
        `get_size_dict`. `image_mean` / `image_std` fall back to `IMAGENET_STANDARD_MEAN` /
        `IMAGENET_STANDARD_STD`. Extra keyword arguments are forwarded to the parent constructor.
        """
        super().__init__(**kwargs)

        # Fill in the documented fallbacks for arguments left as None.
        if size is None:
            size = {"height": 346, "width": 346}
        resize_dims = get_size_dict(size)

        if crop_size is None:
            crop_size = {"height": 289, "width": 289}
        crop_dims = get_size_dict(crop_size, param_name="crop_size")

        if image_mean is None:
            image_mean = IMAGENET_STANDARD_MEAN
        if image_std is None:
            image_std = IMAGENET_STANDARD_STD

        # Resizing and cropping
        self.do_resize = do_resize
        self.size = resize_dims
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_dims

        # Rescaling
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.rescale_offset = rescale_offset

        # Normalization
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.include_top = include_top

        # Names of the processor keys treated as valid.
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "rescale_offset",
            "do_normalize",
            "image_mean",
            "image_std",
            "include_top",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    # 从transformers.models.vit.image_processing_vit.ViTImageProcessor.resize复制，将PILImageResampling.BILINEAR更改为PILImageResampling.NEAREST
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: Optional[PILImageResampling] = PILImageResampling.NEAREST,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.NEAREST`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.NEAREST`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.

        Raises:
            ValueError: If the size dictionary returned by `get_size_dict` lacks `height` or `width`.
        """
        # Pass `size` through `get_size_dict`, then require both explicit dimensions.
        size = get_size_dict(size)
        if "height" not in size or "width" not in size:
            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
        output_size = (size["height"], size["width"])
        # Delegate to the module-level `resize` helper; extra keyword arguments are forwarded unchanged.
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
    ):
        """
        Rescale an image by a scale factor.

        If `offset` is `True`, the image has its values rescaled by `scale` and then offset by 1. If `scale` is
        1/127.5, the image is rescaled between [-1, 1].
            image = image * scale - 1

        If `offset` is `False`, and `scale` is 1/255, the image is rescaled between [0, 1].
            image = image * scale

        Args:
            image (`np.ndarray`):
                Image to rescale.
            scale (`int` or `float`):
                Scale to apply to the image.
            offset (`bool`, *optional*):
                Whether to scale the image in both negative and positive directions.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # 调用外部函数 `rescale` 对图像进行缩放
        rescaled_image = rescale(
            image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs
        )

        # 如果需要进行偏移处理
        if offset:
            # 将图像数值做偏移处理
            rescaled_image = rescaled_image - 1

        # 返回经过缩放和可能的偏移处理后的图像
        return rescaled_image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample=None,
        do_center_crop: bool = None,
        crop_size: Dict[str, int] = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        rescale_offset: bool = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        include_top: bool = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,

`.\models\efficientnet\modeling_efficientnet.py`

# coding=utf-8
# Copyright 2023 Google Research, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch EfficientNet model."""


import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_efficientnet import EfficientNetConfig


logger = logging.get_logger(__name__)

# General docstring
# NOTE(review): the underscore-prefixed names below look like inputs to the docstring decorators imported
# above; their call sites are outside this view, so confirm before renaming or removing any of them.
_CONFIG_FOR_DOC = "EfficientNetConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "google/efficientnet-b7"
# NOTE(review): presumably the expected output shape of the base model for the checkpoint above — TODO confirm.
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "google/efficientnet-b7"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

# Checkpoint names listed for this architecture.
EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/efficientnet-b7",
    # See all EfficientNet models at https://huggingface.co/models?filter=efficientnet
]


EFFICIENTNET_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`EfficientNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Documents the inputs this image model takes: pixel values plus the two output-control flags.
# The previous text described text-model inputs (`input_ids`, `attention_mask`, a tokenizer example),
# and the real argument list had ended up outside the string literal.
EFFICIENTNET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`AutoImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# 定义一个函数，根据 EfficientNet 的配置和通道数，调整滤波器的数量
def round_filters(config: EfficientNetConfig, num_channels: int):
    r"""
    Scale a channel count by the width coefficient and round it to a multiple of the depth divisor.

    Args:
        config: Supplies `depth_divisor` and `width_coefficient`.
        num_channels: Channel count before scaling.

    Returns:
        `int`: The scaled count rounded to the nearest multiple of `depth_divisor`, never below one divisor,
        and raised by one divisor if rounding fell under 90% of the scaled value.
    """
    step = config.depth_divisor
    scaled = num_channels * config.width_coefficient

    # Round half-up to a multiple of `step`, with `step` itself as the floor.
    rounded = int(scaled + step / 2) // step * step
    rounded = max(step, rounded)

    # Rounding down must not lose more than 10% of the scaled width.
    if rounded < 0.9 * scaled:
        rounded += step

    return int(rounded)


# 定义一个函数，用于计算深度可分离卷积的填充值的实用工具函数
def correct_pad(kernel_size: Union[int, Tuple], adjust: bool = True):
    r"""
    Compute a 4-tuple of padding amounts from a kernel size.

    Args:
        kernel_size: A single int (used for both dimensions) or a 2-element tuple.
        adjust: When truthy, the first and third entries are reduced by one.

    Returns:
        `(k1 // 2 - 1, k1 // 2, k0 // 2 - 1, k0 // 2)` when `adjust` is truthy, otherwise
        `(k1 // 2, k1 // 2, k0 // 2, k0 // 2)`, where `k0, k1 = kernel_size`.
    """
    if isinstance(kernel_size, int):
        kernel_size = (kernel_size, kernel_size)

    half_first = kernel_size[0] // 2
    half_second = kernel_size[1] // 2

    # With `adjust`, the leading side of each pair gets one pixel less than the trailing side.
    shrink = 1 if adjust else 0
    return (half_second - shrink, half_second, half_first - shrink, half_first)


class EfficientNetEmbeddings(nn.Module):
    r"""
    A module that corresponds to the stem module of the original work.
    """

    def __init__(self, config: EfficientNetConfig):
        super().__init__()

        stem_channels = round_filters(config, 32)
        self.out_dim = stem_channels

        # Explicit zero padding is applied first; the convolution itself uses "valid" (no) padding.
        self.padding = nn.ZeroPad2d(padding=(0, 1, 0, 1))
        self.convolution = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=stem_channels,
            kernel_size=3,
            stride=2,
            padding="valid",
            bias=False,
        )
        self.batchnorm = nn.BatchNorm2d(
            num_features=stem_channels, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
        )
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Apply padding, convolution, batch norm and the activation, in that order."""
        padded = self.padding(pixel_values)
        normalized = self.batchnorm(self.convolution(padded))
        return self.activation(normalized)


class EfficientNetDepthwiseConv2d(nn.Conv2d):
    """
    `nn.Conv2d` with `groups` fixed to `in_channels` and `out_channels = in_channels * depth_multiplier`.
    """

    def __init__(
        self,
        in_channels,
        depth_multiplier=1,
        kernel_size=3,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
        padding_mode="zeros",
    ):
        # One group per input channel; every remaining option is forwarded to `nn.Conv2d` unchanged.
        super().__init__(
            in_channels,
            in_channels * depth_multiplier,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=bias,
            padding_mode=padding_mode,
        )


class EfficientNetExpansionLayer(nn.Module):
    r"""
    This corresponds to the expansion phase of each block in the original implementation.
    """

    def __init__(self, config: EfficientNetConfig, in_dim: int, out_dim: int, stride: int):
        # `stride` is accepted but not used by this layer.
        super().__init__()

        # 1x1 convolution without bias that changes the channel count from `in_dim` to `out_dim`.
        self.expand_conv = nn.Conv2d(
            in_channels=in_dim,
            out_channels=out_dim,
            kernel_size=1,
            padding="same",
            bias=False,
        )
        # NOTE(review): unlike the other batch-norm layers in this file, no `momentum` is passed here,
        # so the PyTorch default applies — confirm this is intended.
        self.expand_bn = nn.BatchNorm2d(num_features=out_dim, eps=config.batch_norm_eps)
        self.expand_act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        """Apply the expansion convolution, then batch norm, then the activation."""
        expanded = self.expand_conv(hidden_states)
        normalized = self.expand_bn(expanded)
        return self.expand_act(normalized)
# 定义 EfficientNet 模型的深度可分离卷积层
class EfficientNetDepthwiseLayer(nn.Module):
    r"""
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    """

    def __init__(
        self,
        config: EfficientNetConfig,
        in_dim: int,
        stride: int,
        kernel_size: int,
        adjust_padding: bool,
    ):
        super().__init__()
        self.stride = stride

        # Stride-2 layers are padded explicitly in `forward`, so their convolution uses "valid";
        # every other stride uses "same" padding inside the convolution.
        if self.stride == 2:
            conv_padding_mode = "valid"
        else:
            conv_padding_mode = "same"
        explicit_padding = correct_pad(kernel_size, adjust=adjust_padding)

        self.depthwise_conv_pad = nn.ZeroPad2d(padding=explicit_padding)
        self.depthwise_conv = EfficientNetDepthwiseConv2d(
            in_dim, kernel_size=kernel_size, stride=stride, padding=conv_padding_mode, bias=False
        )
        self.depthwise_norm = nn.BatchNorm2d(
            num_features=in_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
        )
        self.depthwise_act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        """Pad when the stride is 2, then apply depthwise convolution, batch norm and activation."""
        if self.stride == 2:
            hidden_states = self.depthwise_conv_pad(hidden_states)

        convolved = self.depthwise_conv(hidden_states)
        return self.depthwise_act(self.depthwise_norm(convolved))


# 定义 EfficientNet 模型的 Squeeze and Excite 层
class EfficientNetSqueezeExciteLayer(nn.Module):
    r"""
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    """

    def __init__(self, config: EfficientNetConfig, in_dim: int, expand_dim: int, expand: bool = False):
        super().__init__()
        # Channel count of the gated tensor: `expand_dim` when `expand` is truthy, else `in_dim`.
        if expand:
            self.dim = expand_dim
        else:
            self.dim = in_dim
        # The bottleneck width is always derived from `in_dim` and is at least 1.
        self.dim_se = max(1, int(in_dim * config.squeeze_expansion_ratio))

        # Global average pooling down to a 1x1 map per channel.
        self.squeeze = nn.AdaptiveAvgPool2d(output_size=1)
        # 1x1 convolutions: dim -> dim_se -> dim.
        self.reduce = nn.Conv2d(
            in_channels=self.dim,
            out_channels=self.dim_se,
            kernel_size=1,
            padding="same",
        )
        self.expand = nn.Conv2d(
            in_channels=self.dim_se,
            out_channels=self.dim,
            kernel_size=1,
            padding="same",
        )
        self.act_reduce = ACT2FN[config.hidden_act]
        self.act_expand = nn.Sigmoid()

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        """Multiply the input by a sigmoid gate computed from its pooled features."""
        pooled = self.squeeze(hidden_states)
        reduced = self.act_reduce(self.reduce(pooled))
        gate = self.act_expand(self.expand(reduced))
        return torch.mul(hidden_states, gate)


# Final (projection) phase of an EfficientNet block
class EfficientNetFinalBlockLayer(nn.Module):
    r"""
    This corresponds to the final phase of each block in the original implementation.

    Args:
        config ([`EfficientNetConfig`]):
            Model configuration class; `batch_norm_eps` and `batch_norm_momentum` are read from it.
        in_dim (`int`):
            Number of input channels of the projection convolution.
        out_dim (`int`):
            Number of output channels of the projection convolution.
        stride (`int`):
            Stride of the enclosing block; dropout and the residual sum are only applied when it is 1.
        drop_rate (`float`):
            Dropout probability.
        id_skip (`bool`):
            When `True`, dropout and the residual sum are skipped.
    """

    def __init__(
        self, config: EfficientNetConfig, in_dim: int, out_dim: int, stride: int, drop_rate: float, id_skip: bool
    ):
        super().__init__()
        # Dropout + residual sum happen only for stride-1 blocks that are not flagged with `id_skip`.
        self.apply_dropout = not (id_skip or stride != 1)
        # Bias-free 1x1 projection from `in_dim` to `out_dim` channels.
        self.project_conv = nn.Conv2d(in_dim, out_dim, kernel_size=1, padding="same", bias=False)
        self.project_bn = nn.BatchNorm2d(out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum)
        self.dropout = nn.Dropout(p=drop_rate)

    def forward(self, embeddings: torch.FloatTensor, hidden_states: torch.FloatTensor) -> torch.Tensor:
        """
        Project `hidden_states` and, when enabled, apply dropout and add `embeddings`.

        Args:
            embeddings (`torch.FloatTensor`):
                Tensor added to the projected features when `self.apply_dropout` is set.
            hidden_states (`torch.FloatTensor`):
                Features to project.

        Returns:
            `torch.Tensor`: The projected (and optionally residual-summed) features.
        """
        projected = self.project_bn(self.project_conv(hidden_states))
        if not self.apply_dropout:
            return projected
        return self.dropout(projected) + embeddings
# One EfficientNet block: expansion, depthwise convolution, squeeze-excite and projection
class EfficientNetBlock(nn.Module):
    r"""
    This corresponds to the expansion and depthwise convolution phase of each block in the original implementation.

    Args:
        config ([`EfficientNetConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    """

    def __init__(
        self,
        config: EfficientNetConfig,
        in_dim: int,
        out_dim: int,
        stride: int,
        expand_ratio: int,
        kernel_size: int,
        drop_rate: float,
        id_skip: bool,
        adjust_padding: bool,
    ):
        super().__init__()
        self.expand_ratio = expand_ratio
        # An expansion phase exists only when the ratio differs from 1.
        self.expand = self.expand_ratio != 1
        expand_in_dim = in_dim * expand_ratio
        # Channel count seen by the depthwise and projection layers.
        inner_dim = expand_in_dim if self.expand else in_dim

        if self.expand:
            self.expansion = EfficientNetExpansionLayer(
                config=config, in_dim=in_dim, out_dim=expand_in_dim, stride=stride
            )

        self.depthwise_conv = EfficientNetDepthwiseLayer(
            config=config,
            in_dim=inner_dim,
            stride=stride,
            kernel_size=kernel_size,
            adjust_padding=adjust_padding,
        )

        self.squeeze_excite = EfficientNetSqueezeExciteLayer(
            config=config, in_dim=in_dim, expand_dim=expand_in_dim, expand=self.expand
        )

        self.projection = EfficientNetFinalBlockLayer(
            config=config,
            in_dim=inner_dim,
            out_dim=out_dim,
            stride=stride,
            drop_rate=drop_rate,
            id_skip=id_skip,
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        """Run expansion (if any), depthwise convolution, squeeze-excite and projection on `hidden_states`."""
        # The untouched block input is handed to the projection layer alongside the processed features.
        block_input = hidden_states

        # Expansion and depthwise convolution phase
        features = hidden_states
        if self.expand_ratio != 1:
            features = self.expansion(features)
        features = self.depthwise_conv(features)

        # Squeeze and excite phase, then final projection
        features = self.squeeze_excite(features)
        return self.projection(block_input, features)
    r"""
    Forward propagates the embeddings through each EfficientNet block.

    Args:
        config ([`EfficientNetConfig`]):
            Model configuration class.
    """
    
    def __init__(self, config: EfficientNetConfig):
        """
        Build the stack of EfficientNet blocks followed by the top convolution, batch norm and activation.

        Args:
            config ([`EfficientNetConfig`]):
                Model configuration class.
        """
        super().__init__()
        self.config = config
        self.depth_coefficient = config.depth_coefficient

        def round_repeats(repeats):
            # Scale the repeat count by the depth coefficient, rounding up.
            return int(math.ceil(self.depth_coefficient * repeats))

        num_stages = len(config.in_channels)
        total_blocks = sum(round_repeats(count) for count in config.num_block_repeats)

        block_index = 0
        blocks = []
        for stage in range(num_stages):
            stage_in_dim = round_filters(config, config.in_channels[stage])
            out_dim = round_filters(config, config.out_channels[stage])
            stage_stride = config.strides[stage]
            kernel_size = config.kernel_sizes[stage]
            expand_ratio = config.expand_ratios[stage]

            for repeat in range(round_repeats(config.num_block_repeats[stage])):
                # Only the first block of a stage uses the stage's stride and input width;
                # the remaining ones use stride 1 and take `out_dim` channels as input.
                is_first = repeat == 0
                blocks.append(
                    EfficientNetBlock(
                        config=config,
                        in_dim=stage_in_dim if is_first else out_dim,
                        out_dim=out_dim,
                        stride=stage_stride if is_first else 1,
                        kernel_size=kernel_size,
                        expand_ratio=expand_ratio,
                        # The drop rate grows linearly with the position of the block in the network.
                        drop_rate=config.drop_connect_rate * block_index / total_blocks,
                        id_skip=is_first,
                        adjust_padding=block_index not in config.depthwise_padding,
                    )
                )
                block_index += 1

        # Register the blocks so their parameters are tracked by PyTorch.
        self.blocks = nn.ModuleList(blocks)

        # Bias-free 1x1 convolution on top of the last stage's output channels.
        self.top_conv = nn.Conv2d(
            out_dim,
            round_filters(config, 1280),
            kernel_size=1,
            padding="same",
            bias=False,
        )
        self.top_bn = nn.BatchNorm2d(
            config.hidden_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
        )
        self.top_activation = ACT2FN[config.hidden_act]

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> BaseModelOutputWithNoAttention:
        """
        Pass `hidden_states` through every block, then through the top convolution, batch norm and activation.

        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the first block.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether to also return the input and the output of every block.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`BaseModelOutputWithNoAttention`] instead of a plain tuple.

        Returns:
            [`BaseModelOutputWithNoAttention`] or `tuple`: the final hidden state and, if requested, the tuple of
            intermediate hidden states.
        """
        # Start the collection with the encoder input when intermediate states are requested.
        collected = [hidden_states] if output_hidden_states else None

        for block in self.blocks:
            hidden_states = block(hidden_states)
            if collected is not None:
                collected.append(hidden_states)

        all_hidden_states = None if collected is None else tuple(collected)

        hidden_states = self.top_activation(self.top_bn(self.top_conv(hidden_states)))

        if return_dict:
            return BaseModelOutputWithNoAttention(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states,
            )

        # Tuple output: entries that are None are dropped.
        outputs = (hidden_states, all_hidden_states)
        return tuple(item for item in outputs if item is not None)
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    
    # 配置类，用于EfficientNet的配置
    config_class = EfficientNetConfig
    # 基础模型前缀，用于标识EfficientNet模型
    base_model_prefix = "efficientnet"
    # 主输入名称，代表模型的像素值输入
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """
        Initialize the weights of `module` in place.

        Linear and 2D convolution layers get normally distributed weights (std `config.initializer_range`) and a
        zero bias; LayerNorm layers get unit weight and zero bias. Any other module is left untouched.
        """
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if not isinstance(module, (nn.Linear, nn.Conv2d)):
            return

        # Slightly different from the TF version which uses truncated_normal for initialization
        # cf https://github.com/pytorch/pytorch/pull/5617
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        bias = module.bias
        if bias is not None:
            bias.data.zero_()


@add_start_docstrings(
    "The bare EfficientNet model outputting raw features without any specific head on top.",
    EFFICIENTNET_START_DOCSTRING,
)
class EfficientNetModel(EfficientNetPreTrainedModel):
    def __init__(self, config: EfficientNetConfig):
        super().__init__(config)
        self.config = config
        # Embedding layer followed by the block encoder
        self.embeddings = EfficientNetEmbeddings(config)
        self.encoder = EfficientNetEncoder(config)

        # Final pooling layer, selected by `config.pooling_type`
        if config.pooling_type == "mean":
            self.pooler = nn.AvgPool2d(config.hidden_dim, ceil_mode=True)
        elif config.pooling_type == "max":
            self.pooler = nn.MaxPool2d(config.hidden_dim, ceil_mode=True)
        else:
            # Report the attribute that was actually checked; the previous message interpolated
            # `config.pooling`, which is not the attribute tested above.
            raise ValueError(f"config.pooling_type must be one of ['mean', 'max'] got {config.pooling_type}")

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(EFFICIENTNET_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
        # Fall back to the configuration when the caller does not set these flags.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first encoder output is the last hidden state, for both tuple and dict-style outputs.
        last_hidden_state = encoder_outputs[0]
        pooled_output = self.pooler(last_hidden_state)

        # Keep only the first two dimensions of the pooled output, e.g. (batch_size, 1280, 1, 1) -> (batch_size, 1280)
        pooled_output = pooled_output.reshape(pooled_output.shape[:2])

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )
"""
EfficientNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g.
for ImageNet.
"""
# 继承自预训练模型基类 EfficientNetPreTrainedModel，用于图像分类任务
class EfficientNetForImageClassification(EfficientNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.config = config

        # Backbone
        self.efficientnet = EfficientNetModel(config)

        # Classifier head: dropout followed by a linear layer, or an identity when there are no labels
        self.dropout = nn.Dropout(p=config.dropout_rate)
        self.classifier = nn.Linear(config.hidden_dim, self.num_labels) if self.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(EFFICIENTNET_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.efficientnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        # The pooled output is the second element of the backbone's tuple output.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Infer the problem type once from the number of labels and the label dtype, and cache it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple output: (loss,) + (logits,) + whatever the backbone returned after the pooled output
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )

`.\models\efficientnet\__init__.py`

# flake8: noqa
# 在本模块中无法忽略“F401 '...' imported but unused”警告，但需要保留其他警告。因此，完全禁用对本模块的检查。

# 版权 2023 年 HuggingFace 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本（“许可证”）许可；
# 除非符合许可证，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“原样”分发软件，
# 没有任何形式的明示或暗示的保证或条件。
# 有关特定语言的权限，请参阅许可证。

from typing import TYPE_CHECKING

# 使用 isort 来合并导入
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# Map each submodule to the public names it provides; passed to `_LazyModule` below.
_import_structure = {
    "configuration_efficientnet": [
        "EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "EfficientNetConfig",
        "EfficientNetOnnxConfig",
    ]
}

# Register the image processor only when the vision dependencies are available.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["image_processing_efficientnet"] = ["EfficientNetImageProcessor"]

# Register the modeling classes only when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_efficientnet"] = [
        "EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST",
        "EfficientNetForImageClassification",
        "EfficientNetModel",
        "EfficientNetPreTrainedModel",
    ]

# Under static type checking, import the names directly.
if TYPE_CHECKING:
    from .configuration_efficientnet import (
        EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
        EfficientNetConfig,
        EfficientNetOnnxConfig,
    )

    # Same vision-availability guard as above, applied to the direct import.
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_efficientnet import EfficientNetImageProcessor

    # Same torch-availability guard as above, applied to the direct import.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_efficientnet import (
            EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST,
            EfficientNetForImageClassification,
            EfficientNetModel,
            EfficientNetPreTrainedModel,
        )

# At runtime, replace this module with a `_LazyModule` built from `_import_structure`.
else:
    import sys

    # Swap the entry in `sys.modules` for the `_LazyModule` instance.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

`.\models\electra\configuration_electra.py`

# 引入必要的模块和类
from collections import OrderedDict  # 从 collections 模块中引入 OrderedDict 类
from typing import Mapping  # 从 typing 模块中引入 Mapping 类型

# 从相关的模块中导入必要的配置类和函数
from ...configuration_utils import PretrainedConfig  # 从 ...configuration_utils 模块导入 PretrainedConfig 类
from ...onnx import OnnxConfig  # 从 ...onnx 模块导入 OnnxConfig 类
from ...utils import logging  # 从 ...utils 模块导入 logging 工具

# 获取当前模块的 logger 对象
logger = logging.get_logger(__name__)

# Maps each pretrained ELECTRA checkpoint name to the URL of its `config.json`
ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/config.json",
    "google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/config.json",
    "google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/config.json",
    "google/electra-small-discriminator": (
        "https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json"
    ),
    "google/electra-base-discriminator": (
        "https://huggingface.co/google/electra-base-discriminator/resolve/main/config.json"
    ),
    "google/electra-large-discriminator": (
        "https://huggingface.co/google/electra-large-discriminator/resolve/main/config.json"
    ),
}

# Configuration class for ELECTRA models; extends PretrainedConfig
class ElectraConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ElectraModel`] or a [`TFElectraModel`]. It is
    used to instantiate a ELECTRA model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the ELECTRA
    [google/electra-small-discriminator](https://huggingface.co/google/electra-small-discriminator) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Examples:

    ```
    >>> from transformers import ElectraConfig, ElectraModel

    >>> # Initializing a ELECTRA electra-base-uncased style configuration
    >>> configuration = ElectraConfig()

    >>> # Initializing a model (with random weights) from the electra-base-uncased style configuration
    >>> model = ElectraModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "electra"  # model type identifier
    # Store every argument as an attribute of the same name; `pad_token_id` and extra kwargs go to the parent.
    def __init__(
        self,
        vocab_size=30522,  # vocabulary size
        embedding_size=128,  # embedding size
        hidden_size=256,  # hidden size
        num_hidden_layers=12,  # number of hidden layers
        num_attention_heads=4,  # number of attention heads
        intermediate_size=1024,  # intermediate size
        hidden_act="gelu",  # hidden activation function
        hidden_dropout_prob=0.1,  # hidden dropout probability
        attention_probs_dropout_prob=0.1,  # attention-probabilities dropout probability
        max_position_embeddings=512,  # maximum number of position embeddings
        type_vocab_size=2,  # token-type vocabulary size
        initializer_range=0.02,  # initializer range
        layer_norm_eps=1e-12,  # layer-norm epsilon
        summary_type="first",  # summary type
        summary_use_proj=True,  # whether the summary uses a projection
        summary_activation="gelu",  # summary activation function
        summary_last_dropout=0.1,  # summary dropout probability
        pad_token_id=0,  # padding token id
        position_embedding_type="absolute",  # position embedding type
        use_cache=True,  # whether to use the cache
        classifier_dropout=None,  # classifier dropout probability
        **kwargs,
    ):
        # Forward the padding token id and any additional keyword arguments to the parent constructor
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # Store the remaining arguments on the instance
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps

        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_last_dropout = summary_last_dropout
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
# ONNX export configuration for ELECTRA; extends OnnxConfig
class ElectraOnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Return an ordered mapping from each model input name to its dynamic-axis description.

        The "multiple-choice" task gets an additional "choice" axis between "batch" and "sequence".
        """
        axes = {0: "batch", 1: "sequence"}
        if self.task == "multiple-choice":
            axes = {0: "batch", 1: "choice", 2: "sequence"}

        # All three inputs share the same axis mapping object.
        input_names = ("input_ids", "attention_mask", "token_type_ids")
        return OrderedDict((input_name, axes) for input_name in input_names)

`.\models\electra\convert_electra_original_tf_checkpoint_to_pytorch.py`

# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert ELECTRA checkpoint."""


import argparse  # 导入处理命令行参数的模块

import torch  # 导入PyTorch库

from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra  # 导入transformers相关模块
from transformers.utils import logging  # 导入logging模块


logging.set_verbosity_info()  # 设置日志级别为INFO


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):
    """
    Build an ELECTRA PyTorch model from a JSON config, load TensorFlow weights into it and save its state dict.

    Args:
        tf_checkpoint_path: Path to the TensorFlow checkpoint.
        config_file: Path to the JSON configuration file of the model.
        pytorch_dump_path: Destination path for the saved PyTorch state dict.
        discriminator_or_generator: Either `"discriminator"` or `"generator"`.

    Raises:
        ValueError: If `discriminator_or_generator` is neither `"discriminator"` nor `"generator"`.
    """
    # Initialise PyTorch model
    config = ElectraConfig.from_json_file(config_file)
    print(f"Building PyTorch model from configuration: {config}")

    if discriminator_or_generator not in ("discriminator", "generator"):
        raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'")
    # The discriminator uses the pre-training model class, the generator the masked-LM one.
    model_class = ElectraForPreTraining if discriminator_or_generator == "discriminator" else ElectraForMaskedLM
    model = model_class(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_electra(
        model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator
    )

    # Save pytorch-model
    print(f"Save PyTorch model to {pytorch_dump_path}")
    state_dict = model.state_dict()
    torch.save(state_dict, pytorch_dump_path)


if __name__ == "__main__":
    cli = argparse.ArgumentParser()

    # Every option is a required string with no default.
    required_str = {"default": None, "type": str, "required": True}

    cli.add_argument("--tf_checkpoint_path", help="Path to the TensorFlow checkpoint path.", **required_str)
    cli.add_argument(
        "--config_file",
        help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.",
        **required_str,
    )
    cli.add_argument("--pytorch_dump_path", help="Path to the output PyTorch model.", **required_str)
    cli.add_argument(
        "--discriminator_or_generator",
        help=(
            "Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or "
            "'generator'."
        ),
        **required_str,
    )

    options = cli.parse_args()
    # Run the TensorFlow -> PyTorch conversion with the parsed options.
    convert_tf_checkpoint_to_pytorch(
        tf_checkpoint_path=options.tf_checkpoint_path,
        config_file=options.config_file,
        pytorch_dump_path=options.pytorch_dump_path,
        discriminator_or_generator=options.discriminator_or_generator,
    )

`.\models\electra\modeling_electra.py`

# coding=utf-8
# Copyright 2019 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch ELECTRA model."""

import math
import os
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, get_activation
from ...modeling_outputs import (
    BaseModelOutputWithCrossAttentions,
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_electra import ElectraConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "google/electra-small-discriminator"
_CONFIG_FOR_DOC = "ElectraConfig"

ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/electra-small-generator",
    "google/electra-base-generator",
    "google/electra-large-generator",
    "google/electra-small-discriminator",
    "google/electra-base-discriminator",
    "google/electra-large-discriminator",
    # See all ELECTRA models at https://huggingface.co/models?filter=electra
]

def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"):
    """Load the variables of a TensorFlow ELECTRA checkpoint into a PyTorch model.

    Every checkpoint variable name is rewritten to the PyTorch naming scheme, split on "/", and
    used to walk the attributes of `model` until the target parameter is reached. The parameter's
    data is then replaced by the checkpoint array.

    Args:
        model: PyTorch module that receives the weights.
        config: Not used by this function; kept so the signature stays unchanged for callers.
        tf_checkpoint_path: Path to the TensorFlow checkpoint.
        discriminator_or_generator: `"discriminator"` (default) or `"generator"`. With
            `"generator"`, the `generator/` variables are loaded under the `electra` prefix.

    Returns:
        The `model` that was passed in, updated in place.

    Raises:
        ImportError: If `re`, `numpy` or `tensorflow` cannot be imported.
        ValueError: If a checkpoint array and the matching PyTorch parameter differ in shape.
    """
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")

    # Read every variable from the checkpoint before touching the model.
    names = []
    arrays = []
    for name, shape in tf.train.list_variables(tf_path):
        logger.info(f"Loading TF weight {name} with shape {shape}")
        names.append(name)
        arrays.append(tf.train.load_variable(tf_path, name))

    # TensorFlow scope names whose PyTorch attribute has a different name.
    attribute_aliases = {
        "kernel": "weight",
        "gamma": "weight",
        "output_weights": "weight",
        "output_bias": "bias",
        "beta": "bias",
        "squad": "classifier",
    }
    # Scope names such as "layer_3" address element 3 of a module list.
    indexed_scope = re.compile(r"[A-Za-z]+_\d+")

    for name, array in zip(names, arrays):
        # Keep the untouched checkpoint name for log messages.
        original_name: str = name

        try:
            if isinstance(model, ElectraForMaskedLM):
                name = name.replace("electra/embeddings/", "generator/embeddings/")

            if discriminator_or_generator == "generator":
                # Swap the prefixes; the order of these two replacements matters.
                name = name.replace("electra/", "discriminator/")
                name = name.replace("generator/", "electra/")

            name = name.replace("dense_1", "dense_prediction")
            name = name.replace("generator_predictions/output_bias", "generator_lm_head/bias")

            name = name.split("/")
            if any(n in ["global_step", "temperature"] for n in name):
                logger.info(f"Skipping {original_name}")
                continue

            # Walk the model attribute by attribute, following the path components.
            pointer = model
            for m_name in name:
                if indexed_scope.fullmatch(m_name):
                    scope_names = re.split(r"_(\d+)", m_name)
                else:
                    scope_names = [m_name]
                pointer = getattr(pointer, attribute_aliases.get(scope_names[0], scope_names[0]))
                if len(scope_names) >= 2:
                    num = int(scope_names[1])
                    pointer = pointer[num]

            # `m_name` is the last path component at this point.
            if m_name.endswith("_embeddings"):
                pointer = getattr(pointer, "weight")
            elif m_name == "kernel":
                array = np.transpose(array)

            if pointer.shape != array.shape:
                # Both shapes are carried in `args` in addition to the message.
                raise ValueError(
                    f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched",
                    pointer.shape,
                    array.shape,
                )

            logger.info(f"Initialize PyTorch weight {name} {original_name}")
            pointer.data = torch.from_numpy(array)
        except AttributeError as e:
            # The model has no attribute for this variable: best effort, skip it.
            logger.info(f"Skipping {original_name} {name} {e}")
            continue
    return model
# ElectraEmbeddings: builds the embedding layer from word, position and token-type embeddings.
class ElectraEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable names and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids has shape (1, max_position_embeddings); registered as a non-persistent buffer
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # All-zero token type ids with the same shape as position_ids, also non-persistent
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        """Sum the input, token-type and (if absolute) position embeddings, then apply LayerNorm and dropout.

        Args:
            input_ids: Token ids; used to look up word embeddings when `inputs_embeds` is not given.
            token_type_ids: Token type ids; defaults to zeros taken from the registered buffer.
            position_ids: Position ids; defaults to a slice of the registered `position_ids` buffer
                starting at `past_key_values_length`.
            inputs_embeds: Pre-computed input embeddings used instead of `input_ids`.
            past_key_values_length: Offset applied to the default position ids.

        Returns:
            The embedding tensor after LayerNorm and dropout.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # When token_type_ids is not passed, fall back to the all-zero buffer registered in the constructor,
        # expanded to the batch size; build a zero tensor directly if the buffer is missing.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Electra
class ElectraSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention, usable for self-attention and cross-attention."""

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # The hidden size must split evenly across heads, unless the config carries `embedding_size`.
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # An explicit argument wins over the config value; the fallback is "absolute".
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into heads and move the head axis in front of the sequence axis."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Compute attention over `hidden_states` (or over `encoder_hidden_states` for cross-attention).

        Args:
            hidden_states: Input that the queries are projected from.
            attention_mask: Additive mask added to the raw attention scores.
            head_mask: Multiplicative mask applied to the attention probabilities.
            encoder_hidden_states: When given, keys and values are projected from it (cross-attention)
                and `encoder_attention_mask` replaces `attention_mask`.
            encoder_attention_mask: Additive mask used for cross-attention.
            past_key_value: Cached `(key, value)` pair. For self-attention the new keys/values are
                appended to it; for cross-attention it is reused as is.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            A tuple `(context,)`, followed by the attention probabilities when `output_attentions`
            is set, followed by the `(key, value)` pair when the module is a decoder.
        """
        mixed_query_layer = self.query(hidden_states)

        # Cross-attention is signalled by the presence of encoder states: keys and values then come
        # from the encoder and its mask is the one that applies.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # Reuse the cached cross-attention keys and values.
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            # Append the new keys/values to the cached ones along the sequence axis.
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        use_cache = past_key_value is not None
        if self.is_decoder:
            # Returned as the last output so the caller can feed it back on the next step.
            past_key_value = (key_layer, value_layer)

        # Raw attention scores: dot product between queries and keys.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                # With a cache, the query position is the last key position.
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            # Shift distances so they index into the (2 * max_position_embeddings - 1) embedding table.
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalize the scores to probabilities, then apply dropout to them.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        # Merge the heads back into a single hidden dimension.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs

# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
class ElectraSelfOutput(nn.Module):
    """Output stage of the attention block: dense projection, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states`, apply dropout, add `input_tensor` as a residual and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)

# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Electra
# ElectraAttention: attention block (self-attention plus its output projection), an nn.Module subclass.
class ElectraAttention(nn.Module):
    """Attention sub-layer: `ElectraSelfAttention` followed by `ElectraSelfOutput`, with head pruning."""

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = ElectraSelfAttention(config, position_embedding_type=position_embedding_type)
        self.output = ElectraSelfOutput(config)
        # Heads that have already been removed by `prune_heads`.
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the query/key/value and output projections."""
        if len(heads) == 0:
            return
        attn = self.self
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Shrink the three input projections, then the input side of the output projection.
        for projection in ("query", "key", "value"):
            setattr(attn, projection, prune_linear_layer(getattr(attn, projection), index))
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the bookkeeping in sync with the smaller layers.
        attn.num_attention_heads -= len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Run the attention module, then its output projection with `hidden_states` as the residual.

        Returns:
            A tuple whose first element is the projected attention output, followed by every
            additional element the attention module returned.
        """
        attn_results = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        projected = self.output(attn_results[0], hidden_states)
        # Pass through whatever else the attention module produced (e.g. attention probabilities).
        return (projected,) + attn_results[1:]


# Copied from transformers.models.bert.modeling_bert.BertIntermediate
# ElectraIntermediate: first half of the feed-forward block, an nn.Module subclass.
class ElectraIntermediate(nn.Module):
    """Dense layer from `hidden_size` to `intermediate_size` followed by the configured activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # A string is looked up in ACT2FN; any other value is stored as given.
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the dense layer and then the activation function."""
        return self.intermediate_act_fn(self.dense(hidden_states))


# Copied from transformers.models.bert.modeling_bert.BertOutput
# ElectraOutput: second half of the feed-forward block, an nn.Module subclass.
class ElectraOutput(nn.Module):
    """Feed-forward output stage: dense projection back to `hidden_size`, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        out_width = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, out_width)
        self.LayerNorm = nn.LayerNorm(out_width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project `hidden_states`, apply dropout, add `input_tensor` as a residual and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Electra
class ElectraLayer(nn.Module):
    """One encoder/decoder layer: self-attention, optional cross-attention, chunked feed-forward."""

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Axis along which the feed-forward computation is chunked.
        self.seq_len_dim = 1
        self.attention = ElectraAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            # Cross-attention is only accepted on decoder layers.
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = ElectraAttention(config, position_embedding_type="absolute")
        self.intermediate = ElectraIntermediate(config)
        self.output = ElectraOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Run the layer.

        Returns:
            A tuple starting with the layer output, followed by the extra outputs of the attention
            modules, and, for a decoder, ending with the key/value cache.
        """
        has_cache = past_key_value is not None

        # The first two cache entries belong to the self-attention.
        self_attn_cache = past_key_value[:2] if has_cache else None
        self_attn_results = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_cache,
        )
        attention_output = self_attn_results[0]

        if self.is_decoder:
            # For a decoder the last element is the self-attention cache, not an attention output.
            extras = self_attn_results[1:-1]
            present_key_value = self_attn_results[-1]
        else:
            extras = self_attn_results[1:]

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # The last two cache entries belong to the cross-attention.
            cross_attn_cache = past_key_value[-2:] if has_cache else None
            cross_attn_results = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_cache,
                output_attentions,
            )
            attention_output = cross_attn_results[0]
            extras = extras + cross_attn_results[1:-1]

            # Extend the cache with the cross-attention keys/values.
            present_key_value = present_key_value + cross_attn_results[-1]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + extras

        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        """Apply the intermediate and output sub-layers, using `attention_output` as the residual."""
        return self.output(self.intermediate(attention_output), attention_output)
# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Electra
class ElectraEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` `ElectraLayer` modules applied one after another."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        layers = [ElectraLayer(config) for _ in range(config.num_hidden_layers)]
        self.layer = nn.ModuleList(layers)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        """Feed `hidden_states` through every layer, optionally collecting intermediate results.

        Returns:
            A `BaseModelOutputWithPastAndCrossAttentions` when `return_dict` is truthy; otherwise a
            tuple of the collected values with the `None` entries left out.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        checkpointing = self.gradient_checkpointing and self.training
        if checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        # A cache tuple is only built when caching is requested.
        next_decoder_cache = () if use_cache else None

        for idx, layer_module in enumerate(self.layer):
            if output_hidden_states:
                # Record the input of this layer.
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = None if head_mask is None else head_mask[idx]
            past_key_value = None if past_key_values is None else past_key_values[idx]

            layer_args = (
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_value,
                output_attentions,
            )
            if checkpointing:
                layer_outputs = self._gradient_checkpointing_func(layer_module.__call__, *layer_args)
            else:
                layer_outputs = layer_module(*layer_args)

            hidden_states = layer_outputs[0]
            if use_cache:
                # The cache is the last element returned by the layer.
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            # Record the output of the final layer as well.
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            collected = (
                hidden_states,
                next_decoder_cache,
                all_hidden_states,
                all_self_attentions,
                all_cross_attentions,
            )
            return tuple(v for v in collected if v is not None)
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class ElectraDiscriminatorPredictions(nn.Module):
    """Prediction module for the discriminator, made up of two dense layers."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = get_activation(config.hidden_act)
        # Single-output projection: one logit per position.
        self.dense_prediction = nn.Linear(width, 1)
        self.config = config

    def forward(self, discriminator_hidden_states):
        """Return the logits with the trailing size-1 dimension squeezed away."""
        activated = self.activation(self.dense(discriminator_hidden_states))
        return self.dense_prediction(activated).squeeze(-1)


class ElectraGeneratorPredictions(nn.Module):
    """Prediction module for the generator, made up of two dense layers."""

    def __init__(self, config):
        super().__init__()
        embed_width = config.embedding_size
        # The activation is always "gelu" here, independent of the config.
        self.activation = get_activation("gelu")
        self.LayerNorm = nn.LayerNorm(embed_width, eps=config.layer_norm_eps)
        self.dense = nn.Linear(config.hidden_size, embed_width)

    def forward(self, generator_hidden_states):
        """Project to the embedding size, apply the activation, then LayerNorm."""
        projected = self.dense(generator_hidden_states)
        return self.LayerNorm(self.activation(projected))


class ElectraPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Class-level settings: config class, TF weight loader, base model prefix, checkpointing support.
    config_class = ElectraConfig
    load_tf_weights = load_tf_weights_in_electra
    base_model_prefix = "electra"
    supports_gradient_checkpointing = True

    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Both layer types draw their weights from N(0, initializer_range).
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                # The padding row of an embedding is zeroed.
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            # LayerNorm starts with zero bias and unit weight.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@dataclass
class ElectraForPreTrainingOutput(ModelOutput):
    """
    Output type of [`ElectraForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss of the ELECTRA objective.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Prediction scores of the head (scores for each token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
# Shared introductory docstring for the ELECTRA model classes; it is passed to `add_start_docstrings` below.
ELECTRA_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring template for the `forward` inputs. In this file it holds only a newline, so the `.format(...)` call
# applied to it has no placeholder to fill in.
ELECTRA_INPUTS_DOCSTRING = r"""
"""


@add_start_docstrings(
    "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to "
    "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the "
    "hidden size and embedding size are different. "
    ""
    "Both the generator and discriminator checkpoints may be loaded into this model.",
    ELECTRA_START_DOCSTRING,
)
# ElectraModel 类的定义，继承自 ElectraPreTrainedModel
class ElectraModel(ElectraPreTrainedModel):
    # ElectraModel 类的初始化方法
    def __init__(self, config):
        """
        Build the embedding layer, the optional embedding projection and the encoder from `config`.

        Args:
            config: Model configuration; this method reads `embedding_size` and `hidden_size` from it and passes
                the whole object to the sub-modules.
        """
        # Let the parent class perform its own initialization first.
        super().__init__(config)
        # Embedding module.
        self.embeddings = ElectraEmbeddings(config)

        # Only when the embedding size differs from the hidden size is a linear projection created;
        # otherwise the `embeddings_project` attribute is not set at all.
        if config.embedding_size != config.hidden_size:
            self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size)

        # Encoder module.
        self.encoder = ElectraEncoder(config)
        self.config = config
        # Initialize weights and apply final processing.
        self.post_init()

    # 获取输入的词嵌入
    def get_input_embeddings(self):
        """Return the `word_embeddings` attribute of this model's `embeddings` module."""
        embedding_block = self.embeddings
        return embedding_block.word_embeddings

    # 设置输入的词嵌入
    def set_input_embeddings(self, value):
        """Store `value` as the `word_embeddings` attribute of this model's `embeddings` module."""
        setattr(self.embeddings, "word_embeddings", value)

    # 剪枝模型中的注意力头
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    # 向模型的前向方法添加文档字符串，描述了输入参数的格式
    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # 向模型的前向方法添加代码示例的文档字符串，包括了加载检查点、输出类型和配置类
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    # 定义 Transformer 模型的前向传播方法，接收多个输入参数
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,  # 输入的 token IDs，可以为空
        attention_mask: Optional[torch.Tensor] = None,  # 注意力 mask，可以为空
        token_type_ids: Optional[torch.Tensor] = None,  # token 类型 IDs，可以为空
        position_ids: Optional[torch.Tensor] = None,  # 位置 IDs，可以为空
        head_mask: Optional[torch.Tensor] = None,  # 头部 mask，可以为空
        inputs_embeds: Optional[torch.Tensor] = None,  # 输入的嵌入表示，可以为空
        encoder_hidden_states: Optional[torch.Tensor] = None,  # 编码器隐藏状态，可以为空
        encoder_attention_mask: Optional[torch.Tensor] = None,  # 编码器注意力 mask，可以为空
        past_key_values: Optional[List[torch.FloatTensor]] = None,  # 历史的键值对，可以为空
        use_cache: Optional[bool] = None,  # 是否使用缓存，可以为空
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重，可以为空
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态，可以为空
        return_dict: Optional[bool] = None,  # 是否返回字典格式的结果，可以为空
# ElectraClassificationHead 类定义，用于处理句子级别的分类任务
class ElectraClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        """Create the dense layer, GELU activation, dropout and output projection from `config`."""
        super().__init__()
        hidden_size = config.hidden_size
        # Hidden-to-hidden projection applied before the activation.
        self.dense = nn.Linear(hidden_size, hidden_size)
        # Prefer the classifier-specific dropout rate; fall back to the general hidden dropout rate.
        if config.classifier_dropout is not None:
            dropout_rate = config.classifier_dropout
        else:
            dropout_rate = config.hidden_dropout_prob
        self.activation = get_activation("gelu")
        self.dropout = nn.Dropout(dropout_rate)
        # Final projection producing `config.num_labels` outputs.
        self.out_proj = nn.Linear(hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """
        Classify each sequence from the features found at its first position.

        Args:
            features: tensor indexed as `features[:, 0, :]`; only that slice is used.
            **kwargs: accepted and ignored.

        Returns:
            The output of `out_proj` for every item in the batch.
        """
        # Keep only the first position of every sequence.
        first_token = features[:, 0, :]
        # dropout -> dense -> activation -> dropout -> output projection
        hidden = self.dense(self.dropout(first_token))
        hidden = self.dropout(self.activation(hidden))
        return self.out_proj(hidden)


# ElectraForSequenceClassification 类，继承自 ElectraPreTrainedModel 类，用于序列分类任务
@add_start_docstrings(
    """
    ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForSequenceClassification(ElectraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # `forward` reads both the label count and the config when it selects the loss.
        self.num_labels = config.num_labels
        self.config = config
        # ELECTRA backbone followed by the classification head.
        self.electra = ElectraModel(config)
        self.classifier = ElectraClassificationHead(config)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="bhadresh-savani/electra-base-emotion",
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'joy'",
        expected_loss=0.06,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Fall back to the config default when the caller does not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.electra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first element of the backbone output is what the classification head consumes.
        logits = self.classifier(backbone_outputs[0])

        loss = None
        if labels is not None:
            # When no problem type is configured, infer it from the label count and label dtype and
            # store the result on the config so later calls reuse it.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    inferred_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred_type = "single_label_classification"
                else:
                    inferred_type = "multi_label_classification"
                self.config.problem_type = inferred_type

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple output: logits followed by the remaining backbone outputs, with the loss prepended if computed.
        output = (logits,) + backbone_outputs[1:]
        return output if loss is None else (loss,) + output
@add_start_docstrings(
    """
    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForPreTraining(ElectraPreTrainedModel):
    def __init__(self, config):
        """Build the ELECTRA backbone and the discriminator prediction head from `config`."""
        super().__init__(config)

        self.electra = ElectraModel(config)  # ELECTRA backbone
        self.discriminator_predictions = ElectraDiscriminatorPredictions(config)  # discriminator prediction head
        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,



@add_start_docstrings(
    """
    Electra model with a language modeling head on top.

    Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
    the two to have been trained for the masked language modeling task.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForMaskedLM(ElectraPreTrainedModel):
    _tied_weights_keys = ["generator_lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)

        # ELECTRA backbone and the generator prediction module applied to its output.
        self.electra = ElectraModel(config)
        self.generator_predictions = ElectraGeneratorPredictions(config)

        # Linear head from `embedding_size` to `vocab_size`.
        self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
        # Initialize weights and apply final processing.
        self.post_init()

    def get_output_embeddings(self):
        """Return the language modeling head."""
        lm_head = self.generator_lm_head
        return lm_head

    def set_output_embeddings(self, word_embeddings):
        """Replace the language modeling head with `word_embeddings`."""
        setattr(self, "generator_lm_head", word_embeddings)

    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="google/electra-small-generator",
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="[MASK]",
        expected_output="'paris'",
        expected_loss=1.22,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        # Fall back to the config default when the caller does not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.electra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Generator prediction module first, then the linear head that yields vocabulary-sized scores.
        generator_features = self.generator_predictions(backbone_outputs[0])
        prediction_scores = self.generator_lm_head(generator_features)

        loss = None
        if labels is not None:
            # Flatten scores to (-1, vocab_size) and labels to (-1,) before the cross-entropy loss.
            criterion = nn.CrossEntropyLoss()
            flat_scores = prediction_scores.view(-1, self.config.vocab_size)
            loss = criterion(flat_scores, labels.view(-1))

        if return_dict:
            return MaskedLMOutput(
                loss=loss,
                logits=prediction_scores,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple output: scores followed by the remaining backbone outputs, with the loss prepended if computed.
        output = (prediction_scores,) + backbone_outputs[1:]
        return output if loss is None else (loss,) + output
# 定义一个基于 Electra 模型的标记分类器模型
@add_start_docstrings(
    """
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForTokenClassification(ElectraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # ELECTRA backbone.
        self.electra = ElectraModel(config)
        # Prefer the classifier-specific dropout rate; fall back to the general hidden dropout rate.
        if config.classifier_dropout is not None:
            dropout_rate = config.classifier_dropout
        else:
            dropout_rate = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_rate)
        # Linear layer from `hidden_size` to `num_labels`, applied at every position.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="bhadresh-savani/electra-base-discriminator-finetuned-conll03-english",
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="['B-LOC', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC']",
        expected_loss=0.11,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # Fall back to the config default when the caller does not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.electra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Dropout on the first backbone output, then the per-position linear classifier.
        logits = self.classifier(self.dropout(backbone_outputs[0]))

        loss = None
        if labels is not None:
            # Flatten logits to (-1, num_labels) and labels to (-1,) before the cross-entropy loss.
            criterion = CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple output: logits followed by the remaining backbone outputs, with the loss prepended if computed.
        output = (logits,) + backbone_outputs[1:]
        return output if loss is None else (loss,) + output
@add_start_docstrings(
    """
    ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ELECTRA_START_DOCSTRING,
)
# 定义用于问答任务的 ELECTRA 模型，包含一个用于提取式问答任务（如 SQuAD）的跨度分类头部（在隐藏状态输出之上的线性层，
# 用于计算 `span start logits` 和 `span end logits`）。
class ElectraForQuestionAnswering(ElectraPreTrainedModel):
    # 指定配置类
    config_class = ElectraConfig
    # 基础模型前缀
    base_model_prefix = "electra"

    def __init__(self, config):
        """Build the ELECTRA backbone and the linear question-answering output layer from `config`."""
        super().__init__(config)
        self.num_labels = config.num_labels

        # ELECTRA backbone.
        self.electra = ElectraModel(config)
        # Linear output layer from `hidden_size` to `num_labels`.
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="bhadresh-savani/electra-base-squad2",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        qa_target_start_index=11,
        qa_target_end_index=12,
        expected_output="'a nice puppet'",
        expected_loss=2.64,
    )
    # 前向传播方法，接收一系列输入并返回预测的答案开始和结束位置的 logit
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,



@add_start_docstrings(
    """
    ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    ELECTRA_START_DOCSTRING,
)
class ElectraForMultipleChoice(ElectraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # ELECTRA backbone, a sequence summary module, and a linear layer giving one score per input row.
        self.electra = ElectraModel(config)
        self.sequence_summary = SequenceSummary(config)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """

        def _fold_choices(tensor):
            # Collapse every leading dimension into one while keeping the last dimension; pass None through.
            return tensor.view(-1, tensor.size(-1)) if tensor is not None else None

        # Fall back to the config default when the caller does not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # The number of choices is the size of dimension 1, read before any reshaping.
        if input_ids is not None:
            num_choices = input_ids.shape[1]
        else:
            num_choices = inputs_embeds.shape[1]

        input_ids = _fold_choices(input_ids)
        attention_mask = _fold_choices(attention_mask)
        token_type_ids = _fold_choices(token_type_ids)
        position_ids = _fold_choices(position_ids)
        if inputs_embeds is not None:
            # Embeddings keep their last two dimensions.
            inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))

        backbone_outputs = self.electra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Summarize the first backbone output, score it, and regroup the scores per example.
        pooled_output = self.sequence_summary(backbone_outputs[0])
        reshaped_logits = self.classifier(pooled_output).view(-1, num_choices)

        loss = None
        if labels is not None:
            criterion = CrossEntropyLoss()
            loss = criterion(reshaped_logits, labels)

        if return_dict:
            return MultipleChoiceModelOutput(
                loss=loss,
                logits=reshaped_logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple output: logits followed by the remaining backbone outputs, with the loss prepended if computed.
        output = (reshaped_logits,) + backbone_outputs[1:]
        return output if loss is None else (loss,) + output
# 继承自 ElectraPreTrainedModel 类的 ELECTRA 语言模型，添加了用于条件语言建模 fine-tuning 的头部
@add_start_docstrings(
    """ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.""", ELECTRA_START_DOCSTRING
)
class ElectraForCausalLM(ElectraPreTrainedModel):
    # Keys of the weights that are tied.
    _tied_weights_keys = ["generator_lm_head.weight"]

    def __init__(self, config):
        """Build the ELECTRA backbone, the generator prediction module and the language modeling head."""
        super().__init__(config)

        # Warn when the configuration is not flagged as a decoder.
        if not config.is_decoder:
            logger.warning("If you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`")

        self.electra = ElectraModel(config)
        self.generator_predictions = ElectraGeneratorPredictions(config)
        # Linear head from `embedding_size` to `vocab_size`.
        self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)

        # Initialize weights.
        self.init_weights()

    def get_output_embeddings(self):
        """Return the language modeling head."""
        return self.generator_lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language modeling head with `new_embeddings`."""
        self.generator_lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Replace these docstrings with ones in transformers.models.**

        """
        # NOTE(review): the body of `forward` is not present in this excerpt; only the docstring is shown.

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        """
        Assemble the keyword inputs for one generation step.

        This is defined at method level (not nested inside `forward`) so that it is reachable as
        `model.prepare_inputs_for_generation(...)`.

        Args:
            input_ids: token id tensor; its `shape` is used for the default mask and for trimming.
            past_key_values: optional cache; `past_key_values[0][0].shape[2]` is taken as the cached length.
            attention_mask: optional mask; when `None`, an all-ones tensor shaped like `input_ids` is created.
            **model_kwargs: accepted and ignored.

        Returns:
            A dict with the keys `"input_ids"`, `"attention_mask"` and `"past_key_values"`.
        """
        input_shape = input_ids.shape
        # Without an explicit mask, every position is attended to.
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # With a cache, drop the part of `input_ids` that the cache already covers.
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID.
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default behavior: keep only the final ID.
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}

    def _reorder_cache(self, past_key_values, beam_idx):
        """
        Reorder every cached state along dimension 0 according to `beam_idx`.

        Args:
            past_key_values: iterable of per-layer iterables of cached tensors.
            beam_idx: index tensor; it is moved to each cached tensor's device before `index_select`.

        Returns:
            A tuple with one tuple of reordered tensors per layer.
        """
        # Build the result in one pass rather than growing a tuple by repeated concatenation.
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )

Transformers-源码解析-四十三-

Transformers 源码解析（四十三）

.\models\efficientformer\image_processing_efficientformer.py

.\models\efficientformer\modeling_efficientformer.py

.\models\efficientformer\modeling_tf_efficientformer.py

.\models\efficientformer\__init__.py

.\models\efficientnet\configuration_efficientnet.py

.\models\efficientnet\convert_efficientnet_to_pytorch.py

.\models\efficientnet\image_processing_efficientnet.py

.\models\efficientnet\modeling_efficientnet.py

.\models\efficientnet\__init__.py

.\models\electra\configuration_electra.py

.\models\electra\convert_electra_original_tf_checkpoint_to_pytorch.py

.\models\electra\modeling_electra.py

`.\models\efficientformer\image_processing_efficientformer.py`

`.\models\efficientformer\modeling_efficientformer.py`

`.\models\efficientformer\modeling_tf_efficientformer.py`

`.\models\efficientformer\__init__.py`

`.\models\efficientnet\configuration_efficientnet.py`

`.\models\efficientnet\convert_efficientnet_to_pytorch.py`

`.\models\efficientnet\image_processing_efficientnet.py`

`.\models\efficientnet\modeling_efficientnet.py`

`.\models\efficientnet\__init__.py`

`.\models\electra\configuration_electra.py`

`.\models\electra\convert_electra_original_tf_checkpoint_to_pytorch.py`

`.\models\electra\modeling_electra.py`