Transformers Source Code Analysis (Part 53)
.\models\glpn\convert_glpn_to_pytorch.py
import argparse
from collections import OrderedDict
from pathlib import Path
import requests
import torch
from PIL import Image
from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def rename_keys(state_dict):
new_state_dict = OrderedDict()
for key, value in state_dict.items():
if key.startswith("module.encoder"):
key = key.replace("module.encoder", "glpn.encoder")
if key.startswith("module.decoder"):
key = key.replace("module.decoder", "decoder.stages")
if "patch_embed" in key:
idx = key[key.find("patch_embed") + len("patch_embed")]
key = key.replace(f"patch_embed{idx}", f"patch_embeddings.{int(idx)-1}")
if "norm" in key:
key = key.replace("norm", "layer_norm")
if "glpn.encoder.layer_norm" in key:
idx = key[key.find("glpn.encoder.layer_norm") + len("glpn.encoder.layer_norm")]
key = key.replace(f"layer_norm{idx}", f"layer_norm.{int(idx)-1}")
if "layer_norm1" in key:
key = key.replace("layer_norm1", "layer_norm_1")
if "layer_norm2" in key:
key = key.replace("layer_norm2", "layer_norm_2")
if "block" in key:
idx = key[key.find("block") + len("block")]
key = key.replace(f"block{idx}", f"block.{int(idx)-1}")
if "attn.q" in key:
key = key.replace("attn.q", "attention.self.query")
if "attn.proj" in key:
key = key.replace("attn.proj", "attention.output.dense")
if "attn" in key:
key = key.replace("attn", "attention.self")
if "fc1" in key:
key = key.replace("fc1", "dense1")
if "fc2" in key:
key = key.replace("fc2", "dense2")
if "linear_pred" in key:
key = key.replace("linear_pred", "classifier")
if "linear_fuse" in key:
key = key.replace("linear_fuse.conv", "linear_fuse")
key = key.replace("linear_fuse.bn", "batch_norm")
if "linear_c" in key:
idx = key[key.find("linear_c") + len("linear_c")]
key = key.replace(f"linear_c{idx}", f"linear_c.{int(idx)-1}")
if "bot_conv" in key:
key = key.replace("bot_conv", "0.convolution")
if "skip_conv1" in key:
key = key.replace("skip_conv1", "1.convolution")
if "skip_conv2" in key:
key = key.replace("skip_conv2", "2.convolution")
if "fusion1" in key:
key = key.replace("fusion1", "1.fusion")
if "fusion2" in key:
key = key.replace("fusion2", "2.fusion")
if "fusion3" in key:
key = key.replace("fusion3", "3.fusion")
if "fusion" in key and "conv" in key:
key = key.replace("conv", "convolutional_layer")
if key.startswith("module.last_layer_depth"):
key = key.replace("module.last_layer_depth", "head.head")
new_state_dict[key] = value
return new_state_dict
def read_in_k_v(state_dict, config):
for i in range(config.num_encoder_blocks):
for j in range(config.depths[i]):
kv_weight = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.weight")
kv_bias = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.bias")
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.weight"] = kv_weight[: config.hidden_sizes[i], :]
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.bias"] = kv_bias[: config.hidden_sizes[i]]
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.weight"] = kv_weight[config.hidden_sizes[i] :, :]
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.bias"] = kv_bias[config.hidden_sizes[i]:]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
return image
@torch.no_grad()
def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub=False, model_name=None):
"""
Copy/paste/tweak the original checkpoint's weights into our GLPN structure.
"""
config = GLPNConfig(hidden_sizes=[64, 128, 320, 512], decoder_hidden_size=64, depths=[3, 8, 27, 3])
image_processor = GLPNImageProcessor()
image = prepare_img()
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
logger.info("Converting model...")
state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
state_dict = rename_keys(state_dict)
read_in_k_v(state_dict, config)
model = GLPNForDepthEstimation(config)
model.load_state_dict(state_dict)
model.eval()
outputs = model(pixel_values)
predicted_depth = outputs.predicted_depth
if model_name is not None:
if "nyu" in model_name:
expected_slice = torch.tensor(
[[4.4147, 4.0873, 4.0673], [3.7890, 3.2881, 3.1525], [3.7674, 3.5423, 3.4913]]
)
elif "kitti" in model_name:
expected_slice = torch.tensor(
[[3.4291, 2.7865, 2.5151], [3.2841, 2.7021, 2.3502], [3.1147, 2.4625, 2.2481]]
)
else:
raise ValueError(f"Unknown model name: {model_name}")
expected_shape = torch.Size([1, 480, 640])
assert predicted_depth.shape == expected_shape
assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4)
print("Looks ok!")
if push_to_hub:
logger.info("Pushing model and image processor to the hub...")
model.push_to_hub(
repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
organization="nielsr",
commit_message="Add model",
use_temp_dir=True,
)
image_processor.push_to_hub(
repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
organization="nielsr",
commit_message="Add image processor",
use_temp_dir=True,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_path",
default=None,
type=str,
help="Path to the original PyTorch checkpoint (.pth file).",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to upload the model to the HuggingFace hub."
)
parser.add_argument(
"--model_name",
default="glpn-kitti",
type=str,
help="Name of the model in case you're pushing to the hub.",
)
args = parser.parse_args()
convert_glpn_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name)
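# Hypothetical invocation (paths are placeholders), run from the command line:
#   python convert_glpn_to_pytorch.py \
#       --checkpoint_path ./glpn_kitti.pth \
#       --pytorch_dump_folder_path ./glpn-kitti-converted \
#       --model_name glpn-kitti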
.\models\glpn\feature_extraction_glpn.py
"""GLPN 的特征提取器类。"""
import warnings
from ...utils import logging
from .image_processing_glpn import GLPNImageProcessor
logger = logging.get_logger(__name__)
class GLPNFeatureExtractor(GLPNImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class GLPNFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
" use GLPNImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\glpn\image_processing_glpn.py
"""GLPN 的图像处理类。"""
from typing import List, Optional, Union
import numpy as np
import PIL.Image
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class GLPNImageProcessor(BaseImageProcessor):
r"""
Constructs a GLPN image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions, rounding them down to the closest multiple of `size_divisor`.
Can be overridden by `do_resize` in `preprocess`.
size_divisor (`int`, *optional*, defaults to 32):
When `do_resize` is `True`, images are resized so their height and width are rounded down to the closest multiple of `size_divisor`.
Can be overridden by `size_divisor` in `preprocess`.
resample (`PIL.Image` resampling filter, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether or not to apply the scaling factor (to make pixel values floats between 0 and 1). Can be overridden by `do_rescale` in `preprocess`.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size_divisor: int = 32,
resample=PILImageResampling.BILINEAR,
do_rescale: bool = True,
**kwargs,
) -> None:
self.do_resize = do_resize
self.do_rescale = do_rescale
self.size_divisor = size_divisor
self.resample = resample
super().__init__(**kwargs)
self._valid_processor_keys = [
"images",
"do_resize",
"size_divisor",
"resample",
"do_rescale",
"return_tensors",
"data_format",
"input_data_format",
]
def resize(
self,
image: np.ndarray,
size_divisor: int,
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resizes the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor.
If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160).
Args:
image (`np.ndarray`):
The image to resize.
size_divisor (`int`):
The image is resized so its height and width are rounded down to the closest multiple of `size_divisor`.
resample:
`PIL.Image` resampling filter to use when resizing the image, e.g. `PILImageResampling.BILINEAR`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the output image. If `None`, the channel dimension format of the input image is used. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not set, the channel dimension format is inferred from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
Returns:
`np.ndarray`: The resized image.
"""
height, width = get_image_size(image, channel_dim=input_data_format)
new_h = height // size_divisor * size_divisor
new_w = width // size_divisor * size_divisor
image = resize(
image,
(new_h, new_w),
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image
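# Quick numeric check of the rounding used by `resize` above (illustrative, not part of the file):
size_divisor, height, width = 32, 260, 170
new_size = (height // size_divisor * size_divisor, width // size_divisor * size_divisor)  # (256, 160)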
def preprocess(
self,
images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]],
do_resize: Optional[bool] = None,
size_divisor: Optional[int] = None,
resample=None,
do_rescale: Optional[bool] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\glpn\modeling_glpn.py
""" PyTorch GLPN模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_glpn import GLPNConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "GLPNConfig"
_CHECKPOINT_FOR_DOC = "vinvino02/glpn-kitti"
_EXPECTED_OUTPUT_SHAPE = [1, 512, 15, 20]
GLPN_PRETRAINED_MODEL_ARCHIVE_LIST = [
"vinvino02/glpn-kitti",
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in the main path of residual blocks).
Comment by Ross Wightman: this is the same as the DropConnect implementation I created for EfficientNet, etc.
networks; however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate
paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
... I've opted to change the layer and argument names to 'drop path' rather than mix DropConnect as a layer name
and use 'survival rate' as the argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
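# Small sanity sketch (not part of the file): scaling the surviving samples by 1/keep_prob keeps the
# expected value of the output equal to the input.
x = torch.ones(1000, 1)
out = drop_path(x, drop_prob=0.2, training=True)
print(out.mean())  # close to 1.0; roughly 80% of rows survive, each scaled by 1/0.8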
class GLPNDropPath(nn.Module):
"""每个样本(在残差块的主路径中应用时)丢弃路径(随机深度)。"""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class GLPNOverlapPatchEmbeddings(nn.Module):
"""Construct the overlapping patch embeddings."""
def __init__(self, patch_size, stride, num_channels, hidden_size):
super().__init__()
self.proj = nn.Conv2d(
num_channels,
hidden_size,
kernel_size=patch_size,
stride=stride,
padding=patch_size // 2,
)
self.layer_norm = nn.LayerNorm(hidden_size)
def forward(self, pixel_values):
embeddings = self.proj(pixel_values)
_, _, height, width = embeddings.shape
embeddings = embeddings.flatten(2).transpose(1, 2)
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
class GLPNEfficientSelfAttention(nn.Module):
"""SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
paper](https://arxiv.org/abs/2102.12122)."""
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
super().__init__()
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
f"heads ({self.num_attention_heads})"
)
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(self.hidden_size, self.all_head_size)
self.key = nn.Linear(self.hidden_size, self.all_head_size)
self.value = nn.Linear(self.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.sr_ratio = sequence_reduction_ratio
if sequence_reduction_ratio > 1:
self.sr = nn.Conv2d(
hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
)
self.layer_norm = nn.LayerNorm(hidden_size)
def transpose_for_scores(self, hidden_states):
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
hidden_states = hidden_states.view(new_shape)
return hidden_states.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
height,
width,
output_attentions=False,
):
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.sr_ratio > 1:
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = self.sr(hidden_states)
hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
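# Shape sketch (hypothetical numbers, not part of the file): with sequence_reduction_ratio=8 the
# key/value sequence shrinks by a factor of 8**2, which keeps attention affordable at high resolution.
height, width, sr_ratio = 120, 160, 8
query_len = height * width  # 19200 query positions
kv_len = (height // sr_ratio) * (width // sr_ratio)  # 300 key/value positions after self.sr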
class GLPNSelfOutput(nn.Module):
def __init__(self, config, hidden_size):
super().__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class GLPNAttention(nn.Module):
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
super().__init__()
self.self = GLPNEfficientSelfAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
)
self.output = GLPNSelfOutput(config, hidden_size=hidden_size)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, height, width, output_attentions=False):
self_outputs = self.self(hidden_states, height, width, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class GLPNDWConv(nn.Module):
def __init__(self, dim=768):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
def forward(self, hidden_states, height, width):
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
hidden_states = self.dwconv(hidden_states)
hidden_states = hidden_states.flatten(2).transpose(1, 2)
return hidden_states
class GLPNMixFFN(nn.Module):
def __init__(self, config, in_features, hidden_features=None, out_features=None):
super().__init__()
out_features = out_features or in_features
self.dense1 = nn.Linear(in_features, hidden_features)
self.dwconv = GLPNDWConv(hidden_features)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dense2 = nn.Linear(hidden_features, out_features)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, height, width):
hidden_states = self.dense1(hidden_states)
hidden_states = self.dwconv(hidden_states, height, width)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense2(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class GLPNLayer(nn.Module):
"""这对应于原始实现中的 Block 类。"""
def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
super().__init__()
self.layer_norm_1 = nn.LayerNorm(hidden_size)
self.attention = GLPNAttention(
config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
)
self.drop_path = GLPNDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.layer_norm_2 = nn.LayerNorm(hidden_size)
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = GLPNMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
def forward(self, hidden_states, height, width, output_attentions=False):
self_attention_outputs = self.attention(
self.layer_norm_1(hidden_states),
height,
width,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
mlp_output = self.drop_path(mlp_output)
layer_output = mlp_output + hidden_states
outputs = (layer_output,) + outputs
return outputs
class GLPNEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
embeddings = []
for i in range(config.num_encoder_blocks):
embeddings.append(
GLPNOverlapPatchEmbeddings(
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
)
)
self.patch_embeddings = nn.ModuleList(embeddings)
blocks = []
cur = 0
for i in range(config.num_encoder_blocks):
layers = []
if i != 0:
cur += config.depths[i - 1]
for j in range(config.depths[i]):
layers.append(
GLPNLayer(
config,
hidden_size=config.hidden_sizes[i],
num_attention_heads=config.num_attention_heads[i],
drop_path=dpr[cur + j],
sequence_reduction_ratio=config.sr_ratios[i],
mlp_ratio=config.mlp_ratios[i],
)
)
blocks.append(nn.ModuleList(layers))
self.block = nn.ModuleList(blocks)
self.layer_norm = nn.ModuleList(
[nn.LayerNorm(config.hidden_sizes[i]) for i in range(config.num_encoder_blocks)]
)
def forward(
self,
pixel_values,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = pixel_values.shape[0]
hidden_states = pixel_values
for idx, x in enumerate(zip(self.patch_embeddings, self.block, self.layer_norm)):
embedding_layer, block_layer, norm_layer = x
hidden_states, height, width = embedding_layer(hidden_states)
for i, blk in enumerate(block_layer):
layer_outputs = blk(hidden_states, height, width, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
hidden_states = norm_layer(hidden_states)
hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
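# Resolution sketch, assuming the default GLPN strides (4, 2, 2, 2): each stage downsamples further,
# so a 480x640 input yields roughly these stage outputs (illustrative only, not part of the file):
height, width = 480, 640
print(height // 4, width // 4)    # stage 1: 120 x 160
print(height // 8, width // 8)    # stage 2: 60 x 80
print(height // 16, width // 16)  # stage 3: 30 x 40
print(height // 32, width // 32)  # stage 4: 15 x 20, consistent with _EXPECTED_OUTPUT_SHAPE above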
class GLPNPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GLPNConfig
base_model_prefix = "glpn"
main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
GLPN_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`GLPNConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GLPN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`GLPNImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare GLPN encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
GLPN_START_DOCSTRING,
)
class GLPNModel(GLPNPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = GLPNEncoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class GLPNSelectiveFeatureFusion(nn.Module):
"""
Selective Feature Fusion module, as explained in the [paper](https://arxiv.org/abs/2201.07436) (section 3.4). This
module adaptively selects and integrates local and global features by attaining an attention map for each feature.
"""
def __init__(self, in_channel=64):
super().__init__()
self.convolutional_layer1 = nn.Sequential(
nn.Conv2d(in_channels=int(in_channel * 2), out_channels=in_channel, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(in_channel),
nn.ReLU(),
)
self.convolutional_layer2 = nn.Sequential(
nn.Conv2d(in_channels=in_channel, out_channels=int(in_channel / 2), kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(int(in_channel / 2)),
nn.ReLU(),
)
self.convolutional_layer3 = nn.Conv2d(
in_channels=int(in_channel / 2), out_channels=2, kernel_size=3, stride=1, padding=1
)
self.sigmoid = nn.Sigmoid()
def forward(self, local_features, global_features):
features = torch.cat((local_features, global_features), dim=1)
features = self.convolutional_layer1(features)
features = self.convolutional_layer2(features)
features = self.convolutional_layer3(features)
attn = self.sigmoid(features)
hybrid_features = local_features * attn[:, 0, :, :].unsqueeze(1) + global_features * attn[
:, 1, :, :
].unsqueeze(1)
return hybrid_features
class GLPNDecoderStage(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
should_skip = in_channels == out_channels
self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1) if not should_skip else nn.Identity()
self.fusion = GLPNSelectiveFeatureFusion(out_channels)
self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
def forward(self, hidden_state, residual=None):
hidden_state = self.convolution(hidden_state)
if residual is not None:
hidden_state = self.fusion(hidden_state, residual)
hidden_state = self.upsample(hidden_state)
return hidden_state
class GLPNDecoder(nn.Module):
def __init__(self, config):
super().__init__()
reserved_hidden_sizes = config.hidden_sizes[::-1]
out_channels = config.decoder_hidden_size
self.stages = nn.ModuleList(
[GLPNDecoderStage(hidden_size, out_channels) for hidden_size in reserved_hidden_sizes]
)
self.stages[0].fusion = None
self.final_upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]:
stage_hidden_states = []
stage_hidden_state = None
for hidden_state, stage in zip(hidden_states[::-1], self.stages):
stage_hidden_state = stage(hidden_state, stage_hidden_state)
stage_hidden_states.append(stage_hidden_state)
stage_hidden_states[-1] = self.final_upsample(stage_hidden_state)
return stage_hidden_states
class SiLogLoss(nn.Module):
r"""
Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://arxiv.org/abs/1406.2283).
$$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}\right)^{2}$$ where $d_{i}=\log y_{i}-\log
y_{i}^{*}$.
"""
def __init__(self, lambd=0.5):
super().__init__()
self.lambd = lambd
def forward(self, pred, target):
valid_mask = (target > 0).detach()
diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask])
loss = torch.sqrt(torch.pow(diff_log, 2).mean() - self.lambd * torch.pow(diff_log.mean(), 2))
return loss
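# Tiny usage sketch (illustrative values, not part of the file): a perfect prediction yields zero
# loss, since every log-difference d_i is zero; lambd=0.5 corresponds to the 1/(2n^2) term above.
criterion = SiLogLoss()
target = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
print(criterion(target.clone(), target))  # tensor(0.)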
class GLPNDepthEstimationHead(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
channels = config.decoder_hidden_size
self.head = nn.Sequential(
nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1),
nn.ReLU(inplace=False),
nn.Conv2d(channels, 1, kernel_size=3, stride=1, padding=1),
)
def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor:
hidden_states = hidden_states[self.config.head_in_index]
hidden_states = self.head(hidden_states)
predicted_depth = torch.sigmoid(hidden_states) * self.config.max_depth
predicted_depth = predicted_depth.squeeze(dim=1)
return predicted_depth
@add_start_docstrings(
"""GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.""",
GLPN_START_DOCSTRING,
)
class GLPNForDepthEstimation(GLPNPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.glpn = GLPNModel(config)
self.decoder = GLPNDecoder(config)
self.head = GLPNDepthEstimationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(GLPN_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
labels: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs
) -> Union[Tuple[torch.Tensor], DepthEstimatorOutput]:
r"""
labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
Ground truth depth estimation maps for computing the loss.
Returns:
Depending on the configuration and inputs, returns either a tuple with loss and predicted depth,
or a `DepthEstimatorOutput` object containing loss, predicted depth, hidden states, and attentions.
Examples:
```
>>> from transformers import AutoImageProcessor, GLPNForDepthEstimation
>>> import torch
>>> import numpy as np
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
>>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")
>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
... outputs = model(**inputs)
... predicted_depth = outputs.predicted_depth
>>> # interpolate to original size
>>> prediction = torch.nn.functional.interpolate(
... predicted_depth.unsqueeze(1),
... size=image.size[::-1],
... mode="bicubic",
... align_corners=False,
... )
>>> # visualize the prediction
>>> output = prediction.squeeze().cpu().numpy()
>>> formatted = (output * 255 / np.max(output)).astype("uint8")
>>> depth = Image.fromarray(formatted)
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
outputs = self.glpn(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=True,
return_dict=return_dict,
)
hidden_states = outputs.hidden_states if return_dict else outputs[1]
out = self.decoder(hidden_states)
predicted_depth = self.head(out)
loss = None
if labels is not None:
loss_fct = SiLogLoss()
loss = loss_fct(predicted_depth, labels)
if not return_dict:
if output_hidden_states:
output = (predicted_depth,) + outputs[1:]
else:
output = (predicted_depth,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return DepthEstimatorOutput(
loss=loss,
predicted_depth=predicted_depth,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)
.\models\glpn\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_glpn": ["GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP", "GLPNConfig"]}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_glpn"] = ["GLPNFeatureExtractor"]
_import_structure["image_processing_glpn"] = ["GLPNImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_glpn"] = [
"GLPN_PRETRAINED_MODEL_ARCHIVE_LIST",
"GLPNForDepthEstimation",
"GLPNLayer",
"GLPNModel",
"GLPNPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_glpn import GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP, GLPNConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_glpn import GLPNFeatureExtractor
from .image_processing_glpn import GLPNImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_glpn import (
GLPN_PRETRAINED_MODEL_ARCHIVE_LIST,
GLPNForDepthEstimation,
GLPNLayer,
GLPNModel,
GLPNPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\gpt2\configuration_gpt2.py
""" OpenAI GPT-2 configuration"""
from collections import OrderedDict
from typing import Any, List, Mapping, Optional
from ... import PreTrainedTokenizer, TensorType, is_torch_available
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfigWithPast, PatchingSpec
from ...utils import logging
logger = logging.get_logger(__name__)
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"openai-community/gpt2": "https://huggingface.co/openai-community/gpt2/resolve/main/config.json",
"openai-community/gpt2-medium": "https://huggingface.co/openai-community/gpt2-medium/resolve/main/config.json",
"openai-community/gpt2-large": "https://huggingface.co/openai-community/gpt2-large/resolve/main/config.json",
"openai-community/gpt2-xl": "https://huggingface.co/openai-community/gpt2-xl/resolve/main/config.json",
"distilbert/distilgpt2": "https://huggingface.co/distilbert/distilgpt2/resolve/main/config.json",
}
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the GPT-2
[openai-community/gpt2](https://huggingface.co/openai-community/gpt2) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import GPT2Config, GPT2Model
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
>>> # Initializing a model (with random weights) from the configuration
>>> model = GPT2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "gpt2"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "n_embd",
"max_position_embeddings": "n_positions",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=50257,
n_positions=1024,
n_embd=768,
n_layer=12,
n_head=12,
n_inner=None,
activation_function="gelu_new",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
summary_type="cls_index",
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
scale_attn_by_inverse_layer_idx=False,
reorder_and_upcast_attn=False,
**kwargs,
):
self.vocab_size = vocab_size
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_inner = n_inner
self.activation_function = activation_function
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
self.reorder_and_upcast_attn = reorder_and_upcast_attn
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
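# Quick check (illustrative, not part of the file) of the attribute_map defined above: the generic
# PretrainedConfig attribute names resolve to the GPT-2 specific ones.
config = GPT2Config(n_embd=768, n_layer=12, n_head=12)
assert config.hidden_size == config.n_embd == 768
assert config.num_hidden_layers == config.n_layer == 12
assert config.num_attention_heads == config.n_head == 12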
class GPT2OnnxConfig(OnnxConfigWithPast):
def __init__(
self,
config: PretrainedConfig,
task: str = "default",
patching_specs: List[PatchingSpec] = None,
use_past: bool = False,
):
super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
if not getattr(self._config, "pad_token_id", None):
self._config.pad_token_id = 0
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
else:
common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
return common_inputs
@property
def num_layers(self) -> int:
return self._config.n_layer
@property
def num_attention_heads(self) -> int:
return self._config.n_head
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
past_shape = (
batch,
self.num_attention_heads,
past_key_values_length,
self._config.hidden_size // self.num_attention_heads,
)
ordered_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
]
ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
if self.use_past:
mask_dtype = ordered_inputs["attention_mask"].dtype
ordered_inputs["attention_mask"] = torch.cat(
[ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
return ordered_inputs
@property
def default_onnx_opset(self) -> int:
return 13
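# Illustration (not part of the file, no actual ONNX export performed): with use_past=True the
# dynamic axes declared by `inputs` cover both the cached and the newly fed tokens.
config = GPT2Config()
onnx_config = GPT2OnnxConfig(config, use_past=True)
print(onnx_config.inputs["attention_mask"])  # {0: 'batch', 1: 'past_sequence + sequence'}
print(onnx_config.num_layers, onnx_config.num_attention_heads)  # 12 12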
.\models\gpt2\convert_gpt2_original_tf_checkpoint_to_pytorch.py
"""Convert OpenAI GPT checkpoint."""
import argparse
import torch
from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2
from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
logging.set_verbosity_info()
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
if gpt2_config_file == "":
config = GPT2Config()
else:
config = GPT2Config.from_json_file(gpt2_config_file)
model = GPT2Model(config)
load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
print(f"Save PyTorch model to {pytorch_weights_dump_path}")
torch.save(model.state_dict(), pytorch_weights_dump_path)
print(f"Save configuration file to {pytorch_config_dump_path}")
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--gpt2_config_file",
default="",
type=str,
help=(
"An optional config json file corresponding to the pre-trained OpenAI model. \n"
"This specifies the model architecture."
),
)
args = parser.parse_args()
convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path)
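# Hypothetical invocation (paths are placeholders):
#   python convert_gpt2_original_tf_checkpoint_to_pytorch.py \
#       --gpt2_checkpoint_path /path/to/tf_checkpoint \
#       --pytorch_dump_folder_path /path/to/output_dir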
.\models\gpt2\modeling_flax_gpt2.py
from typing import Any, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import (
FlaxBaseModelOutputWithPastAndCrossAttentions,
FlaxCausalLMOutputWithCrossAttentions,
)
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_gpt2 import GPT2Config
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
_CONFIG_FOR_DOC = "GPT2Config"
GPT2_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
inference on GPUs or TPUs. If specified, all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of the model
parameters.**
If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
[`~FlaxPreTrainedModel.to_bf16`].
"""
class FlaxConv1D(nn.Module):
# GPT-2 style 1D convolution: effectively a dense layer whose kernel is stored transposed.
features: int
use_bias: bool = True
dtype: Any = jnp.float32
precision: Any = None
@nn.compact
def __call__(self, inputs):
# Cast the inputs to the compute dtype
inputs = jnp.asarray(inputs, self.dtype)
# Initialize the weight matrix with shape (features, input_shape[-1])
kernel = self.param("kernel", jax.nn.initializers.normal(stddev=0.02), (self.features, inputs.shape[-1]))
kernel = jnp.asarray(kernel.transpose(), self.dtype)
# Multiply the inputs with the (transposed) kernel via dot_general
y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())), precision=self.precision)
# Optionally add a bias term of shape (features,)
if self.use_bias:
bias = self.param("bias", jax.nn.initializers.zeros, (self.features,))
bias = jnp.asarray(bias, self.dtype)
y = y + bias
# Return the projected output
return y
class FlaxGPT2Attention(nn.Module):
# GPT-2 attention module: handles (causal) self-attention and, optionally, cross-attention.
# Model configuration
config: GPT2Config
# Computation dtype, defaults to float32
dtype: jnp.dtype = jnp.float32
# Whether to apply a causal (autoregressive) mask, defaults to True
causal: bool = True
# Whether this layer is used as cross-attention, defaults to False
is_cross_attention: bool = False
def setup(self):
# Read parameters from the configuration
config = self.config
# Embedding dimension equals the hidden size
self.embed_dim = config.hidden_size
# Number of attention heads from the configuration
self.num_heads = config.num_attention_heads
# Dimension of each attention head
self.head_dim = self.embed_dim // self.num_heads
if self.is_cross_attention:
# Cross-attention: c_attn produces key/value (2x embed_dim), q_attn produces the query
self.c_attn = FlaxConv1D(2 * self.embed_dim, dtype=self.dtype)
self.q_attn = FlaxConv1D(self.embed_dim, dtype=self.dtype)
else:
# Self-attention: a single projection produces query, key and value (3x embed_dim)
self.c_attn = FlaxConv1D(3 * self.embed_dim, dtype=self.dtype)
# Output projection back to embed_dim
self.c_proj = FlaxConv1D(self.embed_dim, dtype=self.dtype)
# Dropout applied to the residual branch, with the configured residual dropout probability
self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
if self.causal:
# Boolean causal mask of shape (1, max_position_embeddings)
self.causal_mask = make_causal_mask(
jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool"
)
# Split the last dimension of the hidden states into (num_heads, head_dim)
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
# Merge the per-head dimensions back into a single embed_dim axis
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
# nn.compact lets this method define its own cache variables inline
@nn.compact
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# Detect whether the cache has already been initialized by checking for the "cached_key" variable
is_initialized = self.has_variable("cache", "cached_key")
# Create (or fetch) "cached_key", initialized to zeros with the same shape/dtype as key
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
# Create (or fetch) "cached_value", initialized to zeros with the same shape/dtype as value
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# Create (or fetch) "cache_index", initialized to the integer 0
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# Read batch dims, max sequence length, number of heads and per-head depth from the existing cache
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# Update the key and value caches with the new 1d spatial slice at the current index
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# Write the updated key and value back to the cache
cached_key.value = key
cached_value.value = value
# Advance the cache index by the number of cache vectors that were just written
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# Causal mask for cached decoder self-attention: the current query position may only attend to
# key positions that have already been generated and cached, not to the remaining zero entries.
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# Combine the causal pad mask with the incoming attention mask
attention_mask = combine_masks(pad_mask, attention_mask)
# Return the updated key, value and attention mask
return key, value, attention_mask
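# Standalone sketch of the cache update above (hypothetical shapes, not part of the file): a single
# freshly projected key is written into a pre-allocated cache at the current index.
cached = jnp.zeros((1, 8, 2, 4))  # (batch, max_length, num_heads, head_dim)
new_key = jnp.ones((1, 1, 2, 4))  # one new token
updated = lax.dynamic_update_slice(cached, new_key, (0, 3, 0, 0))  # written at position 3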
# GPT-2 feed-forward (MLP) block
class FlaxGPT2MLP(nn.Module):
# Model configuration
config: GPT2Config
# Size of the intermediate (hidden) layer
intermediate_size: int
# Computation dtype, defaults to float32
dtype: jnp.dtype = jnp.float32
def setup(self):
# The embedding dimension equals the configured hidden size
embed_dim = self.config.hidden_size
# Up-projection to intermediate_size
self.c_fc = FlaxConv1D(self.intermediate_size, dtype=self.dtype)
# Down-projection back to embed_dim
self.c_proj = FlaxConv1D(embed_dim, dtype=self.dtype)
# Activation function selected from the configuration
self.act = ACT2FN[self.config.activation_function]
# Dropout with the configured residual dropout rate
self.dropout = nn.Dropout(rate=self.config.resid_pdrop)
def __call__(self, hidden_states, deterministic: bool = True):
# Up-project, activate, down-project, then apply dropout
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
return hidden_states
# A single GPT-2 transformer block
class FlaxGPT2Block(nn.Module):
# Model configuration
config: GPT2Config
# Computation dtype, defaults to float32
dtype: jnp.dtype = jnp.float32
def setup(self):
# Hidden size from the configuration
hidden_size = self.config.hidden_size
# Inner (MLP) dimension: n_inner if set, otherwise 4 * hidden_size
inner_dim = self.config.n_inner if self.config.n_inner is not None else 4 * hidden_size
# First LayerNorm, using the configured layer-norm epsilon
self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# (Causal) self-attention layer
self.attn = FlaxGPT2Attention(self.config, dtype=self.dtype)
# Second LayerNorm, applied before the MLP
self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
if self.config.add_cross_attention:
# Optional cross-attention layer: non-causal and marked as cross-attention
self.crossattention = FlaxGPT2Attention(
config=self.config, dtype=self.dtype, causal=False, is_cross_attention=True
)
# LayerNorm applied before the cross-attention layer
self.ln_cross_attn = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# Feed-forward block with the inner dimension computed above
self.mlp = FlaxGPT2MLP(self.config, inner_dim, dtype=self.dtype)
# Forward pass: self-attention, optional cross-attention, then the MLP, each with a residual connection
def __call__(
self,
hidden_states,
attention_mask=None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
):
residual = hidden_states  # keep the current hidden states as the start of the residual connection
hidden_states = self.ln_1(hidden_states)  # LayerNorm before self-attention
attn_outputs = self.attn(
hidden_states,
attention_mask=attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# Unpack the self-attention outputs
attn_output = attn_outputs[0]  # attention output, first element
outputs = attn_outputs[1:]  # remaining outputs, e.g. attention weights
# Residual connection
hidden_states = attn_output + residual
# Cross-Attention Block
if encoder_hidden_states is not None:
# Add the cross-attention block
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
"cross-attention layers by setting `config.add_cross_attention=True`"
)
residual = hidden_states  # keep the current hidden states for the residual connection
hidden_states = self.ln_cross_attn(hidden_states)  # LayerNorm before cross-attention
cross_attn_outputs = self.crossattention(
hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
)
# Unpack the cross-attention output
attn_output = cross_attn_outputs[0]
# Residual connection
hidden_states = residual + attn_output
# Append the cross-attention weights to the outputs (if attentions are returned)
outputs = outputs + cross_attn_outputs[1:]
residual = hidden_states  # keep the current hidden states for the residual connection
hidden_states = self.ln_2(hidden_states)  # LayerNorm before the MLP
feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
# Residual connection
hidden_states = residual + feed_forward_hidden_states
outputs = (hidden_states,) + outputs  # final hidden states plus any extra outputs
return outputs
# Abstract class, inheriting from FlaxPreTrainedModel, that handles weight initialization and provides
# the interface for downloading and loading pretrained models
class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
# Configuration class for this model
config_class = GPT2Config
# Prefix of the base model
base_model_prefix = "transformer"
# Concrete module class, set by subclasses
module_class: nn.Module = None
def __init__(
self,
config: GPT2Config,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# Build the module with the given configuration and keyword arguments
module = self.module_class(config=config, dtype=dtype, **kwargs)
# Call the parent initializer with the config, module, input shape, seed, dtype and init flag
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# Initialize dummy input tensors
input_ids = jnp.zeros(input_shape, dtype="i4")
# Attention mask of ones with the same shape as input_ids
attention_mask = jnp.ones_like(input_ids)
# Position ids broadcast from the sequence length of input_ids
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
# Split the rng into one stream for parameters and one for dropout
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
if self.config.add_cross_attention:
# With cross-attention enabled, initialize encoder hidden states as zeros and reuse the attention mask
encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
encoder_attention_mask = attention_mask
# Initialize the module parameters with the encoder inputs included
module_init_outputs = self.module.init(
rngs,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states,
encoder_attention_mask,
return_dict=False,
)
else:
# Otherwise initialize the parameters with input_ids, attention_mask and position_ids only
module_init_outputs = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)
# Randomly initialized parameters
random_params = module_init_outputs["params"]
if params is not None:
# If pretrained params were provided, flatten/unfreeze both dicts and fill in any missing keys
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
# Return the frozen, unflattened parameter dict
return freeze(unflatten_dict(params))
else:
# Otherwise return the randomly initialized parameters
return random_params
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
"""
# Build dummy input variables used only to retrieve the cache
# input_ids: a (batch_size, max_length) matrix of ones
input_ids = jnp.ones((batch_size, max_length))
# Attention mask of ones with the same shape as input_ids
attention_mask = jnp.ones_like(input_ids)
# Position ids broadcast from the sequence length, shaped like input_ids
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# Initialize the module variables with init_cache=True so the cache collection is created
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
# Return the unfrozen cache collection
return unfreeze(init_variables["cache"])
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
def __call__(
self,
input_ids,
attention_mask=None,
position_ids=None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
params: dict = None,
past_key_values: dict = None,
dropout_rng: jax.random.PRNGKey = None,
train: bool = False,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# 如果未显式提供 `output_attentions`,则使用配置中的设定
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果未显式提供 `output_hidden_states`,则使用配置中的设定
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未显式提供 `return_dict`,则使用配置中的设定
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 如果提供了 `encoder_hidden_states` 但未提供 `encoder_attention_mask`,则创建一个全为1的注意力掩码
if encoder_hidden_states is not None and encoder_attention_mask is None:
batch_size, sequence_length = encoder_hidden_states.shape[:2]
encoder_attention_mask = jnp.ones((batch_size, sequence_length))
# 获取输入张量的批量大小和序列长度
batch_size, sequence_length = input_ids.shape
# 如果未提供 `position_ids`
if position_ids is None:
# 如果提供了 `past_key_values` 但未提供 `position_ids`,则引发错误
if past_key_values is not None:
raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
# 使用序列长度创建广播后的位置编码
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 如果未提供 `attention_mask`,则创建一个全为1的注意力掩码
if attention_mask is None:
attention_mask = jnp.ones((batch_size, sequence_length))
# 处理任何需要的伪随机数生成器
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
# 准备输入参数字典
inputs = {"params": params or self.params}
# 如果提供了 `past_key_values`,则将其作为缓存传递给模块,同时确保缓存是可变的
if past_key_values:
inputs["cache"] = past_key_values
mutable = ["cache"]
else:
mutable = False
# 应用模块的前向传播函数,计算输出
outputs = self.module.apply(
inputs,
jnp.array(input_ids, dtype="i4"),
jnp.array(attention_mask, dtype="i4"),
jnp.array(position_ids, dtype="i4"),
encoder_hidden_states,
encoder_attention_mask,
not train,
False,
output_attentions,
output_hidden_states,
return_dict,
rngs=rngs,
mutable=mutable,
)
# When `past_key_values` is given and `return_dict` is set, attach the updated cache to the model output
if past_key_values is not None and return_dict:
outputs, past_key_values = outputs
outputs["past_key_values"] = unfreeze(past_key_values["cache"])
return outputs
# When `past_key_values` is given but `return_dict` is not set, splice the updated cache into the output tuple
elif past_key_values is not None and not return_dict:
outputs, past_key_values = outputs
outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
# Return the model outputs
return outputs
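The `mutable=["cache"]` handling above relies on a general Flax mechanism: when a variable collection is declared mutable, `Module.apply` returns the updated collection alongside the outputs. A toy, self-contained illustration of that mechanism (not GPT-2 code; the module and names are made up for the example):

```python
import jax
import jax.numpy as jnp
import flax.linen as nn

class Counter(nn.Module):
    @nn.compact
    def __call__(self, x):
        # A variable in the "cache" collection, like the attention key/value cache.
        count = self.variable("cache", "count", lambda: jnp.zeros((), jnp.int32))
        count.value = count.value + 1
        return x

variables = Counter().init(jax.random.PRNGKey(0), jnp.ones(3))
# With mutable=["cache"], apply() returns (outputs, updated_cache_collection).
y, updated = Counter().apply(variables, jnp.ones(3), mutable=["cache"])
print(updated["cache"]["count"])  # the cache collection comes back updated after every apply
```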
class FlaxGPT2BlockCollection(nn.Module):
config: GPT2Config
dtype: jnp.dtype = jnp.float32
# Set up the module: build the collection of hidden-layer blocks
def setup(self):
# Create one FlaxGPT2Block per layer, according to the configured number of hidden layers
self.blocks = [
FlaxGPT2Block(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
]
# Run the blocks over the input hidden states and return the outputs
def __call__(
self,
hidden_states,
attention_mask=None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Initialize empty tuples to collect attentions, hidden states and cross-attentions when requested
all_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
# Iterate over the blocks and run the hidden states through each of them
for block in self.blocks:
# If hidden states are requested, record the current hidden states first
if output_hidden_states:
all_hidden_states += (hidden_states,)
# Call the current block and collect its outputs
layer_outputs = block(
hidden_states,
attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# The first element of the block output becomes the new hidden states
hidden_states = layer_outputs[0]
# If attentions are requested, record the block's attention weights
if output_attentions:
all_attentions += (layer_outputs[1],)
# With encoder hidden states present, also record the block's cross-attentions
if encoder_hidden_states is not None:
all_cross_attentions += (layer_outputs[2],)
# Return the hidden states together with all hidden states, attentions and cross-attentions
# Note: this tuple may contain `None` values; `FlaxGPT2Module` will filter them out
outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
return outputs
class FlaxGPT2Module(nn.Module):
config: GPT2Config
dtype: jnp.dtype = jnp.float32
# Set up the module: token embeddings, position embeddings, dropout, the block collection and the final layer norm
def setup(self):
# The embedding dimension equals the hidden size
self.embed_dim = self.config.hidden_size
# Token embedding and position embedding layers
self.wte = nn.Embed(
self.config.vocab_size,
self.embed_dim,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
self.wpe = nn.Embed(
self.config.max_position_embeddings,
self.embed_dim,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# Embedding dropout
self.dropout = nn.Dropout(rate=self.config.embd_pdrop)
# Collection of transformer blocks
self.h = FlaxGPT2BlockCollection(self.config, dtype=self.dtype)
# Final layer normalization
self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# The call method takes the following arguments:
# - input_ids: sequence of input token ids
# - attention_mask: attention mask
# - position_ids: position ids
# - encoder_hidden_states: hidden states of an encoder (optional)
# - encoder_attention_mask: attention mask of the encoder (optional)
# - deterministic: whether to run deterministically (no dropout), defaults to True
# - init_cache: whether to initialize the cache, defaults to False
# - output_attentions: whether to return attention weights, defaults to False
# - output_hidden_states: whether to return all hidden states, defaults to False
# - return_dict: whether to return the result as a dict, defaults to True
def __call__(
self,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic=True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Look up the token embeddings for the input ids
input_embeds = self.wte(input_ids.astype("i4"))
# Look up the position embeddings for the position ids
position_embeds = self.wpe(position_ids.astype("i4"))
# The initial hidden states are the sum of token and position embeddings
hidden_states = input_embeds + position_embeds
# Apply dropout to the hidden states; `deterministic` disables it at inference time
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# Run the hidden states through the block collection `h`
outputs = self.h(
hidden_states,
attention_mask,
encoder_hidden_states,
encoder_attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the new hidden states from the block outputs
hidden_states = outputs[0]
# Apply the final LayerNorm
hidden_states = self.ln_f(hidden_states)
# If all hidden states are requested, append the final hidden states to the collected ones
if output_hidden_states:
all_hidden_states = outputs[1] + (hidden_states,)
outputs = (hidden_states, all_hidden_states) + outputs[2:]
else:
outputs = (hidden_states,) + outputs[1:]
# Without `return_dict`, return all non-None results as a tuple
if not return_dict:
return tuple(v for v in outputs if v is not None)
# Otherwise return a structured output with the last hidden state, all hidden states, attentions and cross-attentions
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=outputs[1],
attentions=outputs[2],
cross_attentions=outputs[3],
)
# Decorator that adds a docstring describing the bare GPT2 transformer outputting raw hidden states without any specific head on top
@add_start_docstrings(
"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
)
# FlaxGPT2Model inherits from FlaxGPT2PreTrainedModel
class FlaxGPT2Model(FlaxGPT2PreTrainedModel):
# The module class is FlaxGPT2Module
module_class = FlaxGPT2Module
# Attach an example call docstring to FlaxGPT2Model
append_call_sample_docstring(
FlaxGPT2Model,
_CHECKPOINT_FOR_DOC, # checkpoint used in the example docstring
FlaxBaseModelOutputWithPastAndCrossAttentions, # output class shown in the example
_CONFIG_FOR_DOC, # config referenced in the example docstring
)
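A short usage sketch for the bare model (the public GPT-2 checkpoint is used only as an example):

```python
from transformers import AutoTokenizer, FlaxGPT2Model

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = FlaxGPT2Model.from_pretrained("openai-community/gpt2")

inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, hidden_size)
```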
# FlaxGPT2LMHeadModule, a plain nn.Module
class FlaxGPT2LMHeadModule(nn.Module):
config: GPT2Config # GPT2Config configuration object
dtype: jnp.dtype = jnp.float32 # computation dtype, defaults to float32
def setup(self):
# The transformer backbone is a FlaxGPT2Module
self.transformer = FlaxGPT2Module(self.config, dtype=self.dtype)
# The lm_head is a Dense layer projecting to the vocabulary size, without bias, using the configured dtype and initializer range
self.lm_head = nn.Dense(
self.config.vocab_size,
use_bias=False,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
def __call__(
self,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Run the transformer backbone and get its outputs
outputs = self.transformer(
input_ids,
attention_mask,
position_ids,
encoder_hidden_states,
encoder_attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0] # last hidden states of the transformer
if self.config.tie_word_embeddings:
# When word embeddings are tied, reuse the transposed token-embedding matrix from the transformer's parameters
shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
# Apply the lm_head with the shared kernel
lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
else:
# Otherwise use the lm_head's own parameters
lm_logits = self.lm_head(hidden_states)
if not return_dict:
# Without `return_dict`, return a plain tuple
return (lm_logits,) + outputs[1:]
# Return a FlaxCausalLMOutputWithCrossAttentions with the logits, hidden states, attentions and cross-attentions
return FlaxCausalLMOutputWithCrossAttentions(
logits=lm_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
# Decorator that adds a docstring describing the GPT2 transformer with a language modeling head on top (linear layer with weights tied to the input embeddings)
@add_start_docstrings(
"""
The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT2_START_DOCSTRING,
)
# 定义 FlaxGPT2LMHeadModel 类,继承自 FlaxGPT2PreTrainedModel
class FlaxGPT2LMHeadModel(FlaxGPT2PreTrainedModel):
module_class = FlaxGPT2LMHeadModule # the module class is FlaxGPT2LMHeadModule
# Prepare the inputs used during generation: initialize the cache and build the extended attention mask and position ids
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# Read the batch size and sequence length, then initialize the cache
batch_size, seq_length = input_ids.shape
# Create the key/value cache via `init_cache`
past_key_values = self.init_cache(batch_size, max_length)
# Build an extended attention mask of length `max_length`, initialized to all ones
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
# If an attention mask was provided, use it
if attention_mask is not None:
# Derive the position ids from the cumulative sum of the mask, minus one
position_ids = attention_mask.cumsum(axis=-1) - 1
# Copy the provided mask into the extended mask with a dynamic update slice
extended_attention_mask = lax.dynamic_update_slice(
extended_attention_mask, attention_mask.astype("i4"), (0, 0)
)
else:
# Without an attention mask, simply broadcast position ids over the input sequence length
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
# Return the prepared inputs: cache, extended attention mask and position ids
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
# Update the generation inputs after each step: refresh the cache and advance the position ids
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# Store the updated cache and shift the position ids forward by one
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
# Return the updated keyword arguments
return model_kwargs
# Attach an example call docstring to the class
append_call_sample_docstring(
# target class
FlaxGPT2LMHeadModel,
# checkpoint used in the example docstring
_CHECKPOINT_FOR_DOC,
# causal LM output class (with cross-attentions) shown in the example
FlaxCausalLMOutputWithCrossAttentions,
# config referenced in the example docstring
_CONFIG_FOR_DOC,
)
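Putting the pieces together, `generate()` is what drives `prepare_inputs_for_generation` and `update_inputs_for_generation` above. A greedy-decoding sketch (checkpoint and prompt are illustrative):

```python
from transformers import AutoTokenizer, FlaxGPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = FlaxGPT2LMHeadModel.from_pretrained("openai-community/gpt2")

inputs = tokenizer("Once upon a time", return_tensors="np")
out = model.generate(
    inputs["input_ids"], max_length=20, do_sample=False, pad_token_id=tokenizer.eos_token_id
)
print(tokenizer.decode(out.sequences[0], skip_special_tokens=True))
```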
.\models\gpt2\modeling_gpt2.py
"""PyTorch OpenAI GPT-2 model."""
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.cuda.amp import autocast
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.model_parallel_utils import assert_device_map, get_device_map
from .configuration_gpt2 import GPT2Config
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
_CONFIG_FOR_DOC = "GPT2Config"
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-community/gpt2",
"openai-community/gpt2-medium",
"openai-community/gpt2-large",
"openai-community/gpt2-xl",
"distilbert/distilgpt2",
]
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
"""Load tf checkpoints in a pytorch model"""
try:
import re
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(gpt2_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array.squeeze())
for name, array in zip(names, arrays):
name = name[6:]
name = name.split("/")
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+\d+", m_name):
scope_names = re.split(r"(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "w" or scope_names[0] == "g":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "b":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "wpe" or scope_names[0] == "wte":
pointer = getattr(pointer, scope_names[0])
pointer = getattr(pointer, "weight")
else:
pointer = getattr(pointer, scope_names[0])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except ValueError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
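A hypothetical usage sketch for this converter; the checkpoint path below is a placeholder, and in practice the official conversion script wraps this call:

```python
from transformers import GPT2Config, GPT2Model
from transformers.models.gpt2.modeling_gpt2 import load_tf_weights_in_gpt2

config = GPT2Config()
model = GPT2Model(config)
# "/path/to/gpt2/tf_checkpoint" is a placeholder for an original TF checkpoint directory.
model = load_tf_weights_in_gpt2(model, config, "/path/to/gpt2/tf_checkpoint")
```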
class GPT2Attention(nn.Module):
def __init__(self, config, is_cross_attention=False, layer_idx=None):
super().__init__()
max_positions = config.max_position_embeddings
self.register_buffer(
"bias",
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
persistent=False,
)
self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
self.split_size = self.embed_dim
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale_attn_weights = config.scale_attn_weights
self.is_cross_attention = is_cross_attention
self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
self.layer_idx = layer_idx
self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
if self.is_cross_attention:
self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
else:
self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
self.num_heads = self.num_heads - len(heads)
self.pruned_heads = self.pruned_heads.union(heads)
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
attn_weights = torch.matmul(query, key.transpose(-1, -2))
if self.scale_attn_weights:
attn_weights = attn_weights / torch.full(
[], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
)
if self.scale_attn_by_inverse_layer_idx:
attn_weights = attn_weights / float(self.layer_idx + 1)
if not self.is_cross_attention:
query_length, key_length = query.size(-2), key.size(-2)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
mask_value = torch.finfo(attn_weights.dtype).min
mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
attn_weights = attn_weights.type(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
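The core of `_attn` is a scaled dot-product with a causal mask that sets future positions to the dtype minimum before the softmax. A stand-alone numeric illustration with toy shapes (not the module itself):

```python
import torch

torch.manual_seed(0)
q = torch.randn(1, 1, 4, 8)  # (batch, heads, query_len, head_dim)
k = torch.randn(1, 1, 4, 8)
v = torch.randn(1, 1, 4, 8)

w = torch.matmul(q, k.transpose(-1, -2)) / (v.size(-1) ** 0.5)
causal = torch.tril(torch.ones(4, 4, dtype=torch.bool))
w = w.masked_fill(~causal, torch.finfo(w.dtype).min)
probs = w.softmax(dim=-1)
print(probs[0, 0])  # upper triangle is ~0: a token never attends to future positions
out = probs @ v
```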
def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
bsz, num_heads, q_seq_len, dk = query.size()
_, _, k_seq_len, _ = key.size()
attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
scale_factor = 1.0
if self.scale_attn_weights:
scale_factor /= float(value.size(-1)) ** 0.5
if self.scale_attn_by_inverse_layer_idx:
scale_factor /= float(self.layer_idx + 1)
with autocast(enabled=False):
q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
if not self.is_cross_attention:
query_length, key_length = query.size(-2), key.size(-2)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
mask_value = torch.finfo(attn_weights.dtype).min
mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
attn_weights = torch.where(causal_mask, attn_weights, mask_value)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
if attn_weights.dtype != torch.float32:
raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
attn_weights = attn_weights.type(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
def _split_heads(self, tensor, num_heads, attn_head_size):
"""
Splits hidden_size dim into attn_head_size and num_heads
"""
new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
tensor = tensor.view(new_shape)
return tensor.permute(0, 2, 1, 3)
def _merge_heads(self, tensor, num_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden_size
"""
tensor = tensor.permute(0, 2, 1, 3).contiguous()
new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
return tensor.view(new_shape)
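A quick shape walk-through of the `_split_heads` / `_merge_heads` round trip, assuming the gpt2-small layout of 12 heads of size 64:

```python
import torch

x = torch.randn(2, 5, 768)                        # (batch, seq_len, hidden)
split = x.view(2, 5, 12, 64).permute(0, 2, 1, 3)  # (batch, heads, seq_len, head_dim)
merged = split.permute(0, 2, 1, 3).contiguous().view(2, 5, 768)
print(torch.equal(x, merged))  # True: the two operations are inverses
```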
def forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
layer_past: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
if encoder_hidden_states is not None:
if not hasattr(self, "q_attn"):
raise ValueError(
"If class is used as cross attention, the weights `q_attn` have to be defined. "
"Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
)
query = self.q_attn(hidden_states)
key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
attention_mask = encoder_attention_mask
else:
query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
query = self._split_heads(query, self.num_heads, self.head_dim)
key = self._split_heads(key, self.num_heads, self.head_dim)
value = self._split_heads(value, self.num_heads, self.head_dim)
if layer_past is not None:
past_key, past_value = layer_past
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
if use_cache is True:
present = (key, value)
else:
present = None
if self.reorder_and_upcast_attn:
attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
else:
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
attn_output = self.c_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs
class GPT2MLP(nn.Module):
def __init__(self, intermediate_size, config):
super().__init__()
embed_dim = config.hidden_size
self.c_fc = Conv1D(intermediate_size, embed_dim)
self.c_proj = Conv1D(embed_dim, intermediate_size)
self.act = ACT2FN[config.activation_function]
self.dropout = nn.Dropout(config.resid_pdrop)
def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class GPT2Block(nn.Module):
def __init__(self, config, layer_idx=None):
super().__init__()
hidden_size = config.hidden_size
inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPT2Attention(config, layer_idx=layer_idx)
self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
if config.add_cross_attention:
self.crossattention = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx)
self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.mlp = GPT2MLP(inner_dim, config)
def forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
layer_past: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs = self.attn(
hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
hidden_states = attn_output + residual
if encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
"cross-attention layers by setting `config.add_cross_attention=True`"
)
residual = hidden_states
hidden_states = self.ln_cross_attn(hidden_states)
cross_attn_outputs = self.crossattention(
hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
)
attn_output = cross_attn_outputs[0]
hidden_states = residual + attn_output
outputs = outputs + cross_attn_outputs[2:]
residual = hidden_states
hidden_states = self.ln_2(hidden_states)
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = residual + feed_forward_hidden_states
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
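GPT2Block applies the pre-LayerNorm residual pattern twice, once around attention and once around the MLP. A minimal sketch of that pattern with illustrative layer names (not the block's own attributes):

```python
import torch
from torch import nn

ln = nn.LayerNorm(768)
sublayer = nn.Linear(768, 768)  # stands in for the attention or MLP sublayer

x = torch.randn(2, 5, 768)
x = x + sublayer(ln(x))  # normalize first, then add the residual
```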
class GPT2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPT2Config
load_tf_weights = load_tf_weights_in_gpt2
base_model_prefix = "transformer"
is_parallelizable = True
supports_gradient_checkpointing = True
_no_split_modules = ["GPT2Block"]
_skip_keys_device_placement = "past_key_values"
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear, Conv1D)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
for name, p in module.named_parameters():
if name == "c_proj.weight":
p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
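The loop above implements the scaled initialization from the GPT-2 paper: every residual projection `c_proj.weight` is drawn with a standard deviation divided by sqrt(2 * n_layer). A quick empirical check on a randomly initialized model (nothing is downloaded):

```python
import math
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(n_layer=12, initializer_range=0.02)
model = GPT2LMHeadModel(config)
w = model.transformer.h[0].attn.c_proj.weight
print(w.std().item(), 0.02 / math.sqrt(2 * 12))  # empirical std is close to the scaled target
```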
@dataclass
class GPT2DoubleHeadsModelOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
"""
loss: Optional[torch.FloatTensor] = None
mc_loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mc_logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
GPT2_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT2_INPUTS_DOCSTRING = r"""
"""
PARALLELIZE_DOCSTRING = r"""
This is an experimental feature and is subject to change at a moment's notice.
Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
it will evenly distribute blocks across all devices.
Args:
device_map (`Dict[int, list]`, optional, defaults to None):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
following number of attention modules:
- openai-community/gpt2: 12
- openai-community/gpt2-medium: 24
- openai-community/gpt2-large: 36
- openai-community/gpt2-xl: 48
Example:
```
# Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl")
device_map = {
0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
}
model.parallelize(device_map)
```
"""
DEPARALLELIZE_DOCSTRING = r"""
Moves the model to cpu from a model parallel state.
Example:
```
# On a 4 GPU machine with openai-community/gpt2-large:
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
device_map = {
0: [0, 1, 2, 3, 4, 5, 6, 7],
1: [8, 9, 10, 11, 12, 13, 14, 15],
2: [16, 17, 18, 19, 20, 21, 22, 23],
3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
}
model.parallelize(device_map) # Splits the model across several devices
# Put the model back on CPU (leaving model-parallel mode) and free GPU memory by calling torch.cuda.empty_cache()
model.deparallelize()
"""
@add_start_docstrings(
"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
)
class GPT2Model(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.embed_dim = config.hidden_size
self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
self.drop = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)])
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.model_parallel = False
self.device_map = None
self.gradient_checkpointing = False
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
warnings.warn(
"`GPT2Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
" model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
" `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1,"
" ...}",
FutureWarning,
)
self.device_map = (
get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
)
assert_device_map(self.device_map, len(self.h))
self.model_parallel = True
self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
self.last_device = "cuda:" + str(max(self.device_map.keys()))
self.wte = self.wte.to(self.first_device)
self.wpe = self.wpe.to(self.first_device)
for k, v in self.device_map.items():
for block in v:
cuda_device = "cuda:" + str(k)
self.h[block] = self.h[block].to(cuda_device)
self.ln_f = self.ln_f.to(self.last_device)
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.model_parallel = False
self.device_map = None
self.first_device = "cpu"
self.last_device = "cpu"
self.wte = self.wte.to("cpu")
self.wpe = self.wpe.to("cpu")
for index in range(len(self.h)):
self.h[index] = self.h[index].to("cpu")
self.ln_f = self.ln_f.to("cpu")
torch.cuda.empty_cache()
def get_input_embeddings(self):
return self.wte
def set_input_embeddings(self, new_embeddings):
self.wte = new_embeddings
def _prune_heads(self, heads_to_prune):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
...
@add_start_docstrings(
"""
The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT2_START_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = GPT2Model(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.model_parallel = False
self.device_map = None
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
warnings.warn(
"`GPT2LMHeadModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
" your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
" `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':"
" 0, 'transformer.h.1': 1, ...}",
FutureWarning,
)
self.device_map = (
get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
assert_device_map(self.device_map, len(self.transformer.h))
self.transformer.parallelize(self.device_map)
self.lm_head = self.lm_head.to(self.transformer.first_device)
self.model_parallel = True
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.transformer.deparallelize()
self.transformer = self.transformer.to("cpu")
self.lm_head = self.lm_head.to("cpu")
self.model_parallel = False
torch.cuda.empty_cache()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
else:
position_ids = None
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
)
return model_inputs
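During generation, these prepared inputs let the model consume only the newly generated token at each step. A sketch of doing the same loop by hand with `past_key_values` (checkpoint and prompt are illustrative):

```python
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").eval()

input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
past = None
with torch.no_grad():
    for _ in range(5):
        # After the first step, only the last token is fed; the cache carries the rest.
        out = model(input_ids if past is None else input_ids[:, -1:], past_key_values=past, use_cache=True)
        past = out.past_key_values
        next_token = out.logits[:, -1].argmax(-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_token], dim=-1)
print(tokenizer.decode(input_ids[0]))
```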
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
if self.model_parallel:
torch.cuda.set_device(self.transformer.first_device)
hidden_states = hidden_states.to(self.lm_head.weight.device)
lm_logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
labels = labels.to(lm_logits.device)
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
cross_attentions=transformer_outputs.cross_attentions,
)
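Because the labels are shifted inside `forward`, passing `labels=input_ids` is enough to obtain the standard next-token cross-entropy. A short sketch:

```python
from transformers import AutoTokenizer, GPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

enc = tokenizer("GPT-2 is a causal language model.", return_tensors="pt")
out = model(**enc, labels=enc["input_ids"])
print(out.loss)          # scalar cross-entropy over the shifted positions
print(out.logits.shape)  # (1, seq_len, vocab_size)
```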
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
@add_start_docstrings(
"""
The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
input embeddings, the classification head takes as input the input of a specified classification token index in the
input sequence).
""",
GPT2_START_DOCSTRING,
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
config.num_labels = 1
self.transformer = GPT2Model(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.multiple_choice_head = SequenceSummary(config)
self.model_parallel = False
self.device_map = None
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
warnings.warn(
"`GPT2DoubleHeadsModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should"
" load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your"
" own `device_map` but it needs to be a dictionary module_name to device, so for instance"
" {'transformer.h.0': 0, 'transformer.h.1': 1, ...}",
FutureWarning,
)
self.device_map = (
get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
assert_device_map(self.device_map, len(self.transformer.h))
self.transformer.parallelize(self.device_map)
self.lm_head = self.lm_head.to(self.transformer.first_device)
self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device)
self.model_parallel = True
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.transformer.deparallelize()
self.transformer = self.transformer.to("cpu")
self.lm_head = self.lm_head.to("cpu")
self.multiple_choice_head = self.multiple_choice_head.to("cpu")
self.model_parallel = False
torch.cuda.empty_cache()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
else:
position_ids = None
return {
"input_ids": input_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
mc_token_ids: Optional[torch.LongTensor] = None,
labels: Optional[torch.LongTensor] = None,
mc_labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
pass
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
"""
The GPT2 Model transformer with a sequence classification head on top (linear layer).
[`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models (e.g. GPT-1) do.
Since it does classification on the last token, it requires knowing the position of the last token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (takes the last value in each row of the batch).
"""
@add_start_docstrings(
"""
GPT2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
""",
GPT2_START_DOCSTRING,
)
class GPT2ForTokenClassification(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPT2Model(config)
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
classifier_dropout = config.hidden_dropout
else:
classifier_dropout = 0.1
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.model_parallel = False
self.device_map = None
self.post_init()
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint="brad1141/gpt2-finetuned-comp2",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_loss=0.25,
expected_output=[
"Lead",
"Lead",
"Lead",
"Position",
"Lead",
"Lead",
"Lead",
"Lead",
"Lead",
"Lead",
"Lead",
"Lead",
],
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
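A usage sketch for the token-classification head; the checkpoint is the one referenced in the docstring example above, and any GPT-2 model fine-tuned for token classification works the same way:

```python
import torch
from transformers import AutoTokenizer, GPT2ForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("brad1141/gpt2-finetuned-comp2")
model = GPT2ForTokenClassification.from_pretrained("brad1141/gpt2-finetuned-comp2")

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
predicted = logits.argmax(-1)
labels = [model.config.id2label[i] for i in predicted[0].tolist()]  # one label per token
```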
@add_start_docstrings(
"""
The GPT-2 Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
GPT2_START_DOCSTRING,
)
class GPT2ForQuestionAnswering(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPT2Model(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.model_parallel = False
self.device_map = None
self.post_init()
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
real_checkpoint=_CHECKPOINT_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1).to(start_logits.device)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1).to(end_logits.device)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
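A span-extraction sketch; note that the base GPT-2 checkpoint has a randomly initialized QA head, so this only demonstrates the input and output shapes, not a meaningful answer:

```python
import torch
from transformers import AutoTokenizer, GPT2ForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = GPT2ForQuestionAnswering.from_pretrained("openai-community/gpt2")

question, context = "What is GPT-2?", "GPT-2 is a causal transformer language model."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)
start = out.start_logits.argmax(-1).item()
end = out.end_logits.argmax(-1).item()
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))
```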
.\models\gpt2\modeling_tf_gpt2.py
""" TF 2.0 OpenAI GPT-2 model."""
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
TFSequenceClassifierOutputWithPast,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFConv1D,
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
TFSequenceSummary,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_gpt2 import GPT2Config
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
_CONFIG_FOR_DOC = "GPT2Config"
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-community/gpt2",
"openai-community/gpt2-medium",
"openai-community/gpt2-large",
"openai-community/gpt2-xl",
"distilbert/distilgpt2",
]
class TFAttention(keras.layers.Layer):
def __init__(self, nx, config, scale=False, is_cross_attention=False, **kwargs):
super().__init__(**kwargs)
n_state = nx
assert n_state % config.n_head == 0
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.output_attentions = config.output_attentions
self.is_cross_attention = is_cross_attention
if self.is_cross_attention:
self.c_attn = TFConv1D(n_state * 2, nx, initializer_range=config.initializer_range, name="c_attn")
self.q_attn = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="q_attn")
else:
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = keras.layers.Dropout(config.resid_pdrop)
self.pruned_heads = set()
self.embed_dim = n_state
def prune_heads(self, heads):
pass
@staticmethod
def causal_attention_mask(nd, ns, dtype):
"""
1's in the lower triangle, counting from the lower-right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns - nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:, None]
j = tf.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)
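A quick check of `causal_attention_mask` with nd=3 query positions attending over ns=5 key positions (for example, 2 cached keys plus 3 new ones), showing the band anchored at the lower-right corner:

```python
import tensorflow as tf
from transformers.models.gpt2.modeling_tf_gpt2 import TFAttention

mask = TFAttention.causal_attention_mask(nd=3, ns=5, dtype=tf.float32)
print(mask.numpy())
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 0.]
#  [1. 1. 1. 1. 1.]]
```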
def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False):
w = tf.matmul(q, k, transpose_b=True)
if self.scale:
dk = tf.cast(shape_list(k)[-1], dtype=w.dtype)
w = w / tf.math.sqrt(dk)
if not self.is_cross_attention:
_, _, nd, ns = shape_list(w)
b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
b = tf.reshape(b, [1, 1, nd, ns])
w = w * b - 1e4 * (1 - b)
if attention_mask is not None:
attention_mask = tf.cast(attention_mask, dtype=w.dtype)
w = w + attention_mask
w = stable_softmax(w, axis=-1)
w = self.attn_dropout(w, training=training)
if head_mask is not None:
w = w * head_mask
outputs = [tf.matmul(w, v)]
if output_attentions:
outputs.append(w)
return outputs
def merge_heads(self, x):
x = tf.transpose(x, [0, 2, 1, 3])
x_shape = shape_list(x)
new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
return tf.reshape(x, new_x_shape)
def split_heads(self, x):
x_shape = shape_list(x)
new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3))
def call(
self,
x,
layer_past,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
use_cache,
output_attentions,
training=False,
):
if encoder_hidden_states is not None:
if not hasattr(self, "q_attn"):
raise ValueError(
"If class is used as cross attention, the weights `q_attn` have to be defined. "
"Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
)
query = self.q_attn(x)
kv_out = self.c_attn(encoder_hidden_states)
key, value = tf.split(kv_out, 2, axis=2)
attention_mask = encoder_attention_mask
else:
x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2)
query = self.split_heads(query)
key = self.split_heads(key)
value = self.split_heads(value)
if layer_past is not None:
past_key, past_value = tf.unstack(layer_past, axis=0, num=2)
key = tf.concat([past_key, key], axis=-2)
value = tf.concat([past_value, value], axis=-2)
if use_cache:
present = tf.stack([key, value], axis=0)
else:
present = (None,)
attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
a = attn_outputs[0]
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a, training=training)
outputs = [a, present] + attn_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if self.is_cross_attention:
c_attn_shape = 2 * self.embed_dim
else:
c_attn_shape = 3 * self.embed_dim
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.embed_dim])
if getattr(self, "c_attn", None) is not None:
with tf.name_scope(self.c_attn.name):
self.c_attn.build([None, None, c_attn_shape])
if getattr(self, "q_attn", None) is not None:
with tf.name_scope(self.q_attn.name):
self.q_attn.build([None, None, self.embed_dim])
class TFMLP(keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super().__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = get_tf_activation(config.activation_function)
self.dropout = keras.layers.Dropout(config.resid_pdrop)
self.intermediate_size = n_state
self.embed_dim = nx
def call(self, x, training=False):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
h2 = self.dropout(h2, training=training)
return h2
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_fc", None) is not None:
with tf.name_scope(self.c_fc.name):
self.c_fc.build([None, None, self.intermediate_size])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.embed_dim])
class TFBlock(keras.layers.Layer):
def __init__(self, config, scale=False, **kwargs):
super().__init__(**kwargs)
nx = config.n_embd
inner_dim = config.n_inner if config.n_inner is not None else 4 * nx
self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.attn = TFAttention(nx, config, scale, name="attn")
self.ln_2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
if config.add_cross_attention:
self.crossattention = TFAttention(nx, config, scale, name="crossattention", is_cross_attention=True)
self.ln_cross_attn = keras.layers.LayerNormalization(
epsilon=config.layer_norm_epsilon, name="ln_cross_attn"
)
self.mlp = TFMLP(inner_dim, config, name="mlp")
self.hidden_size = config.hidden_size
def call(
self,
x,
layer_past,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
use_cache,
output_attentions,
training=False,
):
a = self.ln_1(x)
output_attn = self.attn(
a,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
use_cache=use_cache,
output_attentions=output_attentions,
training=training,
)
a = output_attn[0]
outputs = output_attn[1:]
x = x + a
if encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
"cross-attention layers by setting `config.add_cross_attention=True`"
)
ca = self.ln_cross_attn(x)
output_cross_attn = self.crossattention(
ca,
layer_past=None,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=False,
output_attentions=output_attentions,
training=training,
)
ca = output_cross_attn[0]
x = x + ca
outputs = outputs + output_cross_attn[2:]
m = self.ln_2(x)
m = self.mlp(m, training=training)
x = x + m
outputs = [x] + outputs
return outputs
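TFBlock normalizes before each sub-layer and adds the sub-layer output back onto the residual stream (pre-layer-norm ordering). A minimal sketch of that control flow with Dense layers standing in for attention and the MLP (hypothetical stand-ins, illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

ln_1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
ln_2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
attn_fn = tf.keras.layers.Dense(8)   # stand-in for the self-attention sub-layer
mlp_fn = tf.keras.layers.Dense(8)    # stand-in for the MLP sub-layer

x = tf.random.normal((1, 5, 8))      # (batch, seq_len, hidden)
x = x + attn_fn(ln_1(x))             # residual around the normed attention sub-layer
x = x + mlp_fn(ln_2(x))              # residual around the normed MLP sub-layer
print(x.shape)                       # (1, 5, 8)
# --- end sketch ---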
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "ln_1", None) is not None:
with tf.name_scope(self.ln_1.name):
self.ln_1.build([None, None, self.hidden_size])
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "ln_2", None) is not None:
with tf.name_scope(self.ln_2.name):
self.ln_2.build([None, None, self.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
if getattr(self, "ln_cross_attn", None) is not None:
with tf.name_scope(self.ln_cross_attn.name):
self.ln_cross_attn.build([None, None, self.hidden_size])
@keras_serializable
class TFGPT2MainLayer(keras.layers.Layer):
config_class = GPT2Config
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
self.config = config
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.use_cache = config.use_cache
self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer
self.n_embd = config.n_embd
self.n_positions = config.n_positions
self.initializer_range = config.initializer_range
self.wte = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="wte",
)
self.wpe = keras.layers.Embedding(
input_dim=config.n_positions,
output_dim=config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name="wpe",
)
self.drop = keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
self.ln_f = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
self.embed_dim = config.hidden_size
def get_input_embeddings(self):
return self.wte
def set_input_embeddings(self, new_embeddings):
self.wte = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wte", None) is not None:
with tf.name_scope(self.wte.name):
self.wte.build(None)
if getattr(self, "wpe", None) is not None:
with tf.name_scope(self.wpe.name):
self.wpe.build(None)
if getattr(self, "ln_f", None) is not None:
with tf.name_scope(self.ln_f.name):
self.ln_f.build([None, None, self.embed_dim])
if getattr(self, "h", None) is not None:
for layer in self.h:
with tf.name_scope(layer.name):
layer.build(None)
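The call body of TFGPT2MainLayer is elided in this excerpt; conceptually, GPT-2 sums the `wte` token embeddings and the `wpe` position embeddings before feeding the blocks. A minimal sketch of that lookup with plain Keras layers (an assumption about the elided body, illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

vocab_size, n_positions, hidden = 50257, 1024, 8
wte = tf.keras.layers.Embedding(vocab_size, hidden)   # token embeddings
wpe = tf.keras.layers.Embedding(n_positions, hidden)  # learned position embeddings

input_ids = tf.constant([[10, 20, 30]])
position_ids = tf.range(tf.shape(input_ids)[-1])[tf.newaxis, :]   # [[0, 1, 2]]
hidden_states = wte(input_ids) + wpe(position_ids)                # (1, 3, hidden)
print(hidden_states.shape)
# --- end sketch ---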
class TFGPT2PreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPT2Config
base_model_prefix = "transformer"
_keys_to_ignore_on_load_unexpected = [r"h.\d+.attn.bias", r"h.\d+.crossattention.bias"]
@property
def input_signature(self):
return {
"input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
}
@dataclass
class TFGPT2DoubleHeadsModelOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
logits (`tf.Tensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
mc_logits: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
GPT2_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT2_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
)
class TFGPT2Model(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
# The decorator above adds a code-sample docstring, specifying the documentation checkpoint, output type, and config class.
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
r"""
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have
their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*, defaults to `True`):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past`). Set to `False` during training, `True` during generation
"""
outputs = self.transformer(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
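To illustrate the `past_key_values`/`use_cache` behaviour documented in the call above, a small sketch of two-step decoding with TFGPT2Model (assuming the "gpt2" checkpoint is available; illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2Model.from_pretrained("gpt2")

# First pass over the full prompt, asking for the cache
prompt = tokenizer("The quick brown fox", return_tensors="tf")
out = model(prompt["input_ids"], use_cache=True)
past = out.past_key_values

# Second pass: only the newest token is fed, the cache supplies the earlier positions
next_token = tf.constant([[tokenizer.encode(" jumps")[0]]])
out2 = model(next_token, past_key_values=past, use_cache=True)
print(out2.last_hidden_state.shape)  # (1, 1, hidden_size)
# --- end sketch ---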
"""
The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
"""
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer")
def get_output_embeddings(self):
return self.get_input_embeddings()
def set_output_embeddings(self, value):
self.set_input_embeddings(value)
def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
inputs = tf.expand_dims(inputs[:, -1], -1)
if token_type_ids is not None:
token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
position_ids = kwargs.get("position_ids", None)
attention_mask = kwargs.get("attention_mask", None)
if attention_mask is not None and position_ids is None:
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
if past_key_values:
position_ids = tf.expand_dims(position_ids[:, -1], -1)
return {
"input_ids": inputs,
"attention_mask": attention_mask,
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": use_cache,
"token_type_ids": token_type_ids,
}
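The exclusive cumulative sum above turns an attention mask into 0-based position ids that skip left padding; when a cache is present, only the last position is kept. A small numeric sketch (illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

attention_mask = tf.constant([[0, 0, 1, 1, 1]])                          # two left-padding tokens
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)   # [[0, 0, 0, 1, 2]]
print(position_ids.numpy())

# With past_key_values, only the position of the newest token is used
last_position = tf.expand_dims(position_ids[:, -1], -1)                  # [[2]]
print(last_position.numpy())
# --- end sketch ---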
@unpack_inputs
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
"""
通过在顶部添加多选分类头来扩展 GPT2 模型变换器,例如用于 RocStories/SWAG 任务。这两个头部都是线性层。语言建模头部将其权重绑定到输入嵌入,分类头部将输入分类令牌索引的输入序列。
""",
GPT2_START_DOCSTRING,
)
class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFGPT2MainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="multiple_choice_head"
)
@unpack_inputs
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
mc_token_ids: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
@property
def input_signature(self):
return {
"input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"),
"attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"),
"mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="mc_token_ids"),
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "multiple_choice_head", None) is not None:
with tf.name_scope(self.multiple_choice_head.name):
self.multiple_choice_head.build(None)
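The input signature above is rank 3 because the double-heads model scores `num_choices` candidate sequences per example, and `mc_token_ids` points at the classification token inside each candidate. A shape-only sketch of such inputs (random ids, illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = tf.random.uniform((batch_size, num_choices, seq_len), maxval=50257, dtype=tf.int32)
attention_mask = tf.ones((batch_size, num_choices, seq_len), dtype=tf.int32)
mc_token_ids = tf.fill((batch_size, num_choices), seq_len - 1)  # classify on the last token of each choice
# Fed to TFGPT2DoubleHeadsModel, these would yield lm logits of shape
# (batch_size, num_choices, seq_len, vocab_size) and mc_logits of shape (batch_size, num_choices).
# --- end sketch ---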
@add_start_docstrings(
"""
The GPT2 Model transformer with a sequence classification head on top (linear layer).
[`TFGPT2ForSequenceClassification`] uses the last token of the sequence in order to do the classification. Since it
does classification on the last token, it needs to know the position of that token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
GPT2_START_DOCSTRING,
)
# TFGPT2ForSequenceClassification inherits from TFGPT2PreTrainedModel and TFSequenceClassificationLoss
class TFGPT2ForSequenceClassification(TFGPT2PreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
# Initialize the parent class
super().__init__(config, *inputs, **kwargs)
# Number of classification labels
self.num_labels = config.num_labels
# Dense layer "score" that produces the classification logits
self.score = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="score",
use_bias=False,
)
# GPT-2 main layer that encodes the input sequence
self.transformer = TFGPT2MainLayer(config, name="transformer")
# Keep a reference to the config
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint="microsoft/DialogRPT-updown",
output_type=TFSequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass over the inputs
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutputWithPast, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
config.vocab_size - 1]`.
"""
# Run the transformer over the inputs
transformer_outputs = self.transformer(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Hidden states of the last transformer layer
hidden_states = transformer_outputs[0]
# Project the hidden states to classification logits
logits = self.score(hidden_states)
# Shape of the logits
logits_shape = shape_list(logits)
# Logits gathered at the last non-padding position (filled in below)
in_logits = None
# No pad token defined: fall back to the last position of every row
if self.config.pad_token_id is None:
# Use the last position in each sequence
sequence_lengths = -1
else:
# With input_ids available, locate the last non-padding token
if input_ids is not None:
# Index of the last non-padding token in each row
sequence_lengths = (
tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
- 1
)
# Rows without any padding get the last position
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
# Gather the logits at those positions
in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
else:
# Only inputs_embeds were passed: fall back to the last position
sequence_lengths = -1
# Warn that padding tokens cannot be detected from inputs_embeds
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
# Classification loss (None unless labels are provided)
loss = None
# Compute the loss when labels are given
if labels is not None:
# Batch sizes > 1 require a pad token in order to locate the last token of each row
assert (
self.config.pad_token_id is not None or logits_shape[0] == 1
), "Cannot handle batch sizes > 1 if no padding token is defined."
# When sequence_lengths is a plain int (-1), slice the logits at the last position
if not tf.is_tensor(sequence_lengths):
in_logits = logits[0 : logits_shape[0], sequence_lengths]
# Cross-entropy loss over the pooled logits
loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(in_logits, [-1, self.num_labels]))
# Fall back to the full logits if none were gathered
pooled_logits = in_logits if in_logits is not None else logits
# Return a plain tuple when return_dict=False
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFSequenceClassifierOutputWithPast
return TFSequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
# Build the model's sub-layers, optionally given an input shape
def build(self, input_shape=None):
# Skip if the layers have already been built
if self.built:
return
self.built = True
# Build the classification head
if getattr(self, "score", None) is not None:
with tf.name_scope(self.score.name):
self.score.build([None, None, self.config.n_embd])
# Build the transformer body
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
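A standalone numeric sketch of the last-non-padding-token selection performed in the call above (pad token id 0 assumed, illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

pad_token_id = 0
input_ids = tf.constant([[5, 6, 7, 0, 0],      # padded row: last real token at index 2
                         [5, 6, 7, 8, 9]])     # full row: no padding at all
logits = tf.random.normal((2, 5, 3))           # (batch, seq_len, num_labels)

# Index of the first pad token minus one gives the last real token; -1 means "no pad found"
sequence_lengths = tf.argmax(tf.cast(tf.math.equal(input_ids, pad_token_id), input_ids.dtype), axis=-1) - 1
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
pooled_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
print(sequence_lengths.numpy())   # [2 4]
print(pooled_logits.shape)        # (2, 3)
# --- end sketch ---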