Transformers 源码解析（三十）

`.\models\convnextv2\modeling_convnextv2.py`

# coding=utf-8
# Copyright 2023 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch ConvNextV2 model."""


from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_convnextv2 import ConvNextV2Config


# Module-level logger obtained from the library's logging helper.
logger = logging.get_logger(__name__)

# General docstring
# Name of the configuration class, passed as `config_class` to the docstring decorators below.
_CONFIG_FOR_DOC = "ConvNextV2Config"

# Base docstring
# Checkpoint and expected output shape used by the base model's generated code sample.
_CHECKPOINT_FOR_DOC = "facebook/convnextv2-tiny-1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]

# Image classification docstring
# Checkpoint and expected label used by the image-classification code sample.
_IMAGE_CLASS_CHECKPOINT = "facebook/convnextv2-tiny-1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

# Known pretrained checkpoint identifiers for this architecture.
CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/convnextv2-tiny-1k-224",
    # See all ConvNextV2 models at https://huggingface.co/models?filter=convnextv2
]


# Adapted from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.

    Args:
        input (`torch.Tensor`): Tensor whose first dimension indexes the samples.
        drop_prob (`float`, *optional*, defaults to 0.0): Probability of zeroing a whole sample. `None` is treated
            like `0.0`.
        training (`bool`, *optional*, defaults to `False`): Paths are only dropped when this is `True`.

    Returns:
        `torch.Tensor`: `input` itself when nothing is dropped; otherwise a tensor of the same shape in which each
        sample is either zeroed or rescaled by `1 / (1 - drop_prob)`.
    """
    # Nothing to do in eval mode or when the drop probability is unset / zero.
    if not training or not drop_prob:
        return input

    keep_prob = 1 - drop_prob
    if keep_prob <= 0.0:
        # Every path is dropped. Dividing by keep_prob == 0 would produce inf * 0 = NaN, so return zeros directly.
        return torch.zeros_like(input)

    # One random draw per sample, broadcast over all remaining dimensions
    # (works with tensors of any rank, not just 2D ConvNets).
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize: 1 with probability keep_prob, else 0
    # Rescale kept samples so the expected value of the output matches the input.
    return input.div(keep_prob) * random_tensor
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->ConvNextV2
class ConvNextV2DropPath(nn.Module):
    """Module wrapper around `drop_path` (Stochastic Depth applied per sample in the main path of residual blocks).

    Args:
        drop_prob (`float`, *optional*): Probability of dropping a sample's path; stored as-is.
    """

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Delegate to the functional form; `self.training` follows train()/eval().
        return drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)

    def extra_repr(self) -> str:
        # Shown inside the module's repr, e.g. "ConvNextV2DropPath(p=0.1)".
        return f"p={self.drop_prob}"


# Copied from transformers.models.convnext.modeling_convnext.ConvNextLayerNorm with ConvNext->ConvNextV2
class ConvNextV2LayerNorm(nn.Module):
    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).

    Args:
        normalized_shape (`int`): Size of the channel dimension being normalized.
        eps (`float`, *optional*, defaults to 1e-6): Value added to the variance for numerical stability.
        data_format (`str`, *optional*, defaults to `"channels_last"`): Either `"channels_last"` or
            `"channels_first"`.

    Raises:
        NotImplementedError: If `data_format` is not one of the two supported values.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        # Learnable per-channel scale and shift, initialised to the identity transform.
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError(f"Unsupported data format: {self.data_format}")
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.data_format == "channels_last":
            # Channels are the trailing dimension, so the built-in functional layer norm applies directly.
            return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)

        if self.data_format == "channels_first":
            # Normalize over dim 1 by hand; statistics are computed in float32 and cast back afterwards.
            original_dtype = x.dtype
            x = x.float()
            mean = x.mean(1, keepdim=True)
            centered = x - mean
            variance = centered.pow(2).mean(1, keepdim=True)
            x = centered / torch.sqrt(variance + self.eps)
            x = x.to(dtype=original_dtype)
            # Broadcast the per-channel affine parameters over the two trailing dimensions.
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


# Copied from transformers.models.convnext.modeling_convnext.ConvNextEmbeddings with ConvNext->ConvNextV2
class ConvNextV2Embeddings(nn.Module):
    """This class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.

    It turns pixel values into patch embeddings with a strided convolution followed by a channels-first layer norm.
    """

    def __init__(self, config):
        super().__init__()
        patch_size = config.patch_size
        stem_channels = config.hidden_sizes[0]
        # Kernel size equals stride, so the convolution sees non-overlapping patches.
        self.patch_embeddings = nn.Conv2d(
            config.num_channels, stem_channels, kernel_size=patch_size, stride=patch_size
        )
        self.layernorm = ConvNextV2LayerNorm(stem_channels, eps=1e-6, data_format="channels_first")
        self.num_channels = config.num_channels

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        # Fail early with a readable message instead of an opaque convolution shape error.
        if pixel_values.shape[1] != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        patches = self.patch_embeddings(pixel_values)
        return self.layernorm(patches)


class ConvNextV2GRN(nn.Module):
    """GRN (Global Response Normalization) layer.

    Operates on channels-last tensors of shape (batch_size, height, width, channels). Both learnable parameters are
    zero-initialised, so the layer starts out as the identity mapping.

    Args:
        dim (`int`): Number of channels of the input.
    """

    def __init__(self, dim: int):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        # Per-channel L2 norm aggregated over the spatial dimensions.
        global_features = torch.norm(hidden_states, p=2, dim=(1, 2), keepdim=True)
        # Divisive normalization across channels; the epsilon guards against a zero mean.
        norm_features = global_features / (global_features.mean(dim=-1, keepdim=True) + 1e-6)
        # Calibrate the input with the normalized response and add the residual connection.
        hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states

        return hidden_states


class ConvNextV2Layer(nn.Module):
    """This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: (1) [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in
    (N, C, H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    """

    def __init__(self, config, dim, drop_path=0):
        super().__init__()
        # Depthwise 7x7 convolution (groups == channels); padding keeps the spatial size.
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        # Channels-last layer norm, applied after the permute in `forward`.
        self.layernorm = ConvNextV2LayerNorm(dim, eps=1e-6)
        # Pointwise (1x1) convolutions implemented as linear layers on channels-last tensors.
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        # Activation function looked up from the configuration.
        self.act = ACT2FN[config.hidden_act]
        # Global Response Normalization on the expanded (4 * dim) features.
        self.grn = ConvNextV2GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        # Stochastic depth on the residual branch, only when a positive rate is given.
        self.drop_path = ConvNextV2DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        input = hidden_states
        x = self.dwconv(hidden_states)
        # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
        x = x.permute(0, 2, 3, 1)
        x = self.layernorm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
        x = x.permute(0, 3, 1, 2)

        # Residual connection around the (possibly dropped) block output.
        x = input + self.drop_path(x)
        return x


# Copied from transformers.models.convnext.modeling_convnext.ConvNextStage with ConvNeXT->ConvNeXTV2, ConvNext->ConvNextV2
class ConvNextV2Stage(nn.Module):
    """ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        kernel_size (`int`): Kernel size of the downsampling convolution. Default: 2.
        stride (`int`): Stride of the downsampling convolution. Default: 2.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`List[float]`): Stochastic depth rates for each layer.
    """

    def __init__(self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None):
        super().__init__()

        # Downsample only when the channel count changes or the stage reduces resolution.
        needs_downsampling = in_channels != out_channels or stride > 1
        if not needs_downsampling:
            self.downsampling_layer = nn.Identity()
        else:
            norm = ConvNextV2LayerNorm(in_channels, eps=1e-6, data_format="channels_first")
            conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride)
            self.downsampling_layer = nn.Sequential(norm, conv)

        # A missing (or empty) schedule means no stochastic depth in any block.
        if not drop_path_rates:
            drop_path_rates = [0.0] * depth
        blocks = []
        for block_idx in range(depth):
            blocks.append(ConvNextV2Layer(config, dim=out_channels, drop_path=drop_path_rates[block_idx]))
        self.layers = nn.Sequential(*blocks)

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        downsampled = self.downsampling_layer(hidden_states)
        return self.layers(downsampled)
# Copied from transformers.models.convnext.modeling_convnext.ConvNextEncoder with ConvNext->ConvNextV2
class ConvNextV2Encoder(nn.Module):
    """Stack of `config.num_stages` ConvNextV2 stages applied one after another."""

    def __init__(self, config):
        super().__init__()
        self.stages = nn.ModuleList()
        # Linearly increasing stochastic-depth rates over all blocks, grouped per stage.
        schedule = torch.linspace(0, config.drop_path_rate, sum(config.depths))
        drop_path_rates = [chunk.tolist() for chunk in schedule.split(config.depths)]

        in_channels = config.hidden_sizes[0]
        for stage_idx in range(config.num_stages):
            out_channels = config.hidden_sizes[stage_idx]
            self.stages.append(
                ConvNextV2Stage(
                    config,
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # The first stage keeps the resolution produced by the embeddings.
                    stride=1 if stage_idx == 0 else 2,
                    depth=config.depths[stage_idx],
                    drop_path_rates=drop_path_rates[stage_idx],
                )
            )
            in_channels = out_channels

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
        """Run all stages.

        When `output_hidden_states` is set, the input of every stage plus the final output are collected, so the
        returned tuple has `len(self.stages) + 1` entries. With `return_dict=False` a plain tuple of the non-`None`
        values is returned instead of a `BaseModelOutputWithNoAttention`.
        """
        all_hidden_states = () if output_hidden_states else None

        for stage in self.stages:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            hidden_states = stage(hidden_states)

        # Also record the output of the last stage.
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if return_dict:
            return BaseModelOutputWithNoAttention(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states,
            )
        return tuple(value for value in (hidden_states, all_hidden_states) if value is not None)


# Copied from transformers.models.convnext.modeling_convnext.ConvNextPreTrainedModel with ConvNext->ConvNextV2, convnext->convnextv2
class ConvNextV2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = ConvNextV2Config
    # Attribute name under which head models store the base model.
    base_model_prefix = "convnextv2"
    # Name of the main input of `forward`.
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """Initialize the weights of a single sub-module."""
        if isinstance(module, nn.LayerNorm):
            # Identity affine transform.
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
            return

        if not isinstance(module, (nn.Linear, nn.Conv2d)):
            return

        # Linear / Conv2d: normally distributed weights with the configured std, zero bias.
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()


# Shared class-level docstring appended to every ConvNextV2 model by `add_start_docstrings`.
# It is user-facing documentation, so it must stay a complete English text without embedded `#` comment lines.
CONVNEXTV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ConvNextV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared `forward` argument documentation, attached by `add_start_docstrings_to_model_forward`.
CONVNEXTV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`ConvNextImageProcessor`]. See
            [`ConvNextImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

@add_start_docstrings(
    "The bare ConvNextV2 model outputting raw features without any specific head on top.",
    CONVNEXTV2_START_DOCSTRING,
)
# Copied from transformers.models.convnext.modeling_convnext.ConvNextModel with CONVNEXT->CONVNEXTV2, ConvNext->ConvNextV2
class ConvNextV2Model(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # Patch embeddings followed by the stack of ConvNextV2 stages.
        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)

        # Final layer norm, applied to the pooled features of the last stage.
        self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
        # Fall back to the configuration for any flag that was not passed explicitly.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 0 is the last hidden state for both the tuple and the ModelOutput form.
        last_hidden_state = encoder_outputs[0]

        # global average pooling, (N, C, H, W) -> (N, C)
        pooled_output = self.layernorm(last_hidden_state.mean([-2, -1]))

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )

@add_start_docstrings(
    """
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2ForImageClassification(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.convnextv2 = ConvNextV2Model(config)

        # Classifier head; an identity mapping when the configuration defines no labels.
        self.classifier = (
            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.convnextv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        # The pooled features sit at index 1 of the tuple output.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Infer the problem type once from the label count and dtype, and cache it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Drop last_hidden_state and pooler_output; keep whatever follows (the hidden states, if requested).
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )
@add_start_docstrings(
    """
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    CONVNEXTV2_START_DOCSTRING,
)
# Copied from transformers.models.convnext.modeling_convnext.ConvNextBackbone with CONVNEXT->CONVNEXTV2,ConvNext->ConvNextV2
class ConvNextV2Backbone(ConvNextV2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        # Backbone bookkeeping provided by `BackboneMixin`.
        super()._init_backbone(config)

        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)
        # Channel count of the embedding output followed by the channel count of every stage.
        self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes

        # Add layer norms to hidden states of out_features
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = ConvNextV2LayerNorm(num_channels, data_format="channels_first")
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # Initialize weights and apply final processing
        self.post_init()

    # NOTE: `replace_return_docstrings` needs a bare "Returns:" placeholder line in the docstring below;
    # keep that line untranslated and free of trailing text.
    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        # Fall back to the configuration for any flag that was not passed explicitly.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output = self.embeddings(pixel_values)

        # Hidden states are always requested here because the feature maps are selected from them;
        # `output_hidden_states` only controls whether they are also returned to the caller.
        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # Keep the normalized hidden state of every stage listed in `out_features`.
        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                feature_maps += (hidden_state,)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=hidden_states if output_hidden_states else None,
            attentions=None,
        )

`.\models\convnextv2\modeling_tf_convnextv2.py`

# coding=utf-8
# Copyright 2023 Meta Platforms Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 ConvNextV2 model."""

from __future__ import annotations

from typing import List, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutputWithNoAttention,
    TFBaseModelOutputWithPooling,
    TFBaseModelOutputWithPoolingAndNoAttention,
    TFImageClassifierOutputWithNoAttention,
)
from ...modeling_tf_utils import (
    TFModelInputType,
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_convnextv2 import ConvNextV2Config

# Module-level logger obtained from the library's logging helper.
logger = logging.get_logger(__name__)

# General docstring
# Name of the configuration class referenced by generated documentation.
_CONFIG_FOR_DOC = "ConvNextV2Config"

# Base docstring
# Checkpoint and expected output shape for the base model's documentation sample.
_CHECKPOINT_FOR_DOC = "facebook/convnextv2-tiny-1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]

# Image classification docstring
# Checkpoint and expected label for the image-classification documentation sample.
_IMAGE_CLASS_CHECKPOINT = "facebook/convnextv2-tiny-1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

# Known pretrained checkpoint identifiers for this architecture.
CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/convnextv2-tiny-1k-224",
    # See all ConvNextV2 models at https://huggingface.co/models?filter=convnextv2
]

# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->ConvNextV2
class TFConvNextV2DropPath(keras.layers.Layer):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    References:
        (1) github.com:rwightman/pytorch-image-models

    Args:
        drop_path (`float`): Probability of dropping a sample's path during training.
    """

    def __init__(self, drop_path: float, **kwargs):
        super().__init__(**kwargs)
        self.drop_path = drop_path

    def call(self, x: tf.Tensor, training=None):
        # Outside of training the layer is the identity.
        if not training:
            return x

        keep_prob = 1 - self.drop_path
        # One random draw per sample, broadcast over all remaining dimensions.
        batch_size = tf.shape(x)[0]
        mask_shape = (batch_size,) + (1,) * (len(tf.shape(x)) - 1)
        # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob and 0 otherwise.
        keep_mask = tf.floor(keep_prob + tf.random.uniform(mask_shape, 0, 1))
        # Rescale the kept samples so the expected output matches the input.
        return (x / keep_prob) * keep_mask


class TFConvNextV2GRN(keras.layers.Layer):
    """GRN (Global Response Normalization) layer

    Args:
        config ([`ConvNextV2Config`]): Model configuration class (accepted but not read by this layer).
        dim (`int`): Size of the last dimension of the layer's weight and bias.
    """

    def __init__(self, config: ConvNextV2Config, dim: int, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim

    def build(self, input_shape: tf.TensorShape = None):
        # Both parameters start at zero, so the freshly built layer returns its input unchanged.
        param_shape = (1, 1, 1, self.dim)
        zeros = keras.initializers.Zeros
        self.weight = self.add_weight(name="weight", shape=param_shape, initializer=zeros())
        self.bias = self.add_weight(name="bias", shape=param_shape, initializer=zeros())
        return super().build(input_shape)

    def call(self, hidden_states: tf.Tensor):
        # Euclidean norm over axes 1 and 2, kept as singleton dimensions for broadcasting.
        global_features = tf.norm(hidden_states, ord="euclidean", axis=(1, 2), keepdims=True)
        # Divide by the mean over the last axis; the epsilon guards against division by zero.
        mean_response = tf.reduce_mean(global_features, axis=-1, keepdims=True)
        norm_features = global_features / (mean_response + 1e-6)
        # Scale the input by the normalized response, add the bias and the residual connection.
        return self.weight * (hidden_states * norm_features) + self.bias + hidden_states
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextEmbeddings with ConvNext->ConvNextV2
class TFConvNextV2Embeddings(keras.layers.Layer):
    """This class is comparable to (and inspired by) the SwinEmbeddings class found in
    src/transformers/models/swin/modeling_swin.py.
    """

    def __init__(self, config: ConvNextV2Config, **kwargs):
        super().__init__(**kwargs)
        # Convolution that turns the image into patch embeddings; kernel size and stride are both
        # `config.patch_size`.
        self.patch_embeddings = keras.layers.Conv2D(
            filters=config.hidden_sizes[0],    # number of output features
            kernel_size=config.patch_size,     # patch size
            strides=config.patch_size,         # stride
            name="patch_embeddings",           # layer name
            kernel_initializer=get_initializer(config.initializer_range),  # kernel initializer
            bias_initializer=keras.initializers.Zeros(),  # bias initializer
        )
        # Layer normalization applied to the patch embeddings.
        self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
        self.num_channels = config.num_channels  # expected number of input channels
        self.config = config  # kept for `build`

    def call(self, pixel_values):
        # Accept a dict input and pull the tensor out of it.
        if isinstance(pixel_values, dict):
            pixel_values = pixel_values["pixel_values"]

        # Axis 1 of the input must match the configured number of channels
        # (the input is transposed from NCHW below).
        tf.debugging.assert_equal(
            shape_list(pixel_values)[1],
            self.num_channels,
            message="确保像素值的通道维度与配置中设置的一致。",
        )

        # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
        # So change the input format from `NCHW` to `NHWC`.
        # shape = (batch_size, in_height, in_width, in_channels)
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        # Extract the patch embeddings.
        embeddings = self.patch_embeddings(pixel_values)
        # Apply layer normalization.
        embeddings = self.layernorm(embeddings)
        return embeddings

    def build(self, input_shape=None):
        # Nothing to do when the layer has already been built.
        if self.built:
            return
        self.built = True
        # Build each sub-layer under its own name scope with an explicit input shape.
        if getattr(self, "patch_embeddings", None) is not None:
            with tf.name_scope(self.patch_embeddings.name):
                # The convolution receives NHWC input with `config.num_channels` channels.
                self.patch_embeddings.build([None, None, None, self.config.num_channels])
        if getattr(self, "layernorm", None) is not None:
            with tf.name_scope(self.layernorm.name):
                # The normalization receives the convolution output with `config.hidden_sizes[0]` channels.
                self.layernorm.build([None, None, None, self.config.hidden_sizes[0]])


class TFConvNextV2Layer(keras.layers.Layer):
    """This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations:
    (1) [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in (N, C, H, W)
    (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors found the second one slightly faster in PyTorch. Since the inputs have already been permuted to NHWC
    ordering, the operations are applied directly without any permutation.

    Args:
        config (`ConvNextV2Config`):
            Model configuration class.
        dim (`int`):
            Number of input channels.
        drop_path (`float`, defaults to 0.0):
            Stochastic depth rate; a `TFConvNextV2DropPath` is only created when this is greater than 0.
    """

    def __init__(self, config: ConvNextV2Config, dim: int, drop_path: float = 0.0, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        self.config = config
        expanded_dim = 4 * dim

        # Depthwise 7x7 convolution: one group per channel.
        # NOTE: `get_initializer` is deliberately called once per layer so that no initializer instance is shared.
        self.dwconv = keras.layers.Conv2D(
            filters=dim,
            kernel_size=7,
            padding="same",
            groups=dim,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer=keras.initializers.Zeros(),
            name="dwconv",
        )
        self.layernorm = keras.layers.LayerNormalization(
            epsilon=1e-6,
            name="layernorm",
        )
        # Pointwise/1x1 convolutions, implemented with dense layers.
        self.pwconv1 = keras.layers.Dense(
            units=expanded_dim,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer=keras.initializers.Zeros(),
            name="pwconv1",
        )
        self.act = get_tf_activation(config.hidden_act)
        # Global Response Normalization over the expanded channels.
        self.grn = TFConvNextV2GRN(config, expanded_dim, dtype=tf.float32, name="grn")
        self.pwconv2 = keras.layers.Dense(
            units=dim,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer=keras.initializers.Zeros(),
            name="pwconv2",
        )
        # Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour.
        if drop_path > 0.0:
            self.drop_path = TFConvNextV2DropPath(drop_path, name="drop_path")
        else:
            self.drop_path = keras.layers.Activation("linear", name="drop_path")

    def call(self, hidden_states, training=False):
        """Run the block on `hidden_states` and add the result to the input (residual connection)."""
        residual = hidden_states
        hidden_states = self.dwconv(hidden_states)
        hidden_states = self.layernorm(hidden_states)
        hidden_states = self.pwconv1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.grn(hidden_states)
        hidden_states = self.pwconv2(hidden_states)
        # Only the drop-path layer receives the `training` flag explicitly.
        hidden_states = self.drop_path(hidden_states, training=training)
        return residual + hidden_states

    def build(self, input_shape=None):
        """Build every sub-layer once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        # (attribute name, shape handed to that sub-layer's `build`), in construction order.
        build_plan = (
            ("dwconv", [None, None, None, self.dim]),
            ("layernorm", [None, None, None, self.dim]),
            ("pwconv1", [None, None, self.dim]),
            ("grn", None),
            ("pwconv2", [None, None, 4 * self.dim]),
            ("drop_path", None),
        )
        for attr_name, build_shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2
class TFConvNextV2Stage(keras.layers.Layer):
    """ConvNextV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config (`ConvNextV2Config`):
            Model configuration class.
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`):
            Number of output channels.
        kernel_size (`int`, defaults to 2):
            Kernel size of the downsampling convolution.
        stride (`int`, defaults to 2):
            Stride of the downsampling convolution.
        depth (`int`):
            Number of residual blocks.
        drop_path_rates(`List[float]`):
            Stochastic depth rates for each layer.
    """

    def __init__(
        self,
        config: ConvNextV2Config,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 2,
        stride: int = 2,
        depth: int = 2,
        drop_path_rates: Optional[List[float]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Downsampling is needed when the channel count changes or when stride > 1.
        if in_channels != out_channels or stride > 1:
            self.downsampling_layer = [
                keras.layers.LayerNormalization(
                    epsilon=1e-6,
                    name="downsampling_layer.0",
                ),
                # Inputs to this layer will follow NHWC format since we
                # transposed the inputs from NCHW to NHWC in the `TFConvNextV2Embeddings`
                # layer. All the outputs throughout the model will be in NHWC
                # from this point on until the output where we again change to
                # NCHW.
                keras.layers.Conv2D(
                    filters=out_channels,
                    kernel_size=kernel_size,
                    strides=stride,
                    kernel_initializer=get_initializer(config.initializer_range),
                    bias_initializer=keras.initializers.Zeros(),
                    name="downsampling_layer.1",
                ),
            ]
        else:
            # No downsampling: pass the input through unchanged.
            self.downsampling_layer = [tf.identity]

        # Default to a rate of 0.0 for every block when no rates are given.
        drop_path_rates = drop_path_rates or [0.0] * depth
        self.layers = [
            TFConvNextV2Layer(
                config,
                dim=out_channels,
                drop_path=drop_path_rates[j],
                name=f"layers.{j}",
            )
            for j in range(depth)
        ]
        # Kept so that `build` can tell whether the downsampling layers exist and how to shape them.
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride

    def call(self, hidden_states):
        """Apply the downsampling layers, then every residual block in order."""
        for layer in self.downsampling_layer:
            hidden_states = layer(hidden_states)
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        """Build the residual blocks and, when present, the downsampling layers (only once)."""
        if self.built:
            return
        self.built = True

        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)

        # Same condition as in `__init__`: only then does `downsampling_layer` hold Keras layers.
        if self.in_channels != self.out_channels or self.stride > 1:
            with tf.name_scope(self.downsampling_layer[0].name):
                self.downsampling_layer[0].build([None, None, None, self.in_channels])
            # The convolution consumes the normalized input, so it is also built with `in_channels`.
            with tf.name_scope(self.downsampling_layer[1].name):
                self.downsampling_layer[1].build([None, None, None, self.in_channels])
# Encoder: the ordered list of ConvNextV2 stages.
class TFConvNextV2Encoder(keras.layers.Layer):
    """Runs the hidden states through `config.num_stages` `TFConvNextV2Stage` layers in sequence.

    Args:
        config (`ConvNextV2Config`):
            Model configuration class; `depths`, `hidden_sizes`, `num_stages` and `drop_path_rate` are read from it.
    """

    def __init__(self, config: ConvNextV2Config, **kwargs):
        super().__init__(**kwargs)

        self.stages = []

        # Rates are spaced linearly from 0.0 to `config.drop_path_rate` over all blocks, then split into one
        # list per stage according to `config.depths`.
        drop_path_rates = tf.linspace(0.0, config.drop_path_rate, sum(config.depths))
        drop_path_rates = tf.split(drop_path_rates, config.depths)
        drop_path_rates = [x.numpy().tolist() for x in drop_path_rates]

        prev_chs = config.hidden_sizes[0]

        for i in range(config.num_stages):
            out_chs = config.hidden_sizes[i]
            stage = TFConvNextV2Stage(
                config,
                in_channels=prev_chs,
                out_channels=out_chs,
                # The first stage keeps the resolution; later stages use stride 2.
                stride=2 if i > 0 else 1,
                depth=config.depths[i],
                drop_path_rates=drop_path_rates[i],
                name=f"stages.{i}",
            )
            self.stages.append(stage)
            prev_chs = out_chs

    def call(
        self,
        hidden_states: tf.Tensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, TFBaseModelOutputWithNoAttention]:
        """Apply every stage in order.

        Args:
            hidden_states: Input to the first stage.
            output_hidden_states: When truthy, also collect the input of every stage plus the final output.
            return_dict: When falsy, return a tuple of the non-`None` values instead of a model output object.

        Returns:
            A `TFBaseModelOutputWithNoAttention`, or a tuple `(last_hidden_state[, all_hidden_states])`.
        """
        all_hidden_states = () if output_hidden_states else None

        for layer_module in self.stages:
            # Record the state *before* the stage runs; the final state is appended after the loop.
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            hidden_states = layer_module(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)

    def build(self, input_shape=None):
        """Build every stage inside its own name scope (only once)."""
        # Same idempotence guard as the other layers in this file.
        if self.built:
            return
        self.built = True

        for stage in self.stages:
            with tf.name_scope(stage.name):
                stage.build(None)


# Core ConvNextV2 layer; decorated with `keras_serializable`.
@keras_serializable
class TFConvNextV2MainLayer(keras.layers.Layer):
    """Embeddings, encoder, global average pooling and a final layer normalization on the pooled features."""

    config_class = ConvNextV2Config

    def __init__(self, config: ConvNextV2Config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.embeddings = TFConvNextV2Embeddings(config, name="embeddings")
        self.encoder = TFConvNextV2Encoder(config, name="encoder")
        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
        # The encoder output is still NHWC when it is pooled, hence "channels_last".
        self.pooler = keras.layers.GlobalAvgPool2D(data_format="channels_last")

    # NOTE(review): the annotation names `TFBaseModelOutputWithPooling`, but the object returned below is a
    # `TFBaseModelOutputWithPoolingAndNoAttention` - confirm which one is intended.
    @unpack_inputs
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        """Embed, encode and pool `pixel_values`.

        Args:
            pixel_values: Input images; required.
            output_hidden_states: Falls back to `config.output_hidden_states` when `None`.
            return_dict: Falls back to `config.use_return_dict` when `None`.
            training: Forwarded to the embeddings and the encoder.

        Returns:
            A model output object, or the tuple `(last_hidden_state, pooled_output, *hidden_states)` when
            `return_dict` is falsy. Spatial outputs are transposed with `perm=(0, 3, 1, 2)` (NHWC -> NCHW).

        Raises:
            ValueError: If `pixel_values` is `None`.
        """
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values, training=training)
        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        nhwc_features = encoder_outputs[0]
        # Pool while the features are still channels-last, then normalize the pooled vector.
        pooled_output = self.layernorm(self.pooler(nhwc_features))
        # Change to NCHW output format to have uniformity in the modules.
        last_hidden_state = tf.transpose(nhwc_features, perm=(0, 3, 1, 2))

        # Change the other hidden state outputs to NCHW as well.
        if output_hidden_states:
            hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]])

        if not return_dict:
            extra_outputs = hidden_states if output_hidden_states else ()
            return (last_hidden_state, pooled_output) + extra_outputs

        if not output_hidden_states:
            # Nothing was transposed; pass through whatever the encoder reported.
            hidden_states = encoder_outputs.hidden_states
        return TFBaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=hidden_states,
        )

    def build(self, input_shape=None):
        """Build the embeddings, the encoder and the final layer normalization (only once)."""
        if self.built:
            return
        self.built = True

        # (attribute name, shape handed to that sub-layer's `build`).
        build_plan = (
            ("embeddings", None),
            ("encoder", None),
            ("layernorm", [None, self.config.hidden_sizes[-1]]),
        )
        for attr_name, build_shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)
"""
    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
      `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
      `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>
"""
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`ConvNextImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to `True`.
"""
@add_start_docstrings(
    "The bare ConvNextV2 model outputting raw features without any specific head on top.",
    CONVNEXTV2_START_DOCSTRING,
)
"""
class TFConvNextV2Model(TFConvNextV2PreTrainedModel):
    """
    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    """
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPoolingAndNoAttention, Tuple[tf.Tensor]]:
        """
        Process inputs through the TFConvNextV2 model and return outputs.

        Args:
            pixel_values: Input pixel values (image tensors).
            output_hidden_states: Whether to output hidden states.
            return_dict: Whether to return outputs as a dictionary.
            training: Whether the model is in training mode.

        Returns:
            Either a TFBaseModelOutputWithPoolingAndNoAttention object or a tuple with tensors.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Forward pass through TFConvNextV2MainLayer
        outputs = self.convnextv2(
            pixel_values=pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if not return_dict:
            return outputs[:]  # Return all outputs as a tuple

        # Return structured outputs as TFBaseModelOutputWithPoolingAndNoAttention
        return TFBaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=outputs.last_hidden_state,
            pooler_output=outputs.pooler_output,
            hidden_states=outputs.hidden_states,
        )

    def build(self, input_shape=None):
        """
        Build method for TFConvNextV2Model. Checks if model is already built before constructing layers.

        Args:
            input_shape: Shape of the input tensors (not used in this implementation).
        """
        if self.built:
            return
        self.built = True
        if getattr(self, "convnextv2", None) is not None:
            with tf.name_scope(self.convnextv2.name):
                self.convnextv2.build(None)


"""
@add_start_docstrings(
    """
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    CONVNEXTV2_START_DOCSTRING,
)
"""
class TFConvNextV2ForImageClassification(TFConvNextV2PreTrainedModel, TFSequenceClassificationLoss):
    """
    Initialize TFConvNextV2ForImageClassification model.

    Args:
        config: ConvNextV2 configuration object.
        *inputs: Variable length argument list.
        **kwargs: Additional keyword arguments.

    Attributes:
        num_labels: Number of output labels for classification.
        convnextv2: TFConvNextV2MainLayer instance for feature extraction.
        classifier: Dense layer for classification head.
    """
    def __init__(self, config: ConvNextV2Config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels
        self.convnextv2 = TFConvNextV2MainLayer(config, name="convnextv2")

        # Classifier head
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer=keras.initializers.Zeros(),
            name="classifier",
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=TFImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    # 定义模型的调用方法，用于推理过程
    def call(
        self,
        pixel_values: TFModelInputType | None = None,  # 输入像素值，可以是TensorFlow模型输入类型或者None
        output_hidden_states: Optional[bool] = None,   # 是否输出隐藏状态，默认为None，如果为True则输出隐藏状态
        return_dict: Optional[bool] = None,             # 是否返回字典格式的输出，默认为None，如果为True则返回字典
        labels: np.ndarray | tf.Tensor | None = None,   # 图像分类/回归的标签，可以是numpy数组或TensorFlow张量类型，可选
        training: Optional[bool] = False,               # 是否处于训练模式，默认为False
    ) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 根据输入或默认配置确定是否输出隐藏状态
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 根据输入或默认配置确定是否使用字典格式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 如果未提供像素值，则抛出数值错误
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # 调用ConvNextV2模型进行前向传播
        outputs = self.convnextv2(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 根据返回格式选择池化输出
        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        # 使用分类器模型进行分类或回归预测
        logits = self.classifier(pooled_output)

        # 如果提供了标签，则计算损失
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        # 如果不要求字典格式的输出，则返回相应的元组
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 如果要求字典格式的输出，则返回TFImageClassifierOutputWithNoAttention对象
        return TFImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )

    # 构建模型的方法，用于设置模型的构建过程
    def build(self, input_shape=None):
        # 如果模型已经构建过则直接返回
        if self.built:
            return
        self.built = True

        # 如果定义了ConvNextV2模型，则构建ConvNextV2模型
        if getattr(self, "convnextv2", None) is not None:
            with tf.name_scope(self.convnextv2.name):
                self.convnextv2.build(None)

        # 如果定义了分类器模型，则根据配置构建分类器模型
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_sizes[-1]])

`.\models\convnextv2\__init__.py`

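# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this module while preserving other warnings.
# So, don't check this module at all.

# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.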

from typing import TYPE_CHECKING

# 依赖于 isort 来合并导入项
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
    is_tf_available,
)

# Import structure: maps each submodule name to the public names it provides.
# It is handed to `_LazyModule` at the bottom of this file.
_import_structure = {
    "configuration_convnextv2": [
        "CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "ConvNextV2Config",
    ]
}

try:
    # Raise OptionalDependencyNotAvailable when PyTorch is not available.
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Without PyTorch the PyTorch models are simply not registered.
    pass
else:
    # PyTorch is available: register the PyTorch model names.
    _import_structure["modeling_convnextv2"] = [
        "CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ConvNextV2ForImageClassification",
        "ConvNextV2Model",
        "ConvNextV2PreTrainedModel",
        "ConvNextV2Backbone",
    ]

try:
    # Raise OptionalDependencyNotAvailable when TensorFlow is not available.
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Without TensorFlow the TensorFlow models are simply not registered.
    pass
else:
    # TensorFlow is available: register the TensorFlow model names.
    _import_structure["modeling_tf_convnextv2"] = [
        "TFConvNextV2ForImageClassification",
        "TFConvNextV2Model",
        "TFConvNextV2PreTrainedModel",
    ]

if TYPE_CHECKING:
    # Type-checking only: import the names directly so static tools can see them.
    from .configuration_convnextv2 import (
        CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
        ConvNextV2Config,
    )

    try:
        # Raise OptionalDependencyNotAvailable when PyTorch is not available.
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # PyTorch is available: import the PyTorch model definitions.
        from .modeling_convnextv2 import (
            CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST,
            ConvNextV2Backbone,
            ConvNextV2ForImageClassification,
            ConvNextV2Model,
            ConvNextV2PreTrainedModel,
        )

    try:
        # Raise OptionalDependencyNotAvailable when TensorFlow is not available.
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # TensorFlow is available: import the TensorFlow model definitions.
        from .modeling_tf_convnextv2 import (
            TFConvNextV2ForImageClassification,
            TFConvNextV2Model,
            TFConvNextV2PreTrainedModel,
        )

else:
    # At runtime nothing is imported eagerly.
    import sys

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from `_import_structure`
    # (presumably it resolves the listed names on first access - see `_LazyModule` in `...utils`).
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

`.\models\cpm\tokenization_cpm.py`

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
import os
import unicodedata
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm  # 导入 sentencepiece 库

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import SPIECE_UNDERLINE, logging  # 导入特定工具和库


logger = logging.get_logger(__name__)  # logger for this module

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}  # file name of the vocabulary file, keyed by argument name

# Download location of the vocabulary file for each pretrained checkpoint, keyed by argument name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "TsinghuaAI/CPM-Generate": "https://huggingface.co/TsinghuaAI/CPM-Generate/resolve/main/spiece.model",
    }
}


class CpmTokenizer(PreTrainedTokenizer):
    """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models.

    The methods in this class tokenize with a SentencePiece model (`self.sp_model`) and lay sequences out
    as `X <sep> <cls>` / `A <sep> B <sep> <cls>`.

    NOTE(review): no Jieba call is made by this class in this file; the Jieba step is only visible in
    `CpmTokenizerFast._batch_encode_plus`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        remove_space=True,
        keep_accents=False,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        additional_special_tokens=["<eop>", "<eod>"],
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Build the tokenizer and load its SentencePiece model.

        Args:
            vocab_file (`str`):
                Path of the SentencePiece model file; loaded into `self.sp_model`.
            do_lower_case (`bool`, *optional*, defaults to `False`):
                Whether `preprocess_text` lowercases its input.
            remove_space (`bool`, *optional*, defaults to `True`):
                Whether `preprocess_text` strips the text and collapses whitespace runs to one space.
            keep_accents (`bool`, *optional*, defaults to `False`):
                Whether `preprocess_text` keeps combining marks (when `False` they are removed after NFKD).
            bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token (`str`):
                Special tokens, forwarded unchanged to the parent class.
            additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
                Extra special tokens, forwarded to the parent class.
            sp_model_kwargs (`dict`, *optional*):
                Keyword arguments for `spm.SentencePieceProcessor`; `None` means no extra arguments.
        """
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.vocab_file = vocab_file
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents

        # `vocab_size`, `_tokenize` and the id/token converters all read `self.sp_model`, so it has to exist
        # once construction finishes; this mirrors what `__setstate__` does after unpickling.
        # NOTE(review): done before the parent initializer so that a parent-side vocabulary lookup (if any)
        # already finds a loaded model -- confirm against `PreTrainedTokenizer.__init__`.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            # Hand over a copy so the shared default list of the signature can never be mutated downstream.
            additional_special_tokens=(
                list(additional_special_tokens) if additional_special_tokens is not None else None
            ),
            **kwargs,
        )

    @property
    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.vocab_size
    def vocab_size(self):
        """Number of pieces in the loaded SentencePiece model."""
        return len(self.sp_model)

    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.get_vocab
    def get_vocab(self):
        """Return a token -> id dict covering the SentencePiece pieces plus `self.added_tokens_encoder`."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__getstate__
    def __getstate__(self):
        """Return the instance dict with `sp_model` replaced by `None`; `__setstate__` rebuilds it."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__setstate__
    def __setstate__(self, d):
        """Restore the instance dict and reload the SentencePiece model from `self.vocab_file`."""
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    # Same logic as XLNetTokenizer.preprocess_text
    def preprocess_text(self, inputs):
        """
        Normalize raw text before SentencePiece encoding.

        Steps, in order: optional whitespace cleanup (`remove_space`), replacement of the two-character quote
        sequences (two backticks or two single quotes) by `"`, optional removal of combining marks after NFKD
        normalization (`keep_accents=False`), and optional lowercasing (`do_lower_case`).
        """
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize("NFKD", outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    # Same logic as XLNetTokenizer._tokenize
    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string."""
        text = self.preprocess_text(text)
        pieces = self.sp_model.encode(text, out_type=str)
        new_pieces = []
        for piece in pieces:
            # A piece such as "9," (digit followed by a trailing comma) is re-encoded without the comma and
            # the comma is emitted as its own piece.
            if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                # The re-encoding may introduce a leading SPIECE_UNDERLINE that the original piece did not
                # have; remove it so no word boundary is invented.
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    # Same logic as XLNetTokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.PieceToId(token)

    # Same logic as XLNetTokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    # Same logic as XLNetTokenizer.convert_tokens_to_string
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    # Same layout as XLNetTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by appending the special tokens:

        - single sequence: `X <sep> <cls>`
        - pair of sequences: `A <sep> B <sep> <cls>`

        This is the same layout that `create_token_type_ids_from_sequences` assigns segment ids to.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the special tokens appended.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return token_ids_0 + sep + cls
        return token_ids_0 + sep + token_ids_1 + sep + cls

    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). In both cases the
        final position (the `<cls>` slot) gets segment id 2.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls_segment_id = [2]

        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0] + cls_segment_id
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id

    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the SentencePiece model into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory to write into.
            filename_prefix (`str`, *optional*):
                When given, the output file is named `<prefix>-spiece.model`.

        Returns:
            `Tuple[str]`: One-element tuple with the output path. If `save_directory` is not a directory an
            error is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            # The source file still exists and is not already the target: plain copy.
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The source file is gone: serialize the in-memory model instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def _decode(self, *args, **kwargs):
        """Decode with the parent class, drop plain spaces, then map U+2582 to a space and U+2583 to a newline."""
        text = super()._decode(*args, **kwargs)
        text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n")
        return text

`.\models\cpm\tokenization_cpm_fast.py`

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
import os
from shutil import copyfile
from typing import List, Optional, Tuple

from ...tokenization_utils_fast import AddedToken, PreTrainedTokenizerFast
from ...utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# File names of the SentencePiece model and of the serialized fast tokenizer
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}

# Download locations of the vocabulary and tokenizer files, keyed by checkpoint name
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "TsinghuaAI/CPM-Generate": "https://huggingface.co/TsinghuaAI/CPM-Generate/resolve/main/spiece.model",
    },
    "tokenizer_file": {
        "TsinghuaAI/CPM-Generate": "https://huggingface.co/TsinghuaAI/CPM-Generate/resolve/main/tokenizer.json",
    },
}

class CpmTokenizerFast(PreTrainedTokenizerFast):
    """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=False,
        remove_space=True,
        keep_accents=False,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        additional_special_tokens=["<eop>", "<eod>"],
        **kwargs,
    ):
        """
        Build the fast tokenizer and prepare the Jieba pre-tokenization step.

        Args:
            vocab_file (`str`, *optional*):
                Path of the SentencePiece model file. Kept on the instance so that `can_save_slow_tokenizer`
                and `save_vocabulary` can use it.
            tokenizer_file (`str`, *optional*):
                Path of a serialized fast tokenizer, forwarded to the parent class.
            do_lower_case, remove_space, keep_accents (`bool`):
                Text-normalization flags, forwarded to the parent class.
            bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token (`str`):
                Special tokens, forwarded to the parent class.
            additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
                Extra special tokens, forwarded to the parent class.

        Raises:
            ModuleNotFoundError: If the `jieba` package is not installed.
        """
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            # Hand over a copy so the shared default list of the signature can never be mutated downstream.
            additional_special_tokens=(
                list(additional_special_tokens) if additional_special_tokens is not None else None
            ),
            **kwargs,
        )

        # Read by `can_save_slow_tokenizer` and `save_vocabulary`.
        self.vocab_file = vocab_file

        # `_batch_encode_plus` segments text with `self.jieba.cut`; import lazily so the module itself can be
        # imported without jieba installed.
        try:
            import jieba
        except ModuleNotFoundError as error:
            raise ModuleNotFoundError(
                "You need to install jieba to use CpmTokenizerFast. "
                "See https://pypi.org/project/jieba/ for installation."
            ) from error
        self.jieba = jieba
        # Inverse of the replacements done in `_decode`: space -> U+2582, newline -> U+2583.
        self.translator = str.maketrans(" \n", "\u2582\u2583")

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """`True` only when `self.vocab_file` is set and points to an existing file."""
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    # Same layout as XLNetTokenizerFast.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
        concatenating and adding special tokens. An XLNet sequence has the following format:

        - single sequence: `X <sep> <cls>`
        - pair of sequences: `A <sep> B <sep> <cls>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return token_ids_0 + sep + cls
        return token_ids_0 + sep + token_ids_1 + sep + cls

    # Same layout as XLNetTokenizerFast.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). In both cases the
        final position (the `<cls>` slot) gets segment id 2.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls_segment_id = [2]

        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0] + cls_segment_id
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the SentencePiece model file into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory to write into.
            filename_prefix (`str`, *optional*):
                When given, the output file is named `<prefix>-spiece.model`.

        Returns:
            `Tuple[str]`: One-element tuple with the output path. If `save_directory` is not a directory an
            error is logged and `None` is returned instead.

        Raises:
            ValueError: If `can_save_slow_tokenizer` is `False` (no usable `vocab_file`).
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Nothing to do when the model file already is the target file.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
        """
        Segment every text with Jieba, encode spaces/newlines inside each segment through `self.translator`,
        join the segments with single spaces, then defer to the parent implementation.
        """
        batch_text_or_text_pairs = [
            " ".join([x.translate(self.translator) for x in self.jieba.cut(text, cut_all=False)])
            for text in batch_text_or_text_pairs
        ]
        return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)

    def _decode(self, *args, **kwargs):
        """Decode with the parent class, drop plain spaces, then map U+2582 to a space and U+2583 to a newline."""
        text = super()._decode(*args, **kwargs)
        text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n")
        return text

`.\models\cpm\__init__.py`

# TYPE_CHECKING is only true while a static type checker analyses this module
from typing import TYPE_CHECKING

# Optional-dependency exception, lazy-module wrapper and the availability probes used below
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available

# Maps submodule name -> list of public names it provides; filled in below and passed to _LazyModule
_import_structure = {}

# Register the slow tokenizer only when sentencepiece is available.
# The raise/except pair exists solely to skip the `else` branch when it is not.
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # sentencepiece is available: expose CpmTokenizer
    _import_structure["tokenization_cpm"] = ["CpmTokenizer"]

# Register the fast tokenizer only when the tokenizers package is available (same pattern as above)
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # tokenizers is available: expose CpmTokenizerFast
    _import_structure["tokenization_cpm_fast"] = ["CpmTokenizerFast"]

# Under static type checking, import the real classes (guarded by the same availability checks)
if TYPE_CHECKING:
    try:
        # Same sentencepiece guard as above
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # sentencepiece is available: import CpmTokenizer for the type checker
        from .tokenization_cpm import CpmTokenizer

    try:
        # Same tokenizers guard as above
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # tokenizers is available: import CpmTokenizerFast for the type checker
        from .tokenization_cpm_fast import CpmTokenizerFast

# At runtime, no submodule is imported here
else:
    import sys

    # Replace this module's entry in sys.modules with a _LazyModule built from _import_structure
    # (presumably it imports the listed submodules on first attribute access -- see _LazyModule)
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\cpmant\configuration_cpmant.py`

# coding=utf-8
# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CPMAnt model configuration"""

# 从configuration_utils模块导入PretrainedConfig类
from ...configuration_utils import PretrainedConfig
# 从utils模块导入logging函数
from ...utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# Maps each pretrained CPMAnt checkpoint name to the URL of its configuration file
CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "openbmb/cpm-ant-10b": "https://huggingface.co/openbmb/cpm-ant-10b/blob/main/config.json"
    # See all CPMAnt models at https://huggingface.co/models?filter=cpmant
}


class CpmAntConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CpmAntModel`]. It is used to instantiate an
    CPMAnt model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the CPMAnt
    [openbmb/cpm-ant-10b](https://huggingface.co/openbmb/cpm-ant-10b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 30720):
            Size of the vocabulary.
        hidden_size (`int`, *optional*, defaults to 4096):
            Size of the hidden layers.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads.
        dim_head (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        dim_ff (`int`, *optional*, defaults to 10240):
            Size of the feed-forward layer.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of Transformer layers.
        dropout_p (`float`, *optional*, defaults to 0.0):
            Dropout probability; 0.0 means no dropout.
        position_bias_num_buckets (`int`, *optional*, defaults to 512):
            Number of buckets of the position bias.
        position_bias_max_distance (`int`, *optional*, defaults to 2048):
            Maximum distance of the position bias.
        eps (`float`, *optional*, defaults to 1e-06):
            Small constant added inside the normalization layers to avoid division by zero.
        init_std (`float`, *optional*, defaults to 1.0):
            Standard deviation used for parameter initialization.
        prompt_types (`int`, *optional*, defaults to 32):
            Number of prompt types.
        prompt_length (`int`, *optional*, defaults to 32):
            Length of the prompt.
        segment_types (`int`, *optional*, defaults to 32):
            Number of segment types.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to use the key/value cache.
        **kwargs:
            Remaining keyword arguments, forwarded to [`PretrainedConfig`].
    """

    model_type = "cpmant"

    def __init__(
        self,
        vocab_size: int = 30720,
        hidden_size: int = 4096,
        num_attention_heads: int = 32,
        dim_head: int = 128,
        dim_ff: int = 10240,
        num_hidden_layers: int = 48,
        dropout_p: float = 0.0,
        position_bias_num_buckets: int = 512,
        position_bias_max_distance: int = 2048,
        eps: float = 1e-6,
        init_std: float = 1.0,
        prompt_types: int = 32,
        prompt_length: int = 32,
        segment_types: int = 32,
        use_cache: bool = True,
        **kwargs,
    ):
        # Let the parent class consume the generic keyword arguments first.
        super().__init__(**kwargs)
        # Prompt-specific settings
        self.prompt_types = prompt_types
        self.prompt_length = prompt_length
        self.segment_types = segment_types
        # Generic Transformer settings
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.dim_head = dim_head
        self.dim_ff = dim_ff
        self.num_hidden_layers = num_hidden_layers
        self.position_bias_num_buckets = position_bias_num_buckets
        self.position_bias_max_distance = position_bias_max_distance
        self.dropout_p = dropout_p
        self.eps = eps
        self.use_cache = use_cache
        self.vocab_size = vocab_size
        self.init_std = init_std

`.\models\cpmant\modeling_cpmant.py`

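# File encoding: UTF-8
# Copyright notice: states who owns the copyright of this source file
# Licensed under the Apache License, Version 2.0; this file may not be used except in compliance with the License
# A copy of the License is available at: http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" basis, without warranties or conditions of any kind, either express or implied
# See the License for details on the permissions granted.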
""" PyTorch CPMAnt"""

import math  # 导入数学库，用于执行数学运算
from typing import List, Optional, Tuple, Union  # 引入类型提示的相关类

import torch  # 导入PyTorch深度学习库
import torch.nn.functional as F  # 导入PyTorch中的函数库
import torch.utils.checkpoint  # 导入PyTorch的checkpoint工具
from torch import nn  # 从PyTorch导入神经网络模块
from torch.nn import CrossEntropyLoss  # 导入交叉熵损失函数

from ...activations import ACT2FN  # 从上层目录中导入激活函数
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast  # 从模型输出中导入基础模型输出和有过去上下文的因果语言建模输出
from ...modeling_utils import PreTrainedModel  # 从模型工具中导入预训练模型类
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging  # 从工具类中导入文档字符串添加工具和日志记录工具
from .configuration_cpmant import CpmAntConfig  # 从当前目录导入CPMAnt模型的配置类


logger = logging.get_logger(__name__)  # module-level logger

_CHECKPOINT_FOR_DOC = "openbmb/cpm-ant-10b"  # checkpoint name used in documentation
_CONFIG_FOR_DOC = "CpmAntConfig"  # configuration class name used in documentation

CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "openbmb/cpm-ant-10b",  # pretrained CPMAnt checkpoint
    # See all CPMAnt models at https://huggingface.co/models?filter=cpmant
]


class CpmAntLayerNorm(nn.Module):
    """
    Root Mean Square (RMS) layer normalization, see https://arxiv.org/abs/1910.07467 for details.
    """

    def __init__(self, config: CpmAntConfig):
        super().__init__()

        # Expected size of the last input dimension, and of the learned scale vector.
        self.dim_norm = config.hidden_size
        # Added to the mean square before the reciprocal square root.
        self.eps = config.eps
        # Left uninitialized here (`torch.empty`); it must be filled before use.
        self.weight = nn.Parameter(torch.empty(config.hidden_size))

    def forward(self, hidden_states: torch.Tensor):
        """
        Scale `hidden_states` by the reciprocal root mean square of its last dimension, then by `self.weight`.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)

        Raises:
            AssertionError: If the last dimension of `hidden_states` differs from `self.dim_norm`.
        """
        if hidden_states.size(-1) != self.dim_norm:
            raise AssertionError("hidden_states.size(-1) != self.dim_norm")

        input_dtype = hidden_states.dtype
        # The statistic is computed in float32 regardless of the input dtype.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        # Cast back to the input dtype before applying the learned scale.
        normalized = (hidden_states * inv_rms).to(input_dtype)
        return normalized * self.weight


class CpmAntAttention(nn.Module):
    """Holds the projection layers of a multi-head attention module (only the constructor is defined here)."""

    def __init__(self, config: CpmAntConfig):
        """Create the bias-free q/k/v and output projections, the softmax and the optional dropout layer."""
        super().__init__()
        self.dim_model = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.dim_head = config.dim_head

        # Width of the concatenated heads: num_heads * dim_head.
        inner_dim = self.num_heads * self.dim_head

        # Query, key and value projections: dim_model -> inner_dim.
        self.project_q = nn.Linear(self.dim_model, inner_dim, bias=False)
        self.project_k = nn.Linear(self.dim_model, inner_dim, bias=False)
        self.project_v = nn.Linear(self.dim_model, inner_dim, bias=False)

        # Output projection: inner_dim -> dim_model.
        self.attention_out = nn.Linear(inner_dim, self.dim_model, bias=False)

        # Softmax over the last dimension.
        self.softmax = torch.nn.Softmax(dim=-1)

        # A dropout layer is created for any non-None probability, including 0.0.
        dropout_p = config.dropout_p
        self.dropout = None if dropout_p is None else torch.nn.Dropout(p=dropout_p)
class CpmAntSelfAttentionBlock(nn.Module):
    """Pre-normalized self-attention with optional dropout and a residual connection."""

    def __init__(self, config: CpmAntConfig):
        super().__init__()
        # Normalization applied to the input before attention.
        self.layernorm_before_attention = CpmAntLayerNorm(config)
        # The attention module itself.
        self.self_attention = CpmAntAttention(config)
        # Dropout only exists for a truthy probability (so neither for None nor for 0.0).
        self.dropout = torch.nn.Dropout(config.dropout_p) if config.dropout_p else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
    ):
        """
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Input of the self-attention block; it can be the raw embedding of a batch of sequences.
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Mask that keeps invalid areas out of the self-attention computation.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Positional information provided to the self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attention tensors of all attention layers.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up
                decoding (see `past_key_values`).

        Returns:
            A `(hidden_states, attn_weights, current_key_value)` tuple; the last two items are passed through
            unchanged from the attention module.
        """
        normed = self.layernorm_before_attention(hidden_states)
        # The normalized input serves as both the query source and the key/value source.
        attn_output, attn_weights, current_key_value = self.self_attention(
            normed, normed, attention_mask, position_bias, output_attentions, past_key_values, use_cache
        )

        if self.dropout is not None:
            attn_output = self.dropout(attn_output)

        # Residual connection around the whole block.
        return hidden_states + attn_output, attn_weights, current_key_value


class CpmAntDenseGatedACT(nn.Module):
    """Gated projection: `GELU(w_0(x)) * w_1(x)`, mapping `hidden_size` to `dim_ff`."""

    def __init__(self, config: CpmAntConfig):
        super().__init__()
        # Two parallel bias-free projections: w_0 feeds the gate, w_1 carries the values.
        self.w_0 = nn.Linear(config.hidden_size, config.dim_ff, bias=False)
        self.w_1 = nn.Linear(config.hidden_size, config.dim_ff, bias=False)
        self.act = torch.nn.GELU()

    def forward(self, hidden_states: torch.Tensor):
        """Transform an input tensor from one feature space to another via a nonlinear operation

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        """
        # Activated gate times linear values, element-wise.
        return self.act(self.w_0(hidden_states)) * self.w_1(hidden_states)
class CpmAntTransformerBlock(nn.Module):
    """One Transformer layer: a self-attention block followed by a feed-forward block."""

    def __init__(self, config: CpmAntConfig):
        super().__init__()
        self.self_att = CpmAntSelfAttentionBlock(config)
        self.ffn = CpmAntFFNBlock(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
    ):
        """
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up
                decoding (see `past_key_values`).

        Returns:
            A `(hidden_states, attn_weights, current_key_value)` tuple: the feed-forward output plus the
            attention weights and key/value states returned by the self-attention block.
        """
        # The self-attention block returns a 3-tuple; only the hidden states go on to the feed-forward block.
        hidden_states, attn_weights, current_key_value = self.self_att(
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
            past_key_values=past_key_values,
            use_cache=use_cache,
        )

        hidden_states = self.ffn(hidden_states)

        return hidden_states, attn_weights, current_key_value
class CpmAntEncoder(nn.Module):
    """Stack of `CpmAntTransformerBlock` layers followed by a final `CpmAntLayerNorm`."""

    def __init__(self, config: CpmAntConfig):
        super().__init__()
        # Number of transformer blocks, taken from the configuration.
        self.num_layers = config.num_hidden_layers
        blocks = [CpmAntTransformerBlock(config) for _ in range(self.num_layers)]
        self.layers = nn.ModuleList(blocks)

        # Normalization applied once, after the last block.
        self.output_layernorm = CpmAntLayerNorm(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_bias: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
    ):
        """
        Run the input through every layer in order, then through the output layer norm.

        Args:
            hidden_states (`torch.Tensor`):
                Input to the encoder, of shape `(batch, seq_len, dim_model)`.
            attention_mask (`torch.Tensor`):
                Mask that keeps invalid positions out of the attention computation, of shape
                `(batch, seq_len, seq_len)`.
            position_bias (`torch.Tensor`):
                Positional information for the attention mechanism, of shape `(num_heads, seq_len, seq_len)`.
            output_attentions (`bool`, *optional*):
                Whether to collect the attention weights of every layer.
            output_hidden_states (`bool`, *optional*):
                Whether to collect the hidden states fed to every layer plus the final normalized output.
            past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
                Cached key/value projection states; entry `i` is handed to layer `i`.
            use_cache (`bool`, *optional*):
                If truthy, the per-layer key/value states are collected so they can be reused for decoding.

        Returns:
            `tuple`: `(hidden_states, current_key_values, all_hidden_states, all_self_attns)`. Each of the last
            three entries is `None` when the corresponding flag is falsy.
        """
        # Each accumulator is a tuple only when its flag is truthy; otherwise it stays None.
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        current_key_values = () if use_cache else None

        for layer_idx, block in enumerate(self.layers):
            if output_hidden_states:
                # Record the state that goes *into* this layer.
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Pick this layer's cache entry, if a non-empty cache was supplied.
            layer_past = past_key_values[layer_idx] if past_key_values else None
            hidden_states, attn_weights, layer_key_value = block(
                hidden_states,
                attention_mask,
                position_bias,
                output_attentions=output_attentions,
                past_key_values=layer_past,
                use_cache=use_cache,
            )

            if output_attentions:
                all_self_attns = all_self_attns + (attn_weights,)
            if layer_key_value is not None:
                current_key_values = current_key_values + (layer_key_value,)

        hidden_states = self.output_layernorm(hidden_states)

        if output_hidden_states:
            # The normalized final state closes the list of collected states.
            all_hidden_states = all_hidden_states + (hidden_states,)

        return hidden_states, current_key_values, all_hidden_states, all_self_attns
# Adapted from transformers.models.bert.modeling_bert.BertIntermediate (Bert -> CPMAnt)
class CpmAntIntermediate(nn.Module):
    """Linear projection from `hidden_size` to `intermediate_size` followed by an activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # `hidden_act` is either a key into ACT2FN or the activation callable itself.
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the dense layer, then the activation, and return the result."""
        return self.intermediate_act_fn(self.dense(hidden_states))


class CpmAntSegmentPositionEmbedding(nn.Module):
    """
    Attention bias looked up from one learned table of shape
    `(segment_types * segment_types + position_bias_num_buckets, num_attention_heads)`.

    For a query/key pair whose segment ids are equal, the table row is the bucketed relative position
    (rows `0 .. num_buckets - 1`). For a pair with different segment ids, the row is
    `num_buckets + query_segment * segment_types + key_segment`.
    """

    def __init__(self, config: "CpmAntConfig"):
        super().__init__()

        # Number of heads, number of position buckets, maximum bucketed distance and number of segment types.
        self.num_heads = config.num_attention_heads
        self.num_buckets = config.position_bias_num_buckets
        self.max_distance = config.position_bias_max_distance
        self.num_segments = config.segment_types

        # Allocated uninitialized (`torch.empty`); `CpmAntPreTrainedModel._init_weights` fills it.
        self.relative_attention_bias = nn.Parameter(
            torch.empty(
                config.segment_types * config.segment_types + config.position_bias_num_buckets,
                config.num_attention_heads,
            )
        )

    def forward(
        self,
        key_pos: torch.Tensor,
        query_pos: torch.Tensor,
        key_segment: torch.Tensor,
        query_segment: torch.Tensor,
    ):
        """
        Compute the per-head attention bias for every query/key pair.

        Args:
            key_pos (`torch.Tensor`): Key positions; `size(0)` is the batch size and `size(1)` the key length.
            query_pos (`torch.Tensor`): Query positions; `size(1)` is the query length.
            key_segment (`torch.Tensor`): Segment id of every key, with `size(1)` equal to the key length.
            query_segment (`torch.Tensor`): Segment id of every query, with `size(1)` equal to the query length.

        Returns:
            `torch.Tensor`: Bias of shape `(batch, num_heads, len_q, len_k)`.

        Raises:
            AssertionError: If the batch sizes of `key_pos` and `query_pos` differ, or if a segment tensor's
                length does not match the corresponding position tensor.
        """
        # The bucket indices are integer bookkeeping; only the embedding lookup below needs gradients.
        with torch.no_grad():
            batch = key_pos.size(0)
            keylen = key_pos.size(1)
            querylen = query_pos.size(1)

            if key_pos.size(0) != query_pos.size(0):
                raise AssertionError(
                    f"key_pos.size(0) should be equal to query_pos.size(0), but got {key_pos.size(0)} and {query_pos.size(0)}!"
                )
            # Key and query lengths are validated separately so that each failure reports the
            # tensor that is actually wrong.
            if keylen != key_segment.size(1):
                raise AssertionError(
                    f"keylen should be equal to key_segment.size(1), but got {keylen} and {key_segment.size(1)}!"
                )
            if querylen != query_segment.size(1):
                raise AssertionError(
                    f"querylen should be equal to query_segment.size(1), but got {querylen} and {query_segment.size(1)}!"
                )

            # Keys vary along the last axis, queries along the middle axis, so `==` and `*`/`+`
            # below broadcast to (batch, len_q, len_k).
            key_pos = key_pos.view(batch, -1, keylen)
            query_pos = query_pos.view(batch, querylen, -1)
            key_segment = key_segment.view(batch, -1, keylen)
            query_segment = query_segment.view(batch, querylen, -1)

            # Rows for cross-segment pairs live after the `num_buckets` position rows.
            relative_position_bucket = self._segment_relative_position_bucket(query_segment, key_segment)
            relative_position_bucket = relative_position_bucket + self.num_buckets

            # (len_q, len_k): bucket of (key index - query index), built from `arange` over the lengths.
            absolute_position_bucket = self._position_bucket(
                torch.arange(keylen, dtype=torch.int32, device=relative_position_bucket.device)[None, :]
                - torch.arange(querylen, dtype=torch.int32, device=relative_position_bucket.device)[:, None],
                num_buckets=self.num_buckets,
                max_distance=self.max_distance,
            )
            # Same segment -> positional bucket; different segments -> segment-pair bucket.
            relative_position_bucket = torch.where(
                (key_segment == query_segment),
                absolute_position_bucket[None, :, :],
                relative_position_bucket,
            )

        # (batch, len_q, len_k, num_heads)
        embeds = F.embedding(relative_position_bucket, self.relative_attention_bias)
        # (batch, num_heads, len_q, len_k)
        embeds = embeds.permute(0, 3, 1, 2).contiguous()
        return embeds

    def _segment_relative_position_bucket(self, query_segment, key_segment):
        """Return one index per (query segment, key segment) pair: `query * num_segments + key`."""
        return query_segment * self.num_segments + key_segment

    def _position_bucket(self, relative_position, num_buckets=32, max_distance=128):
        """
        Map signed relative positions to bucket indices in `[0, num_buckets)`.

        Half of the buckets serve positive offsets and half serve the rest. Within each half the first
        `max_exact` buckets hold exact offsets; larger offsets are spread logarithmically up to
        `max_distance` and clamped to the last bucket of the half.
        """
        # Always bidirectional: split the buckets between the two signs.
        num_buckets //= 2
        relative_buckets = (relative_position > 0).to(torch.int32) * num_buckets
        relative_position = torch.abs(relative_position)
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact
        # Computed for every element; the values for small offsets are discarded by `torch.where` below.
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.int32)
        # Offsets beyond `max_distance` share the last bucket.
        relative_position_if_large = torch.min(
            relative_position_if_large,
            torch.full_like(relative_position_if_large, num_buckets - 1),
        )
        relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_position_if_large)
        return relative_buckets
# Adapted from transformers.models.bert.modeling_bert.BertOutput (Bert -> CPMAnt)
class CpmAntOutput(nn.Module):
    """Project back from `intermediate_size` to `hidden_size`, apply dropout, add a residual and normalize."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        # Submodules are created in the order dense -> LayerNorm -> dropout.
        self.dense = nn.Linear(config.intermediate_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dropout(self.dense(hidden_states))
        # `input_tensor` is the residual branch added before normalization.
        return self.LayerNorm(projected + input_tensor)


class CpmAntPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = CpmAntConfig
    # Attribute name / key prefix under which the base model is stored.
    base_model_prefix = "cpmant"

    def _init_weights(self, module):
        """Initialize the parameters of `module` according to its type; other module types are left untouched."""
        if isinstance(module, nn.Linear):
            # Normal weights, zero bias (when a bias exists).
            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            # Normal weights, with the padding row (if any) reset to zero.
            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # Identity affine transform: zero bias, unit scale.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if isinstance(module, CpmAntLayerNorm):
            # Only a weight is reset here, to 1.0.
            module.weight.data.fill_(1.0)
            return

        if isinstance(module, CpmAntSegmentPositionEmbedding):
            # The bias table is created with `torch.empty`, so it is given normal values here.
            module.relative_attention_bias.data.normal_(mean=0.0, std=self.config.init_std)


# Shared class-level documentation text. It is passed to `add_start_docstrings` on the CPMAnt model
# classes below, so it is a runtime string and is kept verbatim.
# NOTE(review): the `config` entry ends mid-sentence ("all the parameters of the") — looks like a word
# such as "model." is missing; confirm before changing the rendered documentation.
CPMANT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters
        config ([`~CpmAntConfig`]): Model configuration class with all the parameters of the
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared documentation of the `forward` arguments. It is passed to `add_start_docstrings_to_model_forward`
# on the `forward` methods below, so it is a runtime string and is kept verbatim.
# NOTE(review): the argument descriptions inside the string are written in Chinese and start with `#`.
# They are part of the string, not Python comments, so they presumably show up as-is in the generated
# docs — confirm whether that is intended.
CPMANT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            # 输入序列标记在词汇表中的索引。

            # 可以使用 `CPMAntTokenizer` 获得这些索引。参见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__` 获取更多细节。

            # [什么是输入ID？](../glossary#input-ids)

        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            # 包含预先计算的隐藏状态（自注意力块和交叉注意力块中的键和值），可以用于加速序列解码。

            # 当 `use_cache=True` 或 `config.use_cache=True` 时返回。

        use_cache (`bool`, *optional*):
            # 如果设置为 `True`，则返回 `past_key_values` 中的键值状态，可用于加速解码。

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。

        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。

        return_dict (`bool`, *optional*):
            # 是否返回一个 `~utils.ModelOutput` 而不是一个普通的元组。
"""
@add_start_docstrings(
    """
    The CPMAnt Model outputting raw hidden-states without any specific head on top.
    """,
    CPMANT_START_DOCSTRING,
)
class CpmAntModel(CpmAntPreTrainedModel):
    """
    CPMAnt model without an output head; subclass of `CpmAntPreTrainedModel`.
    """

    def __init__(self, config: CpmAntConfig):
        """
        Build the encoder, the embeddings and the position bias from `config`.
        """
        super().__init__(config)
        # Transformer encoder stack.
        self.encoder = CpmAntEncoder(config)
        # One embedding vector per segment type.
        self.segment_embedding = nn.Embedding(config.segment_types, config.hidden_size)
        # Token embedding; the table has `prompt_types * prompt_length` rows in addition to the vocabulary.
        self.input_embedding = nn.Embedding(
            config.vocab_size + config.prompt_types * config.prompt_length, config.hidden_size
        )
        # Segment-aware relative position bias for attention.
        self.position_bias = CpmAntSegmentPositionEmbedding(config)
        # Kept as attributes for use when building inputs and masks.
        self.prompt_length = config.prompt_length
        self.vocab_size = config.vocab_size

        # Run the post-construction hook of the base class.
        self.post_init()

    def get_input_embeddings(self):
        """
        Return the token embedding layer.
        """
        return self.input_embedding

    def set_input_embeddings(self, embeddings, **kwargs):
        """
        Replace the token embedding layer. Extra keyword arguments are accepted and ignored.
        """
        self.input_embedding = embeddings

    def _prepare_attention_mask(self, input_ids, span, context, length):
        """
        Build the boolean attention mask of shape `(batch, seqlen, seqlen)`.

        Args:
            input_ids: Tensor whose first two sizes give `batch` and `seqlen`, and whose device is used.
            span: Per-position span ids, indexed as `(batch, seqlen)`; attention is limited to equal ids.
            context: Per-position boolean flags, indexed as `(batch, seqlen)`.
            length: Per-sample count of valid positions, indexed as `(batch,)`.

        Returns:
            Boolean tensor where entry `[b, i, j]` tells whether query `i` may attend to key `j`.
        """
        batch = input_ids.size(0)
        seqlen = input_ids.size(1)
        device = input_ids.device

        # directional_mask_2d[i, j] is True when j <= i (lower-triangular).
        directional_mask_2d = torch.arange(seqlen, device=device) <= torch.arange(seqlen, device=device).view(-1, 1)
        # A key is visible if it is a context position, or if the query is not a context position
        # and the key does not come after the query.
        attention_mask = context[:, None, :] | (
            context[:, :, None].logical_not() & directional_mask_2d.view(1, seqlen, seqlen)
        )
        # Restrict attention to positions that share the same span id.
        attention_mask = attention_mask & (span[:, None, :] == span[:, :, None])

        # Left-padding mask: over the non-prompt part, only the last `length` positions are valid
        # (the reversed index is compared against `length`).
        mask_1d = (
            torch.tensor(list(range(seqlen - self.prompt_length))[::-1], device=device)[None, :].repeat(batch, 1)
            < length[:, None]
        )
        # The first `prompt_length` positions are always valid.
        mask_1d = torch.cat((torch.ones(batch, self.prompt_length, device=device).bool(), mask_1d), dim=1)
        # Both the query and the key position must be valid.
        attention_mask = mask_1d.view(batch, seqlen, 1) & mask_1d.view(batch, 1, seqlen) & attention_mask

        return attention_mask

    @add_start_docstrings_to_model_forward(CPMANT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ):
        """
        Forward pass of the CPMAnt model.

        Args:
            input_ids (`torch.Tensor`, *optional*): Input token ids. Defaults to `None`.
            output_attentions (`bool`, *optional*): Whether to return attention weights. Defaults to `None`.
            output_hidden_states (`bool`, *optional*): Whether to return hidden states. Defaults to `None`.
            past_key_values (`Tuple[Tuple[torch.Tensor]]`, *optional*): Cached key/value states. Defaults to `None`.
            use_cache (`bool`, *optional*): Whether to use the key/value cache. Defaults to `None`.
            return_dict (`bool`, *optional*): Whether to return a model-output object. Defaults to `None`.
            **kwargs: Additional keyword arguments.
        """
        # NOTE(review): the body is empty in this listing, so the method returns None as written;
        # the actual forward computation is not shown here.
        pass


@add_start_docstrings(
    """
    The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
    """,
    CPMANT_START_DOCSTRING,
)
class CpmAntForCausalLM(CpmAntPreTrainedModel):
    """
    CPMAnt model for causal language modeling; subclass of `CpmAntPreTrainedModel`.
    """

    # Parameter names declared as tied to other weights.
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: CpmAntConfig):
        super().__init__(config)
        # Base model producing the hidden states.
        self.cpmant = CpmAntModel(config)

        # lm_head.weight is tied to cpmant.input_embedding.weight, hence the matching output size
        # (`vocab_size + prompt_types * prompt_length`) and the absence of a bias.
        self.lm_head = nn.Linear(
            config.hidden_size, config.vocab_size + config.prompt_types * config.prompt_length, bias=False
        )
        # Run the post-construction hook of the base class.
        self.post_init()

    @add_start_docstrings_to_model_forward(CPMANT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        attention_mask: Optional[torch.Tensor] = None,  # dummy parameter for the text-generation pipeline
        **kwargs,
    ):
        # NOTE(review): the body is empty in this listing, so the method returns None as written;
        # the actual forward computation is not shown here.
        pass

    def get_input_embeddings(self):
        """Return the token embedding layer of the wrapped base model."""
        return self.cpmant.input_embedding

    def set_input_embeddings(self, embeddings):
        """Replace the token embedding layer of the wrapped base model."""
        self.cpmant.input_embedding = embeddings

    def get_output_embeddings(self):
        """Return the output projection (`lm_head`)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the output projection (`lm_head`)."""
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """
        Assemble the keyword arguments for one generation step.

        Returns a dict with `input_ids` (cast to int32), `use_cache` (required in `kwargs`; a missing key
        raises `KeyError`) and `past_key_values` (`None` when absent).
        """
        input_ids = input_ids.int()
        # Any incoming attention mask is swapped for a tiny placeholder in the local `kwargs` only;
        # it is not part of the returned dict.
        if "attention_mask" in kwargs:
            kwargs["attention_mask"] = torch.zeros(1, 1)

        return {
            "input_ids": input_ids,
            "use_cache": kwargs["use_cache"],
            "past_key_values": kwargs.get("past_key_values", None),
        }

    def _reorder_cache(self, past_key_values, beam_idx):
        """
        Reorder the cached key and value tensors of every layer along their first dimension with `beam_idx`.

        Each non-`None` layer entry is converted to a list so that its two elements can be reassigned.
        """
        past_key_values = [list(each) if each is not None else each for each in past_key_values]
        for key_value_layer in past_key_values:
            key_value_layer[0] = key_value_layer[0][beam_idx]
            key_value_layer[1] = key_value_layer[1][beam_idx]
        return past_key_values

`.\models\cpmant\tokenization_cpmant.py`

# 设置文件编码为 UTF-8
# 版权声明及许可信息
#
# 根据 Apache 许可证 2.0 版本进行许可，除非符合许可证中的规定，否则不得使用此文件。
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则根据许可证分发的软件是按“原样”基础分发的，
# 没有任何明示或暗示的保证或条件。请参阅许可证获取具体语言的权限和限制。
"""CPMAnt 的标记化类。"""
# 导入必要的库
import collections
import os
from typing import List, Optional, Tuple

# 导入条件依赖库
from transformers.utils import is_jieba_available, requires_backends

# 如果 jieba 库可用，则导入
if is_jieba_available():
    import jieba

# 导入通用工具函数和日志记录
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# File name used for each vocabulary resource; `save_vocabulary` reads the "vocab_file" entry.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Vocabulary file URL for each pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "openbmb/cpm-ant-10b": "https://huggingface.co/openbmb/cpm-ant-10b/blob/main/vocab.txt",
    },
}

# Per-checkpoint size, exposed by the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "openbmb/cpm-ant-10b": 1024,
}

def load_vocab(vocab_file):
    """Read a vocabulary file (one token per line) into an ordered `token -> line index` mapping."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for line_no, line in enumerate(reader):
            # Only the trailing newline is stripped; other whitespace is part of the token.
            vocab[line.rstrip("\n")] = line_no
    return vocab

class WordpieceTokenizer:
    """Greedy longest-match-first splitter of a single word into vocabulary pieces."""

    def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, token):
        """
        Split `token` into a list of pieces.

        At each position the longest substring found in the vocabulary is taken. A character that starts
        no known substring yields one unknown token and is skipped. A word longer than
        `max_input_chars_per_word` characters yields a single unknown token.
        """
        chars = list(token)
        total = len(chars)
        if total > self.max_input_chars_per_word:
            return [self.unk_token]

        pieces = []
        pos = 0
        while pos < total:
            # Try the longest candidate first and shrink it from the right.
            for stop in range(total, pos, -1):
                candidate = "".join(chars[pos:stop])
                if candidate in self.vocab:
                    pieces.append(candidate)
                    pos = stop
                    break
            else:
                # Nothing starting at `pos` is in the vocabulary.
                pieces.append(self.unk_token)
                pos += 1

        return pieces

class CpmAntTokenizer(PreTrainedTokenizer):
    """
    Construct a CPMAnt tokenizer. Text is first segmented with jieba, then every segment is split into
    vocabulary pieces by greedy longest-match lookup (`WordpieceTokenizer`).

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
        padding_side (`str`, *optional*, defaults to `"left"`):
            Padding side, forwarded to the base class.
    """

    # Class-level configuration: file names, download map and input names.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        # jieba is required for `_tokenize`.
        requires_backends(self, ["jieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        # Inside the tokenizer the space and newline characters themselves are the keys; the
        # placeholder tokens from the vocabulary file are removed after their ids are reused.
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        # Keep the mapping ordered by id and build the reverse (id -> token) mapping.
        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        # Shares the `self.encoder` object as its vocabulary.
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        # The vocabulary is set up before the base-class initializer runs.
        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def bod_token_id(self):
        """Id of the beginning-of-document token."""
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        """Id of the end-of-document token."""
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        """Id of the newline character."""
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        """Number of entries in the base vocabulary (added tokens excluded)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the base vocabulary merged with the added tokens."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string: jieba segmentation in precise mode, then wordpiece splitting of each segment."""
        output_tokens = []
        for x in jieba.cut(text, cut_all=False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens

    def _decode(self, token_ids, **kwargs):
        """Decode ids into a string after dropping negative ids and the pad, eos and bos ids."""
        token_ids = [i for i in token_ids if i >= 0]
        token_ids = [
            x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
        ]
        return super()._decode(token_ids, **kwargs)

    def check(self, token):
        """Return whether `token` is in the base vocabulary."""
        return token in self.encoder

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join tokens into a string without any separator."""
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Convert a token (str) to an id using the vocab; unknown tokens map to the id of `unk_token`."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Convert an id (int) to a token (str) using the vocab; unknown ids map to `unk_token`."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary to disk, one token per line in id order.

        Args:
            save_directory (`str`):
                An existing directory, in which the standard vocabulary file name is used, or otherwise
                the path of the file to write.
            filename_prefix (`str`, *optional*):
                Prefix, followed by `-`, prepended to the file name (or to the path, when
                `save_directory` is not a directory).

        Returns:
            `Tuple[str]`: One-element tuple with the path of the written file.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory

        # Work on a copy: `self.encoder` is also the vocabulary of `self.wordpiece_tokenizer` and is read
        # by `newline_id`, so the tokenizer must stay usable after saving.
        vocab = dict(self.encoder)
        # On disk, space and newline are written as their placeholder tokens.
        # NOTE(review): the placeholders are hard-coded to the default `space_token` / `line_token`
        # values — confirm whether custom values passed to `__init__` should be used instead.
        if " " in vocab:
            vocab["</_>"] = vocab.pop(" ")
        if "\n" in vocab:
            vocab["</n>"] = vocab.pop("\n")

        index = 0
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(vocab.items(), key=lambda item: item[1]):
                # Each token is written on its own line, so a gap in the ids means line numbers
                # will no longer match the ids when the file is reloaded.
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return (vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence that special tokens will be added.
            token_ids_1 (`List[int]`): The optional second tokenized sequence that special tokens will be added.

        Returns:
            `List[int]`: The model input with special tokens.
        """
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0
        # For a pair, each sequence is preceded by its own BOS token.
        return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`): List of IDs.
            token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        # Inputs that already contain special tokens are handled by the base class.
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Mirrors `build_inputs_with_special_tokens`: one BOS (1) in front of each sequence (0s).
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))

`.\models\cpmant\__init__.py`

# flake8: noqa
# 禁止 flake8 对当前模块执行检查，以避免 "F401 '...' imported but unused" 警告。

# Copyright 2022 The HuggingFace Team and The OpenBMB Team. All rights reserved.
# 版权声明，保留所有权利。

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache 许可证 2.0 版本授权。

# you may not use this file except in compliance with the License.
# 除非符合许可证要求，否则不得使用本文件。

# You may obtain a copy of the License at
# 您可以在以下网址获取许可证副本：

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# 根据适用法律或书面同意，软件

# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 在 "AS IS" 基础上分发，不提供任何担保或条件，无论是明示的还是隐含的。

# See the License for the specific language governing permissions and
# limitations under the License.
# 请查阅许可证了解具体的语言授权和限制。

from typing import TYPE_CHECKING

# rely on isort to merge the imports
# 使用 isort 来合并导入项

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available

# Maps each submodule name to the public names it provides; handed to `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_cpmant": ["CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CpmAntConfig"],
    "tokenization_cpmant": ["CpmAntTokenizer"],
}

# The modeling classes are only advertised when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Register the model-related names in `_import_structure`.
    _import_structure["modeling_cpmant"] = [
        "CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "CpmAntForCausalLM",
        "CpmAntModel",
        "CpmAntPreTrainedModel",
    ]

if TYPE_CHECKING:
    # Under static type checking, import the names eagerly so that checkers can resolve them.
    from .configuration_cpmant import CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP, CpmAntConfig
    from .tokenization_cpmant import CpmAntTokenizer

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Model classes, imported only when torch is available.
        from .modeling_cpmant import (
            CPMANT_PRETRAINED_MODEL_ARCHIVE_LIST,
            CpmAntForCausalLM,
            CpmAntModel,
            CpmAntPreTrainedModel,
        )

else:
    import sys

    # At runtime, replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    # NOTE(review): presumably this defers the submodule imports until first attribute access - confirm in `_LazyModule`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\ctrl\configuration_ctrl.py`

# coding=utf-8
# Copyright 2018 Salesforce and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Salesforce CTRL configuration

This module defines the configuration class `CTRLConfig` for the CTRL model. It provides a mapping of pretrained model names
to their corresponding configuration files.

The `CTRLConfig` class inherits from `PretrainedConfig` and defines parameters that control the architecture and behavior
of the CTRL model. It provides defaults that align with the Salesforce/ctrl architecture.

For more details on how to use configuration objects like `CTRLConfig` to instantiate CTRL models, refer to the
documentation of `PretrainedConfig`.
"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Mapping from pretrained model names to their configuration file URLs
CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Salesforce/ctrl": "https://huggingface.co/Salesforce/ctrl/resolve/main/config.json"
}

class CTRLConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a `CTRLModel` or a `TFCTRLModel`.
    It defines parameters that control the architecture and behavior of the model when instantiated.
    Instantiating a configuration with the defaults will yield a similar configuration to that of
    the Salesforce/ctrl architecture from SalesForce.

    Configuration objects inherit from `PretrainedConfig` and can be used to control the model outputs.
    For more detailed information about configuring CTRL models, refer to the documentation of `PretrainedConfig`.

    Args:
        vocab_size (`int`, *optional*, defaults to 246534):
            Vocabulary size.
        n_positions (`int`, *optional*, defaults to 256):
            Maximum sequence length.
        n_embd (`int`, *optional*, defaults to 1280):
            Dimensionality of the embeddings and hidden states.
        dff (`int`, *optional*, defaults to 8192):
            Dimensionality of the inner layer of the feed-forward networks.
        n_layer (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 16):
            Number of attention heads in each attention layer of the Transformer encoder.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            Dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            Dropout ratio for the embeddings.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
            Epsilon used in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated-normal initializer used for all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions.
        **kwargs:
            Any additional keyword arguments; forwarded unchanged to `PretrainedConfig.__init__`.
    """

    # Identifier of this model family.
    model_type = "ctrl"

    # Output keys that are ignored at inference time.
    keys_to_ignore_at_inference = ["past_key_values"]

    # Generic attribute name -> CTRL-specific attribute that actually stores the value.
    attribute_map = {
        "max_position_embeddings": "n_positions",
        "hidden_size": "n_embd",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=246534,
        n_positions=256,
        n_embd=1280,
        dff=8192,
        n_layer=48,
        n_head=16,
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        use_cache=True,
        **kwargs,
    ):
        # Store every hyper-parameter on the instance under its CTRL-specific name.
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.dff = dff
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

        self.use_cache = use_cache

        # Remaining keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)

`.\models\ctrl\modeling_ctrl.py`

# coding=utf-8
# 设置编码格式为 UTF-8

# Copyright 2018 Salesforce and HuggingFace Inc. team.
# Copyright 2018 年 Salesforce 和 HuggingFace Inc. 团队的版权声明
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
# 版权所有 (c) 2018 年 NVIDIA 公司。保留所有权利。
#
# Licensed under the Apache License, Version 2.0 (the "License");
# 依据 Apache 许可证，版本 2.0 (下称“许可证”)
# you may not use this file except in compliance with the License.
# 除非符合许可证，否则不得使用此文件。
# You may obtain a copy of the License at
# 您可以在以下网址获取许可证的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# 除非适用法律要求或书面同意，否则本许可下的软件均为“按原样”提供，
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 没有任何明示或默示的担保或条件。
# See the License for the specific language governing permissions and
# 请参阅许可证以了解特定语言的权限和
# limitations under the License.
# 许可下的限制。

""" PyTorch CTRL model."""
# PyTorch CTRL 模型

from typing import Optional, Tuple, Union
# 导入类型提示，包括 Optional（可选值）、Tuple（元组）、Union（联合类型）

import numpy as np
# 导入 NumPy 库，用于处理数组和矩阵的数学计算

import torch
# 导入 PyTorch 库

from torch import nn
# 从 PyTorch 中导入 nn 模块

from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
# 从 PyTorch 的 nn 模块中导入不同类型的损失函数

from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput
# 导入模型输出相关的类，来自 modeling_outputs 模块

from ...modeling_utils import PreTrainedModel
# 导入预训练模型的工具类，来自 modeling_utils 模块

from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_linear_layer
# 导入 PyTorch 相关的工具类和函数，来自 pytorch_utils 模块

from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
# 导入添加文档字符串的函数和工具，以及日志记录和替换返回文档字符串的工具

from .configuration_ctrl import CTRLConfig
# 从当前目录中的 configuration_ctrl 模块中导入 CTRLConfig 类

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Name of the configuration class, used when generating docstrings.
_CONFIG_FOR_DOC = "CTRLConfig"

# Pretrained CTRL checkpoints listed by this module (a single entry).
CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "Salesforce/ctrl"
    # See all CTRL models at https://huggingface.co/models?filter=ctrl
]


def angle_defn(pos, i, d_model_size):
    """
    Compute the angles fed to the sinusoidal positional encoding.

    Args:
        pos: Position indices (tensor); broadcast against `i`.
        i: Feature-dimension indices (tensor); indices `2j` and `2j + 1` share the same rate.
        d_model_size: Model dimension used to normalise the exponent.

    Returns:
        `pos * 1 / 10000 ** (2 * (i // 2) / d_model_size)`, broadcast over `pos` and `i`.
    """
    exponent = (2 * (i // 2)) / d_model_size
    inverse_frequency = 1 / torch.pow(10000, exponent)
    return pos * inverse_frequency


def positional_encoding(position, d_model_size, dtype):
    """
    Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to encode.
        d_model_size: Number of feature dimensions (columns before the sin/cos split).
        dtype: Floating dtype the integer index ranges are converted to before the angles are computed.

    Returns:
        A tensor with `position` rows whose columns are the sines of the even-indexed angle columns followed
        by the cosines of the odd-indexed angle columns (the two halves are concatenated, not interleaved).
    """
    positions = torch.arange(position, dtype=torch.int64).to(dtype).unsqueeze(1)
    dimensions = torch.arange(d_model_size, dtype=torch.int64).to(dtype).unsqueeze(0)
    angle_rads = angle_defn(positions, dimensions, d_model_size)

    sine_half = torch.sin(angle_rads[:, 0::2])
    cosine_half = torch.cos(angle_rads[:, 1::2])
    return torch.cat([sine_half, cosine_half], dim=-1)


def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
    """
    Scaled dot-product attention.

    Args:
        q, k, v: Query, key and value tensors; `k` is 4-dimensional and its last two axes are swapped for the
            product with `q`.
        mask: Optional 2-D mask; positions where it is 1 have `1e4` subtracted from their logits.
        attention_mask: Optional additive mask, added to the logits as-is.
        head_mask: Optional multiplicative mask applied to the attention weights after the softmax.

    Returns:
        A tuple `(output, attention_weights)` where `output = attention_weights @ v`.
    """
    # q @ k^T, scaled by the square root of the size of the last key axis.
    logits = torch.matmul(q, k.permute(0, 1, 3, 2)) / np.sqrt(k.shape[-1])

    if mask is not None:
        query_len, key_len = logits.size(-2), logits.size(-1)
        # Use the last `query_len` rows of the mask so that queries line up with the end of the key axis.
        logits += mask[key_len - query_len : key_len, :key_len] * -1e4

    if attention_mask is not None:
        logits = logits + attention_mask

    attention_weights = torch.softmax(logits, dim=-1)

    if head_mask is not None:
        attention_weights = attention_weights * head_mask

    return torch.matmul(attention_weights, v), attention_weights
class MultiHeadAttention(nn.Module):
    """Multi-head attention with separate query/key/value projections and an output projection."""

    def __init__(self, d_model_size, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model_size = d_model_size

        # Width of a single attention head.
        self.depth = int(d_model_size / self.num_heads)

        # Input projections for queries, keys and values.
        self.Wq = nn.Linear(d_model_size, d_model_size)
        self.Wk = nn.Linear(d_model_size, d_model_size)
        self.Wv = nn.Linear(d_model_size, d_model_size)

        # Output projection applied after the heads are merged again.
        self.dense = nn.Linear(d_model_size, d_model_size)
        # Heads that have been removed so far.
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from all four projection layers; a no-op for an empty `heads`."""
        head_size = self.d_model_size // self.num_heads
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, head_size, self.pruned_heads)

        # Prune the input projections, and the output projection along dim 1.
        self.Wq = prune_linear_layer(self.Wq, index)
        self.Wk = prune_linear_layer(self.Wk, index)
        self.Wv = prune_linear_layer(self.Wv, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Keep the bookkeeping in sync with the reduced number of heads.
        self.num_heads = self.num_heads - len(heads)
        self.d_model_size = head_size * self.num_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def split_into_heads(self, x, batch_size):
        """Reshape `x` to `(batch_size, -1, num_heads, depth)` and move the head axis in front of the sequence axis."""
        per_head = x.reshape(batch_size, -1, self.num_heads, self.depth)
        return per_head.permute([0, 2, 1, 3])

    def forward(
        self,
        v,
        k,
        q,
        mask,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        """
        Project the inputs, attend, and project the merged heads.

        Returns:
            `(output, present)` plus the attention weights when `output_attentions` is truthy. `present` is the
            stacked `(key, value)` tensor when `use_cache is True`, otherwise `(None,)`.
        """
        batch_size = q.shape[0]

        # Linear projections followed by the split into heads.
        q = self.split_into_heads(self.Wq(q), batch_size)
        k = self.split_into_heads(self.Wk(k), batch_size)
        v = self.split_into_heads(self.Wv(v), batch_size)

        if layer_past is not None:
            # Prepend the cached keys/values along the sequence axis.
            cached_key, cached_value = layer_past[0], layer_past[1]
            k = torch.cat((cached_key, k), dim=-2)
            v = torch.cat((cached_value, v), dim=-2)

        if use_cache is True:
            present = torch.stack((k, v))
        else:
            present = (None,)

        context, attn = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)

        # Move the head axis back next to the depth axis and merge the two.
        merged = context.permute([0, 2, 1, 3]).reshape(batch_size, -1, self.d_model_size)
        output = self.dense(merged)

        if output_attentions:
            return (output, present, attn)
        return (output, present)


def point_wise_feed_forward_network(d_model_size, dff):
    """Return a `Linear(d_model_size, dff) -> ReLU -> Linear(dff, d_model_size)` stack as an `nn.Sequential`."""
    expand = nn.Linear(d_model_size, dff)
    contract = nn.Linear(dff, d_model_size)
    return nn.Sequential(expand, nn.ReLU(), contract)


class EncoderLayer(nn.Module):
    """
    One transformer block: layer norm -> multi-head self-attention -> dropout -> residual add, followed by
    layer norm -> feed-forward network -> dropout -> residual add.
    """

    def __init__(self, d_model_size, num_heads, dff, rate=0.1):
        super().__init__()

        # Attention and feed-forward sub-layers.
        self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model_size, dff)

        # Layer norms applied to the input of the attention / feed-forward sub-layer respectively.
        self.layernorm1 = nn.LayerNorm(d_model_size, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(d_model_size, eps=1e-6)

        # Dropout applied to the output of the attention / feed-forward sub-layer respectively.
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(
        self, x, mask, layer_past=None, attention_mask=None, head_mask=None, use_cache=False, output_attentions=False
    ):
        """
        Run the block on `x`.

        Returns:
            A tuple whose first element is the block output, followed by every element of the attention
            sub-layer's output after its first one.
        """
        # Self-attention: the same normalised tensor is used as value, key and query.
        attention_input = self.layernorm1(x)
        attn_outputs = self.multi_head_attention(
            attention_input,
            attention_input,
            attention_input,
            mask,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        # First residual connection, around the attention sub-layer.
        after_attention = x + self.dropout1(attn_outputs[0])

        # Second residual connection, around the feed-forward sub-layer.
        ffn_output = self.ffn(self.layernorm2(after_attention))
        block_output = after_attention + self.dropout2(ffn_output)

        return (block_output,) + attn_outputs[1:]
@add_start_docstrings(
    "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
    CTRL_START_DOCSTRING,
)
class CTRLModel(CTRLPreTrainedModel):
    """CTRL 模型类，继承自 CTRLPreTrainedModel。用于生成原始隐藏状态，没有特定的输出头部。"""

    def __init__(self, config):
        """
        Build the CTRL transformer from a configuration.

        Args:
            config (`CTRLConfig`): Model configuration class with all the parameters of the model.
                Initializing with a config file does not load the weights associated with the model, only the
                configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
        """
        super().__init__(config)

        # Model width and depth.
        self.d_model_size = config.n_embd
        self.num_layers = config.n_layer

        # Sinusoidal position table, computed once in float32 and kept as a plain tensor attribute.
        self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float)

        # Token embedding and the dropout applied with rate `embd_pdrop`.
        self.w = nn.Embedding(config.vocab_size, config.n_embd)
        self.dropout = nn.Dropout(config.embd_pdrop)

        # Stack of `n_layer` encoder blocks.
        encoder_layers = [
            EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop) for _ in range(config.n_layer)
        ]
        self.h = nn.ModuleList(encoder_layers)

        # Layer normalization using the configured epsilon.
        self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the input token embedding layer `w`."""
        embeddings = self.w
        return embeddings
    # 设置新的输入嵌入（embeddings）到模型中
    def set_input_embeddings(self, new_embeddings):
        """Replace the input token embedding layer `w` with `new_embeddings`."""
        self.w = new_embeddings

    # 剪枝模型中的注意力头（heads）
    # heads_to_prune: 需要在每个层剪枝的头部字典 {层号: 需要剪枝的头部列表}
    def _prune_heads(self, heads_to_prune):
        """
        Prune attention heads.

        Args:
            heads_to_prune: Mapping `{layer index: heads to prune in that layer}`.
        """
        for layer_index, heads in heads_to_prune.items():
            attention = self.h[layer_index].multi_head_attention
            attention.prune_heads(heads)

    # 重写模型的前向传播方法，添加文档字符串和输出类型的注释
    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    """
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    CTRL_START_DOCSTRING,
)
# NOTE: nothing but the class statement (or another decorator) may follow the decorator above; a bare string
# literal placed here is a SyntaxError, so the description lives in the class docstring instead.
class CTRLLMHeadModel(CTRLPreTrainedModel):
    """
    CTRL transformer with a language-modeling head: a linear layer whose weight is tied to the input embeddings.
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        """
        Create the CTRL transformer body and the linear language-modeling head.

        Args:
            config: Model configuration; `n_embd` and `vocab_size` give the head's input and output sizes.
        """
        super().__init__(config)
        self.transformer = CTRLModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs):
        """
        Build the keyword arguments for one generation step.

        When `past_key_values` is given, the prefix of `input_ids` that the cache already covers is dropped.

        Returns:
            A dict with the keys `input_ids`, `past_key_values` and `use_cache`.
        """
        if past_key_values is not None:
            # Length of the cached sequence, read from axis 2 of the first layer's cached key.
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache}

    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor`, *optional*):
            Labels for language modeling. The labels are shifted inside the model, so `labels = input_ids` can be
            passed: the logits at position `t` are scored against the label at position `t + 1`.

        Returns:

        """
        # NOTE(review): the bare `Returns:` line above is kept as the placeholder that
        # `replace_return_docstrings` substitutes - confirm against that helper before removing it.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        Re-order the `past_key_values` cache so that it matches `beam_idx` at every generation step.

        Every cached tensor is indexed along dimension 0 with `beam_idx` (moved to that tensor's device).
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )

@add_start_docstrings(
    """
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    """,
    CTRL_START_DOCSTRING,
)
# NOTE: the comma after the closing triple quote above is required; without it the two decorator arguments
# run together and the module does not parse.
class CTRLForSequenceClassification(CTRLPreTrainedModel):
    # Build the CTRL transformer body and a bias-free linear classification head on top of it.
    def __init__(self, config):
        super().__init__(config)
        # Number of target classes.
        self.num_labels = config.num_labels
        self.transformer = CTRLModel(config)
        # Classification head: `config.n_embd` inputs, `num_labels` outputs, no bias.
        self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    # 前向传播方法，添加了模型输入的文档字符串和返回值文档字符串的修饰器
    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

`.\models\ctrl\modeling_tf_ctrl.py`

# coding=utf-8
# Copyright 2018 Salesforce and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 CTRL model."""

from __future__ import annotations

from typing import Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast, TFSequenceClassifierOutput
from ...modeling_tf_utils import (
    TFCausalLanguageModelingLoss,
    TFModelInputType,
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_ctrl import CTRLConfig


# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Checkpoint and configuration class names; going by their names, used for generated docstrings.
_CHECKPOINT_FOR_DOC = "Salesforce/ctrl"
_CONFIG_FOR_DOC = "CTRLConfig"

# Pretrained TF CTRL checkpoints listed by this module (a single entry).
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "Salesforce/ctrl"
    # See all CTRL models at https://huggingface.co/models?filter=Salesforce/ctrl
]


def angle_defn(pos, i, d_model_size):
    """
    Compute the angles fed to the sinusoidal positional encoding (NumPy version).

    Args:
        pos: Position indices (array); broadcast against `i`.
        i: Feature-dimension indices (array); indices `2j` and `2j + 1` share the same rate.
        d_model_size: Model dimension used to normalise the exponent.

    Returns:
        `pos * 1 / 10000 ** (2 * (i // 2) / d_model_size)`, broadcast over `pos` and `i`.
    """
    exponent = (2 * (i // 2)) / d_model_size
    inverse_frequency = 1 / np.power(10000, exponent)
    return pos * inverse_frequency


def positional_encoding(position, d_model_size):
    """
    Build the sinusoidal positional-encoding table as a TensorFlow tensor.

    Args:
        position: Number of positions (rows) to encode.
        d_model_size: Number of feature dimensions (columns before the sin/cos split).

    Returns:
        A tensor with `position` rows whose columns are the sines of the even-indexed angle columns followed
        by the cosines of the odd-indexed angle columns (the two halves are concatenated, not interleaved).
    """
    positions = np.arange(position)[:, np.newaxis]
    dimensions = np.arange(d_model_size)[np.newaxis, :]
    angle_rads = angle_defn(positions, dimensions, d_model_size)

    sine_half = np.sin(angle_rads[:, 0::2])
    cosine_half = np.cos(angle_rads[:, 1::2])
    table = np.concatenate([sine_half, cosine_half], axis=-1)

    return tf.convert_to_tensor(table)


def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
    """
    Scaled dot-product attention (TensorFlow version).

    Args:
        q, k, v: Query, key and value tensors.
        mask: Optional mask; positions where it is 1 have `1e4` subtracted from their logits.
        attention_mask: Optional additive mask, cast to the logits' dtype and added as-is.
        head_mask: Optional multiplicative mask applied to the attention weights after the softmax.

    Returns:
        A tuple `(output, attention_weights)` where `output = attention_weights @ v`.
    """
    raw_scores = tf.matmul(q, k, transpose_b=True)

    # Scale by the square root of the size of the last key axis, in the dtype of the scores.
    key_depth = tf.cast(shape_list(k)[-1], dtype=raw_scores.dtype)
    logits = raw_scores / tf.math.sqrt(key_depth)

    if mask is not None:
        logits += tf.cast(mask * -1e4, dtype=logits.dtype)

    if attention_mask is not None:
        # Apply the attention mask
        logits = logits + tf.cast(attention_mask, dtype=logits.dtype)

    attention_weights = stable_softmax(logits, axis=-1)

    # Mask heads if we want to
    if head_mask is not None:
        attention_weights = attention_weights * head_mask

    return tf.matmul(attention_weights, v), attention_weights
class TFMultiHeadAttention(keras.layers.Layer):
    """Multi-head attention layer with separate query/key/value projections and an output projection."""

    def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model_size = d_model_size
        self.output_attentions = output_attentions

        # Width of a single attention head.
        self.depth = int(d_model_size / self.num_heads)

        # Input projections for queries, keys and values.
        self.Wq = keras.layers.Dense(d_model_size, name="Wq")
        self.Wk = keras.layers.Dense(d_model_size, name="Wk")
        self.Wv = keras.layers.Dense(d_model_size, name="Wv")

        # Output projection applied after the heads are merged again.
        self.dense = keras.layers.Dense(d_model_size, name="dense")

    def split_into_heads(self, x, batch_size):
        """Reshape `x` to `(batch_size, -1, num_heads, depth)` and move the head axis in front of the sequence axis."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
        """
        Project the inputs, attend, and project the merged heads.

        Returns:
            `(output, present)` plus the attention weights when `output_attentions` is truthy. `present` is the
            stacked `(key, value)` tensor when `use_cache` is truthy, otherwise `(None,)`.
        """
        batch_size = shape_list(q)[0]

        # Linear projections into query, key and value space.
        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wv(v)

        q = self.split_into_heads(q, batch_size)
        k = self.split_into_heads(k, batch_size)
        v = self.split_into_heads(v, batch_size)

        # Prepend the cached keys/values along the sequence axis.
        if layer_past is not None:
            past_key, past_value = tf.unstack(layer_past, axis=0)
            k = tf.concat((past_key, k), axis=-2)
            v = tf.concat((past_value, v), axis=-2)

        if use_cache:
            present = tf.stack((k, v), axis=0)
        else:
            present = (None,)

        output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
        scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3])
        attn = output[1]

        # Merge the head and depth axes back into a single model axis.
        original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size))

        output = self.dense(original_size_attention)
        outputs = (output, present)

        if output_attentions:
            outputs = outputs + (attn,)

        return outputs

    def build(self, input_shape=None):
        """Build the four projection layers, each under its own name scope, for inputs of width `d_model_size`."""
        if self.built:
            return
        self.built = True

        for layer_name in ("Wq", "Wk", "Wv", "dense"):
            layer = getattr(self, layer_name, None)
            if layer is not None:
                with tf.name_scope(layer.name):
                    layer.build([None, None, self.d_model_size])


# The class statement below was missing, which made the following `__init__`/`call`/`build` silently replace
# the methods of `TFMultiHeadAttention` and left `TFPointWiseFeedForwardLayer` undefined.
class TFPointWiseFeedForwardLayer(keras.layers.Layer):
    """Position-wise feed-forward network: `Dense(dff, relu)` followed by `Dense(d_model_size)`."""

    def __init__(self, d_model_size, dff, **kwargs):
        super().__init__(**kwargs)

        # Expansion layer with ReLU activation, named "0".
        self.dense_0 = keras.layers.Dense(dff, activation="relu", name="0")
        # Projection back to the model width, named "2".
        self.dense_2 = keras.layers.Dense(d_model_size, name="2")

        self.d_model_size = d_model_size
        self.dff = dff

    def call(self, inputs, trainable=False):
        """Apply both dense layers to `inputs` and return the result; `trainable` is accepted but not used."""
        dense_0_output = self.dense_0(inputs)
        dense_2_output = self.dense_2(dense_0_output)

        return dense_2_output

    def build(self, input_shape=None):
        """Build both dense layers, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # The first layer consumes the model width, the second the inner width `dff`.
        if getattr(self, "dense_0", None) is not None:
            with tf.name_scope(self.dense_0.name):
                self.dense_0.build([None, None, self.d_model_size])
        if getattr(self, "dense_2", None) is not None:
            with tf.name_scope(self.dense_2.name):
                self.dense_2.build([None, None, self.dff])
class TFEncoderLayer(keras.layers.Layer):
    """Encoder block made of an attention sub-layer and a feed-forward sub-layer.

    Each sub-layer receives a layer-normalized input, its output goes through dropout,
    and the result is added back to the sub-layer's un-normalized input.
    """

    def __init__(
        self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
    ):
        """Create the sub-layers of the block.

        Args:
            d_model_size: Passed to the attention and feed-forward sub-layers; also the
                last dimension used when building the layer norms.
            num_heads: Passed to `TFMultiHeadAttention`.
            dff: Passed to `TFPointWiseFeedForwardLayer`.
            rate: Dropout rate of both dropout layers.
            layer_norm_epsilon: Epsilon of both layer normalizations.
            output_attentions: Stored on the layer and passed to the attention sub-layer.
            **kwargs: Forwarded to `keras.layers.Layer`.
        """
        super().__init__(**kwargs)

        self.output_attentions = output_attentions

        # Sub-layers are created in the same order as before: attention, feed-forward,
        # the two layer norms, then the two dropouts.
        self.multi_head_attention = TFMultiHeadAttention(
            d_model_size, num_heads, output_attentions=self.output_attentions, name="multi_head_attention"
        )
        self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn")

        self.layernorm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")

        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

        # Needed by `build` for the layer-norm input shapes.
        self.d_model_size = d_model_size

    def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
        """Run the block on `x`.

        Returns:
            A tuple whose first element is the block output, followed by every element
            the attention sub-layer returned after its first one.
        """
        # Attention sub-layer: the same normalized tensor is passed as the first three arguments.
        attn_input = self.layernorm1(x)
        attn_outputs = self.multi_head_attention(
            attn_input,
            attn_input,
            attn_input,
            mask,
            layer_past,
            attention_mask,
            head_mask,
            use_cache,
            output_attentions,
            training=training,
        )
        # Dropout on the attention result, then the first residual connection.
        attn_residual = x + self.dropout1(attn_outputs[0], training=training)

        # Feed-forward sub-layer on the normalized residual, dropout, second residual connection.
        ffn_input = self.layernorm2(attn_residual)
        ffn_result = self.dropout2(self.ffn(ffn_input), training=training)
        block_output = attn_residual + ffn_result

        return (block_output,) + attn_outputs[1:]

    def build(self, input_shape=None):
        """Build the attention, feed-forward and layer-norm sub-layers once."""
        if self.built:
            return
        self.built = True

        # Attention and feed-forward sub-layers work out their own shapes.
        for sublayer_name in ("multi_head_attention", "ffn"):
            sublayer = getattr(self, sublayer_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)

        # Both layer norms are built for a last dimension of `d_model_size`.
        for norm_name in ("layernorm1", "layernorm2"):
            norm = getattr(self, norm_name, None)
            if norm is not None:
                with tf.name_scope(norm.name):
                    norm.build([None, None, self.d_model_size])


@keras_serializable
class TFCTRLMainLayer(keras.layers.Layer):
    # Main Keras layer of the CTRL model: token embedding, a stack of `TFEncoderLayer`
    # blocks and a final layer normalization.

    # Configuration class associated with this layer.
    config_class = CTRLConfig
    def __init__(self, config, **kwargs):
        # Copies the relevant settings from `config` and creates all sub-layers.
        super().__init__(**kwargs)

        self.config = config  # keep the full configuration; `build` reads `n_embd` from it
        self.output_hidden_states = config.output_hidden_states  # default for returning hidden states
        self.output_attentions = config.output_attentions  # default for returning attention weights
        self.use_cache = config.use_cache  # default for the `use_cache` option
        self.return_dict = config.use_return_dict  # default for the `return_dict` option

        self.d_model_size = config.n_embd  # embedding width
        self.num_layers = config.n_layer  # number of encoder blocks

        # Result of `positional_encoding` for `n_positions` positions and width `d_model_size`.
        self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)

        self.w = keras.layers.Embedding(
            input_dim=config.vocab_size,  # vocabulary size
            output_dim=config.n_embd,  # embedding width
            embeddings_initializer=get_initializer(config.initializer_range),  # initializer built from `initializer_range`
            name="w",  # layer name
        )

        self.dropout = keras.layers.Dropout(config.embd_pdrop)  # dropout with rate `embd_pdrop`
        self.h = [
            TFEncoderLayer(
                config.n_embd,  # d_model_size
                config.n_head,  # num_heads
                config.dff,  # dff
                config.resid_pdrop,  # rate (dropout)
                config.layer_norm_epsilon,  # layer_norm_epsilon
                self.output_attentions,  # output_attentions
                name=f"h_._{i}",  # layer name, indexed by block position
            )
            for i in range(config.n_layer)  # one block per configured layer
        ]
        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")  # final layer norm

    def get_input_embeddings(self):
        return self.w  # the token embedding layer

    def set_input_embeddings(self, new_embeddings):
        self.w = new_embeddings  # replace the token embedding layer

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        raise NotImplementedError  # head pruning is not implemented for this layer

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ):
        # Forward pass of the layer; the inputs go through the `unpack_inputs` decorator.
        # NOTE(review): the body is elided (`...`) in this listing.
        ...

    def build(self, input_shape=None):
        # Builds the embedding, the final layer norm and every encoder block once,
        # each inside a name scope named after the sub-layer.
        if self.built:
            return
        self.built = True  # mark as built before building the children
        if getattr(self, "w", None) is not None:
            with tf.name_scope(self.w.name):
                self.w.build(None)  # token embedding layer
        if getattr(self, "layernorm", None) is not None:
            with tf.name_scope(self.layernorm.name):
                self.layernorm.build([None, None, self.config.n_embd])  # final layer norm, last dimension `n_embd`
        if getattr(self, "h", None) is not None:
            for layer in self.h:
                with tf.name_scope(layer.name):
                    layer.build(None)  # each encoder block
# TFCTRLPreTrainedModel 类的定义，继承自 TFPreTrainedModel，用于处理权重初始化以及下载和加载预训练模型的简单接口。
class TFCTRLPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with the CTRL models.
    config_class = CTRLConfig
    # Prefix for the base model; the model classes in this file store their main layer as `self.transformer`.
    base_model_prefix = "transformer"


# Docstring text shared by the CTRL model classes below (passed to `add_start_docstrings`).
CTRL_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`CTRLConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring text for the forward-pass inputs (passed to `add_start_docstrings_to_model_forward`).
# NOTE(review): the string is empty in this listing.
CTRL_INPUTS_DOCSTRING = r"""
"""

# Attach the shared docstring to TFCTRLModel: the bare CTRL transformer that outputs raw hidden states without any task-specific head.
@add_start_docstrings(
    "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
    CTRL_START_DOCSTRING,
)
class TFCTRLModel(TFCTRLPreTrainedModel):
    # Thin model wrapper: all computation is delegated to the `TFCTRLMainLayer` held in `self.transformer`.
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # The main layer is named "transformer".
        self.transformer = TFCTRLMainLayer(config, name="transformer")

    # Decorators: unpack the inputs, then attach the inputs docstring and a generated
    # code sample (checkpoint, output type and config class) to the forward method.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFBaseModelOutputWithPast]:
        # Forward every argument by keyword to the main layer and hand back its result untouched.
        return self.transformer(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        # Build the main layer once, inside a name scope named after it.
        if self.built:
            return
        self.built = True

        transformer = getattr(self, "transformer", None)
        if transformer is None:
            return
        with tf.name_scope(transformer.name):
            transformer.build(None)
class TFCTRLBiasLayer(keras.layers.Layer):
    """
    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
    so all weights have to be registered in a layer.
    """

    def __init__(self, shape, initializer, trainable, name, **kwargs):
        # `name` is forwarded to the Keras base class; the other three are kept for `build`.
        super().__init__(name=name, **kwargs)
        self.shape = shape  # shape of the bias weight
        self.initializer = initializer  # initializer of the bias weight
        self.trainable = trainable  # whether the bias weight is trainable

    def build(self, input_shape):
        # Register the bias as a weight named "bias" using the stored shape, initializer and trainable flag.
        self.bias = self.add_weight(
            name="bias", shape=self.shape, initializer=self.initializer, trainable=self.trainable
        )
        super().build(input_shape)

    def call(self, x):
        return x + self.bias  # add the bias to the input



@add_start_docstrings(
    """
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    CTRL_START_DOCSTRING,
)
class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFCTRLMainLayer(config, name="transformer")  # main CTRL layer
        self.bias_layer = TFCTRLBiasLayer(
            name="lm_head", shape=[1, config.vocab_size], initializer="zeros", trainable=True
        )  # zero-initialized bias of shape [1, vocab_size], added to the logits in `call`

    def get_output_embeddings(self):
        return self.get_input_embeddings()  # the output embeddings are the input embeddings

    def set_output_embeddings(self, value):
        self.set_input_embeddings(value)  # setting the output embeddings sets the input embeddings

    def get_bias(self):
        return {"lm_head.bias": self.bias_layer.bias}  # bias weight keyed by "lm_head.bias"

    def set_bias(self, value):
        # Replaces the existing layers containing bias for correct (de)serialization.
        vocab_size = value["lm_head.bias"].shape[-1]  # size of the last dimension of the new bias
        # NOTE(review): the replacement layer is named "final_logits_bias" while `__init__` names it
        # "lm_head" - confirm the name difference is intentional.
        self.bias_layer = TFCTRLBiasLayer(
            name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=True
        )  # fresh bias layer sized for the new value
        self.bias_layer.build(None)  # create the bias weight
        self.bias_layer.bias.assign(value["lm_head.bias"])  # copy the provided values into it
    def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
        # Assembles the keyword arguments for one generation step.
        # `token_type_ids` is optional and read from kwargs.
        token_type_ids = kwargs.get("token_type_ids", None)
        # When `past_key_values` is truthy, only the last token is fed to the model.
        if past_key_values:
            # Keep the last column and restore the sequence axis.
            inputs = tf.expand_dims(inputs[:, -1], -1)
            # Same reduction for `token_type_ids` when given.
            if token_type_ids is not None:
                token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)

        # `position_ids` and `attention_mask` are optional and read from kwargs.
        position_ids = kwargs.get("position_ids", None)
        attention_mask = kwargs.get("attention_mask", None)

        # Without explicit position ids, derive them from the mask with an exclusive cumulative sum.
        if attention_mask is not None and position_ids is None:
            position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
            # With `past_key_values`, keep only the last position and restore the sequence axis.
            if past_key_values:
                position_ids = tf.expand_dims(position_ids[:, -1], -1)

        # Dictionary of prepared inputs.
        return {
            "input_ids": inputs,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
            "token_type_ids": token_type_ids,
        }

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFCausalLMOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    # Forward pass: main layer, projection onto the embedding weights, bias, optional loss.
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFCausalLMOutputWithPast]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
            config.vocab_size - 1]`.
        """
        # Run the main layer with every argument forwarded by keyword.
        transformer_outputs = self.transformer(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first output holds the hidden states.
        hidden_states = transformer_outputs[0]
        # Logits: hidden states multiplied by the transposed weights of the embedding layer `w`.
        logits = tf.matmul(hidden_states, self.transformer.w.weights, transpose_b=True)
        # Add the bias.
        logits = self.bias_layer(logits)

        loss = None
        if labels is not None:
            # Drop the last logit position and the first label so position t is scored against label t + 1.
            shifted_logits = logits[:, :-1]
            labels = labels[:, 1:]
            # Loss between the shifted labels and the shifted logits.
            loss = self.hf_compute_loss(labels, shifted_logits)

        if not return_dict:
            # Tuple output: logits followed by the remaining main-layer outputs, with the loss first when present.
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # Structured output carrying loss, logits and the main layer's cache, hidden states and attentions.
        return TFCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def build(self, input_shape=None):
        # Builds the main layer and the bias layer once, each inside its own name scope.
        if self.built:
            return
        # Mark as built before building the children.
        self.built = True
        if getattr(self, "transformer", None) is not None:
            # Main layer.
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)
        if getattr(self, "bias_layer", None) is not None:
            # Bias layer.
            with tf.name_scope(self.bias_layer.name):
                self.bias_layer.build(None)
@add_start_docstrings(
    """
    The CTRL Model transformer with a sequence classification head on top (linear layer).

    [`TFCTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1, GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    CTRL_START_DOCSTRING,
)
class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        # Bias-free linear head producing `num_labels` scores per position.
        self.classifier = keras.layers.Dense(
            config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
            use_bias=False,
        )
        self.transformer = TFCTRLMainLayer(config, name="transformer")
        self.config = config

    def get_output_embeddings(self):
        # Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too.
        logger.warning(
            "Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed "
            "in transformers v4.32."
        )
        # Returns the main layer's embedding layer `w`.
        return self.transformer.w

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFSequenceClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification loss, one per sequence in the batch. For
            classification, indices should be in `[0, ..., config.num_labels - 1]`.
        """

        # Run the main layer with every argument forwarded by keyword.
        transformer_outputs = self.transformer(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # The first output holds the hidden states.
        hidden_states = transformer_outputs[0]

        # Per-position scores from the linear head.
        logits = self.classifier(hidden_states)
        in_logits = None

        # Without a pad token the last position (-1) is used.
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # Index just before the position `argmax` reports for the pad-token mask.
                sequence_lengths = (
                    tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
                    - 1
                )
                # A negative index is replaced by the last position of the row.
                sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
                # Pick, for every row, the scores at that row's selected position.
                in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
            else:
                # Padding cannot be detected from `inputs_embeds`; fall back to the last position.
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        loss = None

        if labels is not None:
            # Batch size and sequence length come from whichever input was provided.
            if input_ids is not None:
                batch_size, sequence_length = shape_list(input_ids)[:2]
            else:
                batch_size, sequence_length = shape_list(inputs_embeds)[:2]
            # Without a pad token only a batch of one sequence is supported.
            if self.config.pad_token_id is None and batch_size != 1:
                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

            # When the position is the plain integer fallback, slice the scores directly.
            if not tf.is_tensor(sequence_lengths):
                in_logits = logits[0:batch_size, sequence_lengths]

            # Loss over one label per row against `num_labels` scores per row.
            loss = self.hf_compute_loss(tf.reshape(labels, [-1, 1]), tf.reshape(in_logits, [-1, self.num_labels]))

        # Use the selected scores when available, otherwise the per-position scores.
        pooled_logits = in_logits if in_logits is not None else logits

        # Tuple output: pooled scores followed by the remaining main-layer outputs, with the loss first when present.
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # Structured output.
        return TFSequenceClassifierOutput(
            loss=loss,
            logits=pooled_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def build(self, input_shape=None):
        # Builds the classifier head and the main layer once, each inside its own name scope.
        if self.built:
            return
        self.built = True

        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # The head consumes inputs whose last dimension is `n_embd`.
                self.classifier.build([None, None, self.config.n_embd])

        if getattr(self, "transformer", None) is not None:
            with tf.name_scope(self.transformer.name):
                self.transformer.build(None)

`.\models\ctrl\tokenization_ctrl.py`

# coding=utf-8
# 设置文件的字符编码为UTF-8，确保可以正确处理中文等特殊字符
# Copyright 2018 Salesforce and The HuggingFace Inc. team.
# 版权声明，声明代码的版权归属
#
# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 许可协议授权使用本代码
# you may not use this file except in compliance with the License.
# 除非符合许可协议，否则不得使用此文件
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
# 可以在上述链接获取许可协议的副本
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 本代码基于 "AS IS" 分发，无论明示还是暗示，不提供任何担保或条件
# See the License for the specific language governing permissions and
# limitations under the License.
# 请查看许可协议以了解具体的使用条款和限制条件
"""Tokenization classes for Salesforce CTRL."""
# 用于 Salesforce CTRL 模型的分词类

import json
# 导入json模块，用于处理JSON格式数据
import os
# 导入os模块，用于处理操作系统相关的功能
from typing import Optional, Tuple
# 导入必要的类型提示模块，用于声明函数的参数和返回值类型

import regex as re
# 导入regex模块，用于处理正则表达式

from ...tokenization_utils import PreTrainedTokenizer
# 从父目录的tokenization_utils模块中导入PreTrainedTokenizer类
from ...utils import logging
# 从父目录的utils模块中导入logging工具

# Module-level logger.
logger = logging.get_logger(__name__)

# File names of the vocabulary and merges files.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Download URLs of the vocabulary and merges files, keyed by checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"Salesforce/ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json"},
    "merges_file": {"Salesforce/ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt"},
}

# Positional-embedding size per checkpoint name; the tokenizer class exposes it as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "Salesforce/ctrl": 256,
}

# Control-code names mapped to integer values (presumably vocabulary ids - confirm against the vocab file).
CONTROL_CODES = {
    "Pregnancy": 168629,
    "Christianity": 7675,
    "Explain": 106423,
    "Fitness": 63440,
    "Saving": 63163,
    "Ask": 27171,
    "Ass": 95985,
    "Joke": 163509,
    "Questions": 45622,
    "Thoughts": 49605,
    "Retail": 52342,
    "Feminism": 164338,
    "Writing": 11992,
    "Atheism": 192263,
    "Netflix": 48616,
    "Computing": 39639,
    "Opinion": 43213,
    "Alone": 44967,
    "Funny": 58917,
    "Gaming": 40358,
    "Human": 4088,
    "India": 1331,
    "Joker": 77138,
    "Diet": 36206,
    "Legal": 11859,
    "Norman": 4939,
    "Tip": 72689,
    "Weight": 52343,
    "Movies": 46273,
    "Running": 23425,
    "Science": 2090,
    "Horror": 37793,
    "Confession": 60572,
    "Finance": 12250,
    "Politics": 16360,
    "Scary": 191985,
    "Support": 12654,
    "Technologies": 32516,
    "Teenage": 66160,
    "Event": 32769,
    "Learned": 67460,
    "Notion": 182770,
    "Wikipedia": 37583,
    "Books": 6665,
    "Extract": 76050,
    "Confessions": 102701,
    "Conspiracy": 75932,
    "Links": 63674,
    "Narcissus": 150425,
    "Relationship": 54766,
    "Relationships": 134796,
    "Reviews": 41671,
    "News": 4256,
    "Translation": 26820,
    "multilingual": 128406,
}

def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).

    A word with fewer than two symbols has no adjacent pairs, so an empty set is returned.
    """
    # Pair each symbol with its successor. `zip` stops at the shorter operand, so an
    # empty or single-symbol word yields no pairs instead of raising IndexError.
    return set(zip(word, word[1:]))


class CTRLTokenizer(PreTrainedTokenizer):
    """
    Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    """

    # Class-level constants and lookup tables shared by every instance.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    control_codes = CONTROL_CODES

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        """Load the vocabulary and the BPE merge table, then initialise the base tokenizer."""
        # token -> id mapping, read from the JSON vocabulary file.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        # id -> token mapping (the inverse of the encoder).
        self.decoder = {v: k for k, v in self.encoder.items()}
        # The first line of the merges file is skipped (save_vocabulary writes a "#version" header there),
        # and so is the last element of the split (the empty string left by a trailing newline).
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        # Merge pair -> rank; a lower rank means the merge is applied earlier.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # Memoises bpe() results per input token.
        self.cache = {}
        # These attributes are set before the base-class initialiser is called.
        super().__init__(unk_token=unk_token, **kwargs)

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens are not counted)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return a new dict combining the base vocabulary with the added-tokens encoder."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        """
        Apply the learned BPE merges to a single whitespace-free token.

        Args:
            token (`str`): The token to split.

        Returns:
            `str`: The sub-word pieces joined by `"@@ "`. A token with no symbol pairs (a single character) and the
            empty string are returned unchanged.
        """
        # Reuse a previously computed result.
        if token in self.cache:
            return self.cache[token]
        # An empty token has nothing to merge; returning early avoids IndexError on `word[-1]` below.
        if not token:
            return token
        # Split into characters and mark the end of the word on the last symbol.
        word = tuple(token)
        word = word[:-1] + (word[-1] + "</w>",)
        pairs = get_pairs(word)

        # No adjacent pairs: nothing can be merged.
        if not pairs:
            return token

        while True:
            # Pick the pair with the lowest rank; pairs without a known merge rank as infinity.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            # Stop once no remaining pair has a known merge.
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            # Rebuild the word, fusing every occurrence of (first, second) into one symbol.
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the remainder unchanged.
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            # A single remaining symbol cannot be merged any further.
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        # Join the pieces with the "@@ " continuation marker, then drop the trailing "</w>" (4 characters).
        word = "@@ ".join(word)
        word = word[:-4]
        # Cache the result before returning it.
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Split `text` into runs of non-whitespace characters and BPE-encode each one."""
        split_tokens = []

        # Each match is a run of non-whitespace characters plus an optional trailing newline.
        words = re.findall(r"\S+\n?", text)

        for token in words:
            # bpe() separates sub-words with spaces; str.split already returns a list.
            split_tokens.extend(self.bpe(token).split(" "))
        return split_tokens

    def _convert_token_to_id(self, token):
        """Convert a token (str) to an id using the vocab, falling back to the id of `unk_token`."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Convert an index (integer) to a token (str) using the vocab, falling back to `unk_token`."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Join tokens with spaces, remove the `"@@ "` continuation markers and strip surrounding whitespace."""
        out_string = " ".join(tokens).replace("@@ ", "").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary (JSON) and the BPE merges (text) into `save_directory`.

        Args:
            save_directory (`str`): An existing directory to write the two files into.
            filename_prefix (`str`, *optional*): If given, prepended to both file names followed by `"-"`.

        Returns:
            `Tuple[str]`: `(vocab_file, merge_file)` paths, or `None` (after logging an error) when
            `save_directory` is not a directory.
        """
        if not os.path.isdir(save_directory):
            # Not a directory: log the error and return without writing anything.
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # Build the output paths for the vocabulary and the merges file.
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        # Dump the vocabulary as JSON.
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        # Write the merges in rank order, one space-separated pair per line, after a version header.
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    # A gap in the ranks is only warned about; the merge is still written.
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    # The decode method below is commented out; it was possibly meant to decode token_ids into a string, removing the "@@" markers and extra spaces
    # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
    #     filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens))
    #     tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens)
    #     tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)
    #     return ''.join(tokens_generated_so_far)

Transformers-源码解析-三十-

Transformers 源码解析（三十）

.\models\convnextv2\modeling_convnextv2.py

.\models\convnextv2\modeling_tf_convnextv2.py

.\models\convnextv2\__init__.py

.\models\cpm\tokenization_cpm.py

.\models\cpm\tokenization_cpm_fast.py

.\models\cpm\__init__.py

.\models\cpmant\configuration_cpmant.py

.\models\cpmant\modeling_cpmant.py

.\models\cpmant\tokenization_cpmant.py

.\models\cpmant\__init__.py

.\models\ctrl\configuration_ctrl.py

.\models\ctrl\modeling_ctrl.py

.\models\ctrl\modeling_tf_ctrl.py

.\models\ctrl\tokenization_ctrl.py

`.\models\convnextv2\modeling_convnextv2.py`

`.\models\convnextv2\modeling_tf_convnextv2.py`

`.\models\convnextv2\__init__.py`

`.\models\cpm\tokenization_cpm.py`

`.\models\cpm\tokenization_cpm_fast.py`

`.\models\cpm\__init__.py`

`.\models\cpmant\configuration_cpmant.py`

`.\models\cpmant\modeling_cpmant.py`

`.\models\cpmant\tokenization_cpmant.py`

`.\models\cpmant\__init__.py`

`.\models\ctrl\configuration_ctrl.py`

`.\models\ctrl\modeling_ctrl.py`

`.\models\ctrl\modeling_tf_ctrl.py`

`.\models\ctrl\tokenization_ctrl.py`