Transformers Source Code Analysis (57)
.\models\groupvit\modeling_tf_groupvit.py
""" TF 2.0 GroupViT model."""
from __future__ import annotations
import collections.abc
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_tensorflow_probability_available,
logging,
replace_return_docstrings,
)
from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
logger = logging.get_logger(__name__)
if is_tensorflow_probability_available():
try:
import tensorflow_probability as tfp
_ = tfp.distributions.Normal(loc=0.0, scale=1.0)
except ImportError:
logger.error(
"GroupViT models are not usable since `tensorflow_probability` can't be loaded. "
"It seems you have `tensorflow_probability` installed with the wrong tensorflow version."
"Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability."
)
_CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc"
TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"nvidia/groupvit-gcc-yfcc",
]
LARGE_NEGATIVE = -1e8
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
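# Illustrative sketch, not part of the original file: `_expand_mask` converts a
# `[batch, seq_len]` padding mask (1 = keep, 0 = pad) into an additive bias of shape
# `[batch, 1, tgt_len, src_len]` that can be added to raw attention scores.
_demo_mask = tf.constant([[1.0, 1.0, 0.0]])  # one sequence of length 3, last token padded
_demo_bias = _expand_mask(_demo_mask)  # shape (1, 1, 3, 3)
# every query row now carries LARGE_NEGATIVE in the padded key column:
# _demo_bias[0, 0, :, 2] == LARGE_NEGATIVE, all other entries are 0.0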
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
return tf.math.reduce_mean(
keras.metrics.sparse_categorical_crossentropy(
y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
)
)
def groupvit_loss(similarity: tf.Tensor) -> tf.Tensor:
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(tf.transpose(similarity))
return (caption_loss + image_loss) / 2.0
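# Illustrative sketch, not part of the original file: the contrastive loss treats the
# i-th text as the positive match of the i-th image, so a strongly diagonal similarity
# matrix gives a loss near zero while a uniform one gives roughly log(batch_size).
_demo_sim = tf.constant([[10.0, 0.0], [0.0, 10.0]])  # near-perfect image/text alignment
_demo_loss = groupvit_loss(_demo_sim)  # close to 0
_demo_loss_uniform = groupvit_loss(tf.zeros((2, 2)))  # about 0.693 (= log 2)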
def hard_softmax(logits: tf.Tensor, dim: int) -> tf.Tensor:
y_soft = stable_softmax(logits, dim)
index = tf.argmax(y_soft, dim)
y_hard = tf.one_hot(
index,
depth=shape_list(logits)[dim],
axis=range(len(shape_list(logits)))[dim],
dtype=y_soft.dtype,
)
ret = y_hard - tf.stop_gradient(y_soft) + y_soft
return ret
def gumbel_softmax(logits: tf.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> tf.Tensor:
gumbel_dist = tfp.distributions.Gumbel(0.0, 1.0)
gumbels = gumbel_dist.sample(tf.shape(logits), dtype=logits.dtype)
gumbels = (logits + gumbels) / tau
y_soft = stable_softmax(gumbels, dim)
if hard:
index = tf.argmax(y_soft, dim)
y_hard = tf.one_hot(
index,
depth=shape_list(logits)[dim],
axis=range(len(shape_list(logits)))[dim],
dtype=y_soft.dtype,
)
ret = y_hard - tf.stop_gradient(y_soft) + y_soft
else:
ret = y_soft
return ret
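# Illustrative note, not part of the original file: both `hard_softmax` and `gumbel_softmax`
# rely on the straight-through trick `y_hard - tf.stop_gradient(y_soft) + y_soft`: the
# forward pass sees the one-hot assignment `y_hard`, while gradients flow through the soft
# distribution `y_soft`. A minimal check with `hard_softmax`, which needs no
# `tensorflow_probability` sampling:
_demo_logits = tf.constant([[2.0, 0.5, -1.0]])
_demo_onehot = hard_softmax(_demo_logits, dim=-1)  # forward value is the one-hot [[1., 0., 0.]] (up to floating point)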
def resize_attention_map(attentions: tf.Tensor, height: int, width: int, align_corners: bool = False) -> tf.Tensor:
"""
Args:
attentions (`tf.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
height (`int`): height of the output attention map
width (`int`): width of the output attention map
align_corners (`bool`, *optional*): the `align_corner` argument of `nn.functional.interpolate`.
Returns:
`tf.Tensor`: the resized attention map of shape [batch_size, groups, height, width]
"""
scale = (height * width // attentions.shape[2]) ** 0.5
if height > width:
feat_width = int(np.round(width / scale))
feat_height = shape_list(attentions)[2] // feat_width
else:
feat_height = int(np.round(height / scale))
feat_width = shape_list(attentions)[2] // feat_height
batch_size = shape_list(attentions)[0]
groups = shape_list(attentions)[1]
attentions = tf.reshape(attentions, (batch_size, groups, feat_height, feat_width))
attentions = tf.transpose(attentions, perm=(0, 2, 3, 1))
if align_corners:
attentions = tf.compat.v1.image.resize(
attentions,
size=(height, width),
method="bilinear",
align_corners=align_corners,
)
else:
attentions = tf.image.resize(attentions, size=(height, width), method="bilinear")
attentions = tf.transpose(attentions, perm=(0, 3, 1, 2))
return attentions
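# Illustrative sketch, not part of the original file: a flattened attention map over a
# 14x14 feature grid (196 positions) is reshaped to (batch, groups, 14, 14) and then
# bilinearly resized to the requested resolution, e.g. the 224x224 input image.
_demo_attn = tf.random.uniform((1, 8, 14 * 14))  # batch=1, 8 groups, 196 patch positions
_demo_resized = resize_attention_map(_demo_attn, height=224, width=224)
# _demo_resized has shape (1, 8, 224, 224)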
def get_grouping_from_attentions(attentions: Tuple[tf.Tensor], hw_shape: Tuple[int]) -> tf.Tensor:
"""
Args:
attentions (`tuple(tf.Tensor)`): tuple of attention maps returned by `TFGroupViTVisionTransformer`
hw_shape (`tuple(int)`): height and width of the output attention map
Returns:
`tf.Tensor`: the attention map of shape [batch_size, groups, height, width]
"""
attn_maps = []
prev_attn_masks = None
for attn_masks in attentions:
attn_masks = tf.transpose(attn_masks, perm=(0, 2, 1))
if prev_attn_masks is None:
prev_attn_masks = attn_masks
else:
prev_attn_masks = tf.matmul(prev_attn_masks, attn_masks)
cur_attn_map = resize_attention_map(tf.transpose(prev_attn_masks, perm=(0, 2, 1)), *hw_shape)
attn_maps.append(cur_attn_map)
final_grouping = attn_maps[-1]
return tf.stop_gradient(final_grouping)
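# Illustrative note, not part of the original file: each grouping stage returns an
# assignment of shape (batch, num_output_groups, seq_len). With the default GroupViT
# vision configuration (196 patches -> 64 groups -> 8 groups) the loop above chains them:
#   stage 1 attention (batch, 64, 196) -> transposed to (batch, 196, 64)
#   stage 2 attention (batch, 8, 64)   -> transposed to (batch, 64, 8)
#   matmul gives (batch, 196, 8), transposed back to (batch, 8, 196)
# so the final grouping maps every original patch to one of the 8 output groups before
# being reshaped and resized by `resize_attention_map`.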
@dataclass
class TFGroupViTModelOutput(ModelOutput):
"""
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
segmentation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
Classification scores for each pixel.
<Tip warning={true}>
The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
original image size as post-processing. You should always check your logits shape and resize as needed.
</Tip>
text_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of
[`TFGroupViTTextModel`].
image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of
[`TFGroupViTVisionModel`].
text_model_output (`TFBaseModelOutputWithPooling`):
The output of the [`TFGroupViTTextModel`].
vision_model_output (`TFBaseModelOutputWithPooling`):
The output of the [`TFGroupViTVisionModel`].
"""
# Initialize optional attributes with None
loss: tf.Tensor | None = None
logits_per_image: tf.Tensor = None
logits_per_text: tf.Tensor = None
segmentation_logits: tf.Tensor = None
text_embeds: tf.Tensor = None
image_embeds: tf.Tensor = None
text_model_output: TFBaseModelOutputWithPooling = None
vision_model_output: TFBaseModelOutputWithPooling = None
# Define method to convert attributes to a tuple, excluding specific complex types
def to_tuple(self) -> Tuple[Any]:
return tuple(
# If key is not in exclusion list, return the attribute value; otherwise, convert complex type to tuple
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
# TFGroupViTCrossAttentionLayer, a keras.layers.Layer
class TFGroupViTCrossAttentionLayer(keras.layers.Layer):
# The constructor takes a GroupViTVisionConfig plus any extra keyword arguments
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
# Cross-attention block, named "attn"
self.attn = TFGroupViTAttention(config, name="attn")
# LayerNormalization applied before the MLP, named "norm2"
self.norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm2")
# Feed-forward block, named "mlp"
self.mlp = TFGroupViTMLP(config, name="mlp")
# Final LayerNormalization, named "norm_post"
self.norm_post = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post")
self.config = config
# Forward pass: the query attends to the key, followed by an MLP and a final LayerNorm
def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False) -> tf.Tensor:
x = query
# Cross-attention: the query (group tokens) attends to the key (image tokens), with a residual connection
x = x + self.attn(query, encoder_hidden_states=key)[0]
# Pre-norm MLP with a residual connection
x = x + self.mlp(self.norm2(x))
# Final LayerNormalization of the output
x = self.norm_post(x)
return x
# build 方法,用于构建层,设置内部变量和子层的建立过程
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
# 标记为已构建
self.built = True
# 检查并建立 self.attn 对象
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
# 检查并建立 self.norm2 对象
if getattr(self, "norm2", None) is not None:
with tf.name_scope(self.norm2.name):
self.norm2.build([None, None, self.config.hidden_size])
# 检查并建立 self.mlp 对象
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
# 检查并建立 self.norm_post 对象
if getattr(self, "norm_post", None) is not None:
with tf.name_scope(self.norm_post.name):
self.norm_post.build([None, None, self.config.hidden_size])
# TFGroupViTAssignAttention: the attention block that assigns image tokens to groups
class TFGroupViTAssignAttention(keras.layers.Layer):
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
# Scaling factor applied to the raw attention scores
self.scale = config.hidden_size**-0.5
# Query, key, value and output projections
self.q_proj = keras.layers.Dense(config.hidden_size, name="q_proj")
self.k_proj = keras.layers.Dense(config.hidden_size, name="k_proj")
self.v_proj = keras.layers.Dense(config.hidden_size, name="v_proj")
self.proj = keras.layers.Dense(config.hidden_size, name="proj")
# Small epsilon used when normalizing the assignment weights
self.assign_eps = config.assign_eps
self.config = config
# Turn raw scores into assignment weights; Gumbel sampling is only used during training
def get_attn(self, attn: tf.Tensor, gumbel: bool = True, hard: bool = True, training: bool = False) -> tf.Tensor:
if gumbel and training:
attn = gumbel_softmax(attn, dim=-2, hard=hard)
else:
# Otherwise pick the plain hard or soft softmax depending on `hard`
if hard:
attn = hard_softmax(attn, dim=-2)
else:
attn = stable_softmax(attn, axis=-2)
return attn
# Compute the group assignment: returns the pooled group features and the soft assignment
def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False):
# The key doubles as the value
value = key
# Project query (group tokens), key and value (image tokens) into the attention space
query = self.q_proj(query)
key = self.k_proj(key)
value = self.v_proj(value)
# Raw scores: scaled dot product between group tokens and image tokens
raw_attn = tf.matmul(query, key, transpose_b=True) * self.scale
# Hard/Gumbel assignment used for the forward pass, plus a soft version returned for visualization
attn = self.get_attn(raw_attn, training=training)
soft_attn = self.get_attn(raw_attn, training=training, gumbel=False, hard=False)
# Normalize each group's weights over its assigned tokens (eps avoids division by zero)
attn = attn / (tf.math.reduce_sum(attn, axis=-1, keepdims=True) + self.assign_eps)
# Pool the value features per group and project the result
out = tf.matmul(attn, value)
out = self.proj(out)
return out, soft_attn
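# Illustrative note, not part of the original file, on the two normalizations above:
# `raw_attn` has shape (batch, num_output_groups, seq_len). `get_attn(..., dim=-2)` applies
# the (hard/Gumbel) softmax over the *group* axis, so every image token picks the group it
# is assigned to. The subsequent division by the sum over the *token* axis (-1) turns each
# group row into an average over its assigned tokens, and `attn @ value` then pools the
# token features group by group.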
# 构建注意力层,设置各投影操作的维度
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
# 标记为已构建
self.built = True
# 如果q_proj存在,则设置其输入形状
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.config.hidden_size])
# 如果k_proj存在,则设置其输入形状
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.config.hidden_size])
# 如果v_proj存在,则设置其输入形状
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.config.hidden_size])
# 如果proj存在,则设置其输入形状
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, self.config.hidden_size])
class TFGroupViTTokenAssign(keras.layers.Layer):
def __init__(self, config: GroupViTVisionConfig, num_group_token: int, num_output_group: int, **kwargs):
super().__init__(**kwargs)
self.num_output_group = num_output_group
# 对 group_tokens 进行层归一化
self.norm_tokens = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_tokens")
# 根据配置计算 MLP 的维度
assign_mlp_ratio = (
config.assign_mlp_ratio
if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
else (config.assign_mlp_ratio, config.assign_mlp_ratio)
)
tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
# 创建用于中间 MLP 的层
self.mlp_inter = TFGroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group, name="mlp_inter")
# 对 post_tokens 进行层归一化
self.norm_post_tokens = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post_tokens")
# 对输入 x 进行层归一化
self.norm_x = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_x")
# 创建用于前分配注意力的层
self.pre_assign_attn = TFGroupViTCrossAttentionLayer(config, name="pre_assign_attn")
# 创建分配注意力的层
self.assign = TFGroupViTAssignAttention(config, name="assign")
# 对新的 x 进行层归一化
self.norm_new_x = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_new_x")
# 创建用于通道的 MLP 层
self.mlp_channels = TFGroupViTMLP(
config, config.hidden_size, channels_dim, config.hidden_size, name="mlp_channels"
)
self.config = config
def project_group_token(self, group_tokens: tf.Tensor) -> tf.Tensor:
"""
Args:
group_tokens (tf.Tensor): group tokens, [batch_size, num_group_tokens, channels]
Returns:
projected_group_tokens (tf.Tensor): [batch_size, num_output_groups, channels]
"""
# 使用中间 MLP 层对 group_tokens 进行投影
projected_group_tokens = self.mlp_inter(group_tokens)
# 对投影后的 group_tokens 进行层归一化
projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
return projected_group_tokens
def call(self, image_tokens: tf.Tensor, group_tokens: tf.Tensor, training: bool = False):
"""
Args:
image_tokens (`tf.Tensor`): image tokens, of shape [batch_size, input_length, channels]
group_tokens (`tf.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
"""
# Normalize the group tokens and the image tokens
group_tokens = self.norm_tokens(group_tokens)
image_tokens = self.norm_x(image_tokens)
# Project the group tokens to the number of output groups with the mixer MLP
projected_group_tokens = self.project_group_token(group_tokens)
# Let the projected group tokens attend to the image tokens before the assignment
projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
# Assign the image tokens to the groups; `attention` is the soft assignment map
new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
# Residual connection from the projected group tokens
new_image_tokens += projected_group_tokens
# Channel MLP with pre-norm and a residual connection
new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))
return new_image_tokens, attention
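# Illustrative shape sketch, not part of the original file (assuming the second grouping
# stage of the default vision config, hidden size C):
#   image_tokens:           (batch, 64, C)   # the 64 grouped tokens produced by stage 1
#   group_tokens:           (batch, 8, C)
#   projected_group_tokens: (batch, 8, C)    # mlp_inter maps num_group_token -> num_output_group
#   new_image_tokens:       (batch, 8, C)    # one pooled token per output group
#   attention:              (batch, 8, 64)   # soft assignment of stage-1 tokens to the 8 groups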
# 定义 build 方法,用于构建模型结构
def build(self, input_shape=None):
# 如果模型已经构建完成,直接返回,避免重复构建
if self.built:
return
# 将模型标记为已构建状态
self.built = True
# 如果存在 norm_tokens 属性,则构建与其相关的操作
if getattr(self, "norm_tokens", None) is not None:
# 使用 tf.name_scope 为 norm_tokens 创建命名空间
with tf.name_scope(self.norm_tokens.name):
# 使用 norm_tokens 属性构建操作,输入形状为 [None, None, self.config.hidden_size]
self.norm_tokens.build([None, None, self.config.hidden_size])
# 如果存在 mlp_inter 属性,则构建与其相关的操作
if getattr(self, "mlp_inter", None) is not None:
# 使用 tf.name_scope 为 mlp_inter 创建命名空间
with tf.name_scope(self.mlp_inter.name):
# 使用 mlp_inter 属性构建操作,输入形状为 None
self.mlp_inter.build(None)
# 如果存在 norm_post_tokens 属性,则构建与其相关的操作
if getattr(self, "norm_post_tokens", None) is not None:
# 使用 tf.name_scope 为 norm_post_tokens 创建命名空间
with tf.name_scope(self.norm_post_tokens.name):
# 使用 norm_post_tokens 属性构建操作,输入形状为 [None, None, self.config.hidden_size]
self.norm_post_tokens.build([None, None, self.config.hidden_size])
# 如果存在 norm_x 属性,则构建与其相关的操作
if getattr(self, "norm_x", None) is not None:
# 使用 tf.name_scope 为 norm_x 创建命名空间
with tf.name_scope(self.norm_x.name):
# 使用 norm_x 属性构建操作,输入形状为 [None, None, self.config.hidden_size]
self.norm_x.build([None, None, self.config.hidden_size])
# 如果存在 pre_assign_attn 属性,则构建与其相关的操作
if getattr(self, "pre_assign_attn", None) is not None:
# 使用 tf.name_scope 为 pre_assign_attn 创建命名空间
with tf.name_scope(self.pre_assign_attn.name):
# 使用 pre_assign_attn 属性构建操作,输入形状为 None
self.pre_assign_attn.build(None)
# 如果存在 assign 属性,则构建与其相关的操作
if getattr(self, "assign", None) is not None:
# 使用 tf.name_scope 为 assign 创建命名空间
with tf.name_scope(self.assign.name):
# 使用 assign 属性构建操作,输入形状为 None
self.assign.build(None)
# 如果存在 norm_new_x 属性,则构建与其相关的操作
if getattr(self, "norm_new_x", None) is not None:
# 使用 tf.name_scope 为 norm_new_x 创建命名空间
with tf.name_scope(self.norm_new_x.name):
# 使用 norm_new_x 属性构建操作,输入形状为 [None, None, self.config.hidden_size]
self.norm_new_x.build([None, None, self.config.hidden_size])
# 如果存在 mlp_channels 属性,则构建与其相关的操作
if getattr(self, "mlp_channels", None) is not None:
# 使用 tf.name_scope 为 mlp_channels 创建命名空间
with tf.name_scope(self.mlp_channels.name):
# 使用 mlp_channels 属性构建操作,输入形状为 None
self.mlp_channels.build(None)
# Adapted from transformers.models.vit.modeling_tf_vit.TFViTPatchEmbeddings with ViT->GroupViT
class TFGroupViTPatchEmbeddings(keras.layers.Layer):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config: GroupViTConfig, **kwargs):
super().__init__(**kwargs)
# 从配置中获取图像大小和补丁大小
image_size, patch_size = config.image_size, config.patch_size
num_channels = config.num_channels
# hidden_size 作为成员变量保存,因为在调用方法中会用到
self.hidden_size = config.hidden_size
# 如果图像大小和补丁大小不是可迭代对象,则转换为元组
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# 计算图像中的补丁数量
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
# 保存图像大小、补丁大小、补丁数量、通道数和配置
self.image_size = image_size
self.patch_size = patch_size
self.num_patches = num_patches
self.num_channels = num_channels
self.config = config
# 创建投影层,用于将像素值投影到隐藏状态的大小
self.projection = keras.layers.Conv2D(
filters=self.hidden_size, # 输出通道数即隐藏状态的维度
kernel_size=patch_size, # 卷积核大小设置为补丁大小
strides=patch_size, # 步长设置为补丁大小,用于不重叠地提取补丁
padding="valid", # 使用有效填充方式
data_format="channels_last", # 数据格式为通道在最后
use_bias=True, # 使用偏置
kernel_initializer=get_initializer(self.config.initializer_range), # 使用指定初始化器初始化权重
bias_initializer="zeros", # 偏置初始化为零
name="projection", # 层的名称为 projection
)
def call(
self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False
) -> tf.Tensor:
# Read batch size, channels, height and width from the input pixel values
batch_size, num_channels, height, width = shape_list(pixel_values)
# In eager mode, check that the channel dimension matches the configuration
if tf.executing_eagerly() and num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
# Unless position encodings are interpolated, the spatial size must match the configured image size
if (
not interpolate_pos_encoding
and tf.executing_eagerly()
and (height != self.image_size[0] or width != self.image_size[1])
):
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
# When running on CPU, `keras.layers.Conv2D` does not support `NCHW`, so convert the input
# from `NCHW` to `NHWC`: shape = (batch_size, in_height, in_width, in_channels=num_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
# Project the patches to the hidden size with the convolution
projection = self.projection(pixel_values)
# Collapse the 2D spatial dimensions into a single patch dimension
# shape = (batch_size, num_patches, out_channels=embed_dim)
num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0])
# In TFGroupViTVisionEmbeddings these embeddings are layer-normalized afterwards; LayerNormalization
# needs a static last dimension (otherwise `test_keras_save_load` fails with symbolic tensors),
# which is why `hidden_size` is used in the reshape below
embeddings = tf.reshape(tensor=projection, shape=(batch_size, num_patches, self.hidden_size))
return embeddings
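# Illustrative note, not part of the original file: with the default vision config
# (image_size=224, patch_size=16) the convolution above produces a 14x14 grid, i.e.
# (224 // 16) * (224 // 16) = 196 patch embeddings per image, each of dimension hidden_size.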
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
self.built = True
# 如果 projection 属性已存在,则构建 projection 层
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
# Adapted from transformers.vit.modeling_tf_vit.TFViTEmbeddings
class TFGroupViTVisionEmbeddings(keras.layers.Layer):
"""
Construct the position and patch embeddings.
"""
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
# 初始化补丁嵌入层对象
self.patch_embeddings = TFGroupViTPatchEmbeddings(config, name="patch_embeddings")
# 添加 dropout 层,使用配置中的 dropout 率
self.dropout = keras.layers.Dropout(rate=config.dropout, name="dropout")
# 添加 LayerNormalization 层,使用配置中的 epsilon 值
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# 保存配置对象
self.config = config
def build(self, input_shape=None):
# 获取补丁数量
num_patches = self.patch_embeddings.num_patches
# 添加位置嵌入权重,形状为 (1, num_patches, hidden_size),使用零初始化
self.position_embeddings = self.add_weight(
shape=(1, num_patches, self.config.hidden_size),
initializer="zeros",
trainable=True,
name="position_embeddings",
)
if self.built:
return
self.built = True
# 如果已经构建,直接返回
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None)
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
# 构建 LayerNormalization 层,输入形状为 [None, None, hidden_size]
self.layernorm.build([None, None, self.config.hidden_size])
def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor:
"""
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
resolution images.
Source:
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py
"""
# 获取 embeddings 的形状信息
batch_size, num_patches, dim = shape_list(embeddings)
# 获取位置编码的数量
num_positions = shape_list(self.position_embeddings)[1]
# 如果补丁数量与位置编码数量相等,并且高度与宽度也相等,则直接返回位置编码
if num_patches == num_positions and height == width:
return self.position_embeddings
# 否则,进行插值处理
patch_pos_embed = self.position_embeddings
h0 = height // self.config.patch_size
w0 = width // self.config.patch_size
# 使用双三次插值方法调整位置编码的大小
patch_pos_embed = tf.image.resize(
images=tf.reshape(
patch_pos_embed, shape=(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
),
size=(h0, w0),
method="bicubic",
)
patch_pos_embed = tf.reshape(tensor=patch_pos_embed, shape=(1, -1, dim))
return patch_pos_embed
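# Illustrative note, not part of the original file: the pre-trained position table covers
# sqrt(num_positions) x sqrt(num_positions) patches (14 x 14 for the default 224/16 setup).
# For a larger input, e.g. 320x320, the table is reshaped to (1, 14, 14, dim), bicubically
# resized to (1, 320 // 16, 320 // 16, dim) = (1, 20, 20, dim) and flattened back to
# (1, 400, dim) so that it matches the new number of patches.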
def call(
self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False
) -> tf.Tensor:
# Read height and width from the pixel values
_, _, height, width = shape_list(pixel_values)
# Turn the pixel values into patch embeddings
embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
# Layer-normalize the patch embeddings
embeddings = self.layernorm(embeddings)
# Add position embeddings, interpolated to the input resolution if requested
if interpolate_pos_encoding:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
embeddings = embeddings + self.position_embeddings
# Apply dropout and return the embeddings
embeddings = self.dropout(embeddings)
return embeddings
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->GroupViT
class TFGroupViTTextEmbeddings(keras.layers.Layer):
def __init__(self, config: GroupViTTextConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size # 设置嵌入维度为配置文件中的隐藏大小
self.config = config # 保存配置信息
def build(self, input_shape: tf.TensorShape = None):
with tf.name_scope("token_embedding"):
# 添加权重矩阵,形状为 (词汇表大小, 嵌入维度),根据配置的初始化因子和范围进行初始化
self.weight = self.add_weight(
shape=(self.config.vocab_size, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True,
name="weight",
)
with tf.name_scope("position_embedding"):
# 添加位置嵌入矩阵,形状为 (最大位置嵌入数, 嵌入维度),根据配置的初始化因子和范围进行初始化
self.position_embedding = self.add_weight(
shape=(self.config.max_position_embeddings, self.embed_dim),
initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
trainable=True,
name="embeddings",
)
super().build(input_shape)
def call(
self,
input_ids: tf.Tensor = None,
position_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
) -> tf.Tensor:
"""
根据输入张量应用嵌入。
返回:
final_embeddings (`tf.Tensor`): 输出的嵌入张量。
"""
if input_ids is None and inputs_embeds is None:
raise ValueError("You have to specify either input_ids or inputs_embeds") # 抛出数值错误,要求指定 input_ids 或 inputs_embeds
if inputs_embeds is None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size) # 检查嵌入索引是否在范围内
inputs_embeds = tf.gather(params=self.weight, indices=input_ids) # 根据输入的 input_ids 从权重中获取嵌入向量
input_shape = shape_list(inputs_embeds)[:-1] # 获取输入嵌入张量的形状
if position_ids is None:
position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) # 如果位置嵌入为空,则创建一个位置张量
position_embeds = tf.gather(params=self.position_embedding, indices=position_ids) # 根据位置索引获取位置嵌入向量
position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) # 按指定倍数复制位置嵌入向量
final_embeddings = inputs_embeds + position_embeds # 最终的嵌入向量为输入嵌入向量加上位置嵌入向量
return final_embeddings
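# Illustrative sketch, not part of the original file: for `input_ids` of shape
# (batch, seq_len) the token embeddings are gathered from `self.weight` and added to the
# first `seq_len` rows of the position table, e.g.
#   input_ids (2, 5) -> inputs_embeds (2, 5, embed_dim)
#   position_ids default to [[0, 1, 2, 3, 4]]; the gathered position embeddings are tiled
#   to the batch, so final_embeddings also has shape (2, 5, embed_dim).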
class TFGroupViTStage(keras.layers.Layer):
"""这对应于 GroupViT 实现中的 `GroupingLayer` 类。"""
def __init__(
self,
config: GroupViTVisionConfig,
depth: int,
num_prev_group_token: int,
num_group_token: int,
num_output_group: int,
**kwargs,
):
super().__init__(**kwargs) # 调用父类的构造方法,传递任意关键字参数
self.config = config # 设置当前对象的config属性为传入的config参数
self.depth = depth # 设置当前对象的depth属性为传入的depth参数
self.num_group_token = num_group_token # 设置当前对象的num_group_token属性为传入的num_group_token参数
self.layers = [TFGroupViTEncoderLayer(config, name=f"layers_._{i}") for i in range(depth)] # 根据depth参数创建TFGroupViTEncoderLayer对象的列表,每个对象命名为layers_._{i}
if num_group_token > 0:
self.downsample = TFGroupViTTokenAssign(
config=config,
num_group_token=num_group_token,
num_output_group=num_output_group,
name="downsample",
) # 如果num_group_token大于0,则创建TFGroupViTTokenAssign对象赋值给self.downsample属性,使用传入的config、num_group_token、num_output_group参数
else:
self.downsample = None # 否则将self.downsample属性设为None
if num_prev_group_token > 0 and num_group_token > 0:
self.group_projector = [
keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="group_projector.0"),
TFGroupViTMixerMLP(
config, num_prev_group_token, config.hidden_size // 2, num_group_token, name="group_projector.1"
),
] # 如果num_prev_group_token和num_group_token均大于0,则创建包含LayerNormalization和TFGroupViTMixerMLP对象的列表,赋值给self.group_projector属性,使用传入的config、num_prev_group_token、config.hidden_size和num_group_token参数
else:
self.group_projector = None # 否则将self.group_projector属性设为None
def build(self, input_shape=None):
if self.num_group_token > 0:
self.group_token = self.add_weight(
shape=(1, self.num_group_token, self.config.hidden_size),
initializer="zeros",
trainable=True,
name="group_token",
) # 如果num_group_token大于0,则创建形状为(1, num_group_token, config.hidden_size)的可训练权重,赋值给self.group_token属性
else:
self.group_token = None # 否则将self.group_token属性设为None
if self.built:
return # 如果已经构建过,则直接返回
self.built = True # 标记已经构建过
if getattr(self, "downsample", None) is not None:
with tf.name_scope(self.downsample.name):
self.downsample.build(None) # 如果self.downsample不为None,则使用其名称作为作用域,在作用域内调用其build方法
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None) # 遍历self.layers列表中的每个层对象,使用其名称作为作用域,在作用域内调用其build方法
if getattr(self, "group_projector", None) is not None:
with tf.name_scope(self.group_projector[0].name):
self.group_projector[0].build([None, None, self.config.hidden_size]) # 如果self.group_projector不为None,则使用其第一个元素的名称作为作用域,在作用域内调用其build方法,传入形状为[None, None, config.hidden_size]的参数
with tf.name_scope(self.group_projector[1].name):
self.group_projector[1].build(None) # 使用self.group_projector的第二个元素的名称作为作用域,在作用域内调用其build方法,不传入任何参数
@property
def with_group_token(self):
return self.group_token is not None # 返回self.group_token是否不为None的布尔值
def split_x(self, x: tf.Tensor) -> tf.Tensor:
if self.with_group_token:
return x[:, : -self.num_group_token], x[:, -self.num_group_token :] # 如果self.with_group_token为True,则返回x张量的前部分(去掉最后self.num_group_token列)和后部分(最后self.num_group_token列)
else:
return x, None # 否则返回x张量和None
def concat_x(self, x: tf.Tensor, group_token: tf.Tensor | None = None) -> tf.Tensor:
if group_token is None:
return x # 如果group_token为None,则直接返回x张量
return tf.concat([x, group_token], axis=1) # 否则在axis=1的维度上连接x张量和group_token张量,并返回结果张量
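# Illustrative note, not part of the original file: inside a stage the learnable group
# tokens are simply appended to the patch sequence, e.g. with the default config
# 196 patch tokens + 64 group tokens form a sequence of length 260 that the Transformer
# layers process jointly; `split_x` slices the last `num_group_token` positions back out
# before the grouping step.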
def call(
self,
hidden_states: tf.Tensor,
prev_group_token: tf.Tensor | None = None,
output_attentions: bool = False,
training: bool = False,
) -> Tuple[tf.Tensor]:
"""
Args:
hidden_states (`tf.Tensor`): 输入层的张量,形状为 `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`): 注意力掩码,形状为 `(batch, 1, tgt_len, src_len)`,其中填充元素由极大负值指示。
output_attentions (`bool`, *可选*):
是否返回 Grouping block 的分组张量。
"""
# If this stage owns learnable group tokens
if self.with_group_token:
# Tile the group tokens so that every sample in the batch gets its own copy
group_token = tf.tile(self.group_token, multiples=(shape_list(hidden_states)[0], 1, 1))
# Project the group tokens of the previous stage and add them to the current ones
if self.group_projector is not None:
for layer in self.group_projector:
prev_group_token = layer(prev_group_token)
group_token = group_token + prev_group_token
else:
group_token = None
x = hidden_states
# Concatenate the image tokens and the group tokens into one sequence
cat_x = self.concat_x(x, group_token)
# Run the concatenated sequence through the Transformer layers of this stage
for layer in self.layers:
layer_out = layer(
cat_x,
attention_mask=None,
causal_attention_mask=None,
output_attentions=None,
)
cat_x = layer_out[0]
# Split the sequence back into image tokens and group tokens
x, group_token = self.split_x(cat_x)
attention = None
# Run the grouping (token assignment) step if this stage has one
if self.downsample is not None:
x, attention = self.downsample(x, group_token)
# The outputs are (x, group_token), plus the grouping attention if requested
outputs = (x, group_token)
if output_attentions:
outputs = outputs + (attention,)
return outputs
class TFGroupViTMLP(keras.layers.Layer):
# TFGroupViTMLP 类,继承自 keras.layers.Layer
def __init__(
self,
config: GroupViTVisionConfig,
hidden_size: Optional[int] = None,
intermediate_size: Optional[int] = None,
output_size: Optional[int] = None,
**kwargs,
):
# 初始化函数,接受配置 config 和可选的隐藏大小、中间大小、输出大小等参数
super().__init__(**kwargs)
self.config = config
# 获取激活函数
self.activation_fn = get_tf_activation(config.hidden_act)
# 设置隐藏大小,默认从配置中获取
hidden_size = hidden_size if hidden_size is not None else config.hidden_size
# 设置中间大小,默认从配置中获取
intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
# 设置输出大小,默认为隐藏大小
output_size = output_size if output_size is not None else hidden_size
# 创建 Dense 层 fc1,用于中间层
self.fc1 = keras.layers.Dense(intermediate_size, name="fc1")
# 创建 Dense 层 fc2,用于输出层
self.fc2 = keras.layers.Dense(output_size, name="fc2")
self.intermediate_size = intermediate_size
self.hidden_size = hidden_size
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
# 调用函数,传入隐藏状态 hidden_states 和训练标志 training
# 将 hidden_states 输入到 fc1 中
hidden_states = self.fc1(hidden_states)
# 使用激活函数处理 fc1 输出
hidden_states = self.activation_fn(hidden_states)
# 将处理后的 hidden_states 输入到 fc2 中
hidden_states = self.fc2(hidden_states)
return hidden_states
def build(self, input_shape=None):
# 构建函数,在第一次调用时构建层的内部变量
if self.built:
return
self.built = True
# 构建 fc1 层
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.hidden_size])
# 构建 fc2 层
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.intermediate_size])
class TFGroupViTMixerMLP(TFGroupViTMLP):
# Same MLP, but applied along the token dimension instead of the channel dimension
def call(self, x, training: bool = False):
# Transpose to (batch, channels, tokens), run the parent MLP, then transpose back
x = super().call(hidden_states=tf.transpose(x, perm=(0, 2, 1)))
return tf.transpose(x, perm=(0, 2, 1))
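# Illustrative note, not part of the original file: because `TFGroupViTMixerMLP` transposes
# (batch, tokens, channels) to (batch, channels, tokens) before calling the parent MLP,
# `fc1`/`fc2` mix information *across tokens* rather than across channels. This is how
# `mlp_inter` in `TFGroupViTTokenAssign` maps `num_group_token` input tokens to
# `num_output_group` output tokens while leaving the channel dimension untouched.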
# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPAttention
class TFGroupViTAttention(keras.layers.Layer):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: GroupViTConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size # 设置嵌入维度为配置中的隐藏大小
self.num_attention_heads = config.num_attention_heads # 设置注意力头的数量为配置中的注意力头数
self.attention_head_size = self.embed_dim // self.num_attention_heads # 计算每个注意力头的大小
if self.attention_head_size * self.num_attention_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: "
f"{self.num_attention_heads})."
)
factor = config.initializer_factor # 从配置中获取初始化因子
# 计算输入投影的标准差
in_proj_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
# 计算输出投影的标准差
out_proj_std = (self.embed_dim**-0.5) * factor
self.sqrt_att_head_size = math.sqrt(self.attention_head_size) # 计算注意力头大小的平方根
# 初始化查询投影层,使用自定义的初始化器
self.q_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj"
)
# 初始化键投影层,使用自定义的初始化器
self.k_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj"
)
# 初始化值投影层,使用自定义的初始化器
self.v_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj"
)
# 初始化 dropout 层,设定丢弃率为配置中的注意力丢弃率
self.dropout = keras.layers.Dropout(rate=config.attention_dropout)
# 初始化输出投影层,使用自定义的初始化器
self.out_proj = keras.layers.Dense(
units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj"
)
# 从 transformers.models.bert.modeling_tf_bert.TFBertSelfAttention.transpose_for_scores 复制而来
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# 将张量从 [batch_size, seq_length, all_head_size] 重塑为 [batch_size, seq_length, num_attention_heads, attention_head_size]
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
# 将张量从 [batch_size, seq_length, num_attention_heads, attention_head_size] 转置为 [batch_size, num_attention_heads, seq_length, attention_head_size]
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor = None,
causal_attention_mask: tf.Tensor = None,
output_attentions: bool = None,
encoder_hidden_states: tf.Tensor = None,
training: bool = False,
) -> Tuple[tf.Tensor]:
"""Input shape: Batch x Time x Channel"""
# 获取隐藏状态的批次大小
batch_size = shape_list(hidden_states)[0]
# 判断是否为跨注意力机制
is_cross_attention = encoder_hidden_states is not None
# 计算混合后的查询向量
mixed_query_layer = self.q_proj(inputs=hidden_states)
if is_cross_attention:
# 若为跨注意力,计算混合后的键和值向量
mixed_key_layer = self.k_proj(inputs=encoder_hidden_states)
mixed_value_layer = self.v_proj(inputs=encoder_hidden_states)
else:
# 否则,计算混合后的键和值向量
mixed_key_layer = self.k_proj(inputs=hidden_states)
mixed_value_layer = self.v_proj(inputs=hidden_states)
# 调整张量形状以进行注意力得分计算
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
# 计算注意力分数,即查询向量和键向量的点积
# 结果维度为(batch size, num_heads, seq_len_q, seq_len_k)
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
# 将注意力分数除以 sqrt(注意力头大小)
dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
attention_scores = tf.divide(attention_scores, dk)
# 先应用因果注意力掩码
if causal_attention_mask is not None:
# 加上因果注意力掩码(在 TFCLIPModel call() 函数中预先计算)
attention_scores = tf.add(attention_scores, causal_attention_mask)
# 若存在普通注意力掩码,则也应用
if attention_mask is not None:
# 加上普通注意力掩码(在 TFCLIPModel call() 函数中预先计算)
attention_scores = tf.add(attention_scores, attention_mask)
# 将注意力分数归一化为概率
_attention_probs = stable_softmax(logits=attention_scores, axis=-1)
# 对注意力概率进行 dropout 处理
attention_probs = self.dropout(inputs=_attention_probs)
# 计算注意力输出值
attention_output = tf.matmul(attention_probs, value_layer)
attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
# 重新整形注意力输出张量的形状
attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim))
# 通过输出投影层处理注意力输出
attention_output = self.out_proj(attention_output)
# 根据模型不同的输出设置,返回注意力输出和可能的注意力权重
outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,)
return outputs
# 构建方法用于构造模型,根据输入形状初始化模型的各个组件
def build(self, input_shape=None):
# 如果已经构建过,直接返回,避免重复构建
if self.built:
return
# 设置标志位为已构建
self.built = True
# 如果存在查询投影层,则构建查询投影层
if getattr(self, "q_proj", None) is not None:
# 在命名空间内构建查询投影层,输入形状为 [None, None, self.embed_dim]
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
# 如果存在键投影层,则构建键投影层
if getattr(self, "k_proj", None) is not None:
# 在命名空间内构建键投影层,输入形状为 [None, None, self.embed_dim]
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
# 如果存在值投影层,则构建值投影层
if getattr(self, "v_proj", None) is not None:
# 在命名空间内构建值投影层,输入形状为 [None, None, self.embed_dim]
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
# 如果存在输出投影层,则构建输出投影层
if getattr(self, "out_proj", None) is not None:
# 在命名空间内构建输出投影层,输入形状为 [None, None, self.embed_dim]
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPEncoderLayer with CLIP->GroupViT
class TFGroupViTEncoderLayer(keras.layers.Layer):
# 初始化函数,接受 GroupViTConfig 对象作为配置参数
def __init__(self, config: GroupViTConfig, **kwargs):
super().__init__(**kwargs)
# 设定嵌入维度为隐藏大小
self.embed_dim = config.hidden_size
# 创建自注意力层,使用 TFGroupViTAttention 类,并命名为 "self_attn"
self.self_attn = TFGroupViTAttention(config, name="self_attn")
# 创建第一个层规范化层,使用 LayerNormalization,设定 epsilon 为 config.layer_norm_eps,命名为 "layer_norm1"
self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
# 创建 MLP 层,使用 TFGroupViTMLP 类,命名为 "mlp"
self.mlp = TFGroupViTMLP(config, name="mlp")
# 创建第二个层规范化层,使用 LayerNormalization,设定 epsilon 为 config.layer_norm_eps,命名为 "layer_norm2"
self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
# Forward pass of the encoder layer
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
causal_attention_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
"""
Args:
hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
causal_attention_mask (`tf.Tensor`): causal attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`):
Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned tensors for more detail.
"""
residual = hidden_states
# 应用第一个层规范化层
hidden_states = self.layer_norm1(inputs=hidden_states)
# 使用 self_attn 进行自注意力计算
attention_outputs = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
training=training,
)
# 取自注意力输出的第一个张量作为新的隐藏状态
hidden_states = attention_outputs[0]
# 添加残差连接
hidden_states = residual + hidden_states
residual = hidden_states
# 应用第二个层规范化层
hidden_states = self.layer_norm2(inputs=hidden_states)
# 应用 MLP 层
hidden_states = self.mlp(hidden_states=hidden_states)
# 添加残差连接
hidden_states = residual + hidden_states
# 如果需要输出注意力张量,则将其添加到输出中
outputs = (hidden_states,) + attention_outputs[1:] # 如果输出注意力张量,则添加它们
return outputs
# 构建神经网络层次结构。如果已经构建过,则直接返回。
def build(self, input_shape=None):
if self.built:
return
# 标记该层次已经构建
self.built = True
# 如果存在 self_attn 属性,则构建 self_attn 层
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
# 如果存在 layer_norm1 属性,则构建 layer_norm1 层
if getattr(self, "layer_norm1", None) is not None:
with tf.name_scope(self.layer_norm1.name):
self.layer_norm1.build([None, None, self.embed_dim])
# 如果存在 mlp 属性,则构建 mlp 层
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
# 如果存在 layer_norm2 属性,则构建 layer_norm2 层
if getattr(self, "layer_norm2", None) is not None:
with tf.name_scope(self.layer_norm2.name):
self.layer_norm2.build([None, None, self.embed_dim])
# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPEncoder
class TFGroupViTTextEncoder(keras.layers.Layer):
def __init__(self, config: GroupViTTextConfig, **kwargs):
super().__init__(**kwargs)
# 初始化多层 TFGroupViTEncoderLayer,根据配置创建指定数量的编码器层
self.layers = [TFGroupViTEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states,  # input hidden states
attention_mask: tf.Tensor,  # attention mask
causal_attention_mask: tf.Tensor,  # causal attention mask
output_attentions: bool,  # whether to return the attention tensors
output_hidden_states: bool,  # whether to return the hidden states of every layer
return_dict: bool,  # whether to return a dict-style output
training: bool = False,  # whether the layer is run in training mode
) -> Union[Tuple, TFBaseModelOutput]:
# 如果需要输出隐藏状态,则初始化空元组存储编码器状态
encoder_states = () if output_hidden_states else None
# 如果需要输出注意力矩阵,则初始化空元组存储注意力矩阵
all_attentions = () if output_attentions else None
# 遍历每个编码器层进行前向传播
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
# 如果需要输出隐藏状态,将当前隐藏状态添加到状态元组中
encoder_states = encoder_states + (hidden_states,)
# 调用编码器层的前向传播,获取层的输出
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
causal_attention_mask,
output_attentions=output_attentions,
)
# 更新隐藏状态为编码器层的输出的第一个元素
hidden_states = layer_outputs[0]
if output_attentions:
# 如果需要输出注意力矩阵,将当前层的注意力矩阵添加到 all_attentions 元组中
all_attentions = all_attentions + (layer_outputs[1],)
# 如果需要输出隐藏状态,将最终的隐藏状态添加到状态元组中
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
# 如果不需要返回字典格式的输出,根据需要返回隐藏状态、编码器状态和注意力矩阵
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
# 否则,返回 TFBaseModelOutput 对象,包含最终的隐藏状态、编码器状态和注意力矩阵
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
self.built = True
# 如果存在 self.layers,则遍历每个层并构建它们
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
# TFGroupViTVisionEncoder 类
class TFGroupViTVisionEncoder(keras.layers.Layer):
def __init__(self, config: GroupViTVisionConfig, **kwargs) -> None:
super().__init__(**kwargs)
# 初始化多个 TFGroupViTStage,根据配置创建多个视觉编码阶段
self.stages = [
TFGroupViTStage(
config=config,
depth=config.depths[i],
num_group_token=config.num_group_tokens[i],
num_output_group=config.num_output_groups[i],
num_prev_group_token=config.num_output_groups[i - 1] if i > 0 else 0,
name=f"stages_._{i}",
)
for i in range(len(config.depths))
]
def call(
self,
hidden_states: tf.Tensor,  # input hidden states
output_hidden_states: bool,  # whether to return the hidden states of every stage
output_attentions: bool,  # whether to return the grouping attentions
return_dict: bool,  # whether to return a dict-style output
training: bool = False,  # whether the layer is run in training mode
) -> Union[tuple, TFBaseModelOutput]:
# 如果输出隐藏状态,则初始化一个空元组,否则设为 None
all_hidden_states = () if output_hidden_states else None
# 如果输出注意力权重,则初始化一个空元组,否则设为 None
all_groupings = () if output_attentions else None
# 初始化 group_tokens 为 None
group_tokens = None
# 遍历 self.stages 中的每个阶段
for stage in self.stages:
# 如果输出隐藏状态,则将当前隐藏状态加入到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 调用当前阶段的处理函数,获取当前层的输出
layer_outputs = stage(hidden_states, group_tokens, output_attentions)
# 更新隐藏状态为当前层输出的第一个元素
hidden_states = layer_outputs[0]
# 更新 group_tokens 为当前层输出的第二个元素
group_tokens = layer_outputs[1]
# 如果输出注意力权重且当前层有有效的注意力权重输出,则将其加入 all_groupings 中
if output_attentions and layer_outputs[2] is not None:
all_groupings = all_groupings + (layer_outputs[2],)
# 如果输出隐藏状态,则将最终隐藏状态加入 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不要求返回字典形式的输出,则按顺序返回非空的结果元组
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_groupings] if v is not None)
# 如果需要返回字典形式的输出,则构造 TFBaseModelOutput 对象返回
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings
)
def build(self, input_shape=None):
# 如果已经构建过,则直接返回
if self.built:
return
# 标记当前模型已经构建
self.built = True
# 如果模型已经定义了 stages 属性,则对每个层进行构建
if getattr(self, "stages", None) is not None:
for layer in self.stages:
# 使用当前层的名称为其创建命名空间,并进行构建
with tf.name_scope(layer.name):
layer.build(None)
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextTransformer with CLIPText->GroupViTText, CLIPEncoder->GroupViTTextEncoder
class TFGroupViTTextTransformer(keras.layers.Layer):
def __init__(self, config: GroupViTTextConfig, **kwargs):
super().__init__(**kwargs)
# 初始化 GroupViTTextEmbeddings 层,用于处理输入文本的嵌入表示
self.embeddings = TFGroupViTTextEmbeddings(config, name="embeddings")
# 初始化 GroupViTTextEncoder 层,用于对嵌入表示进行编码得到输出特征
self.encoder = TFGroupViTTextEncoder(config, name="encoder")
# 最终的层归一化,用于规范化最终输出的特征向量
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
# 用于计算 `pooled_output` 的相关属性
self.eos_token_id = config.eos_token_id # EOS(结束符)的 token ID
self.embed_dim = config.hidden_size # 嵌入维度大小
def call(
self,
input_ids: TFModelInputType,
attention_mask: tf.Tensor,
position_ids: tf.Tensor,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
# Arguments: input_ids, attention_mask and position_ids, plus flags controlling attentions, hidden states and dict-style output
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
# 获取输入 `input_ids` 的形状信息
input_shape = shape_list(input_ids)
# 使用输入的 `input_ids` 和 `position_ids` 作为参数,调用嵌入层对象 `self.embeddings` 进行嵌入操作
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)
# 从输入形状信息中提取批大小和序列长度
batch_size, seq_length = input_shape
# CLIP's text model uses a causal mask, prepare it here.
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype)
# check attention mask and expand its dimensions
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask = _expand_mask(attention_mask)
# 调用编码器 `self.encoder`,传入嵌入输出、注意力掩码等参数,并接收编码器的输出
encoder_outputs = self.encoder(
hidden_states=embedding_output,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 从编码器输出中提取序列输出
sequence_output = encoder_outputs[0]
# 对序列输出进行最终层归一化处理
sequence_output = self.final_layer_norm(inputs=sequence_output)
# If `eos_token_id` was incorrect before PR #24773, keep the previous behaviour
if self.eos_token_id == 2:
# Pool `sequence_output` at the position of the highest token id in each sequence
# (the EOS token, since it used to be the largest id in the vocabulary)
pooled_output = tf.gather_nd(
params=sequence_output,
indices=tf.stack(
values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
),
)
else:
# `eos_token_id` was updated in PR #24773, which allows extra new tokens to be added
# Pool `sequence_output` at the position of the first `self.eos_token_id` token in each sequence
pooled_output = tf.gather_nd(
params=sequence_output,
indices=tf.stack(
values=(
tf.range(input_shape[0], dtype=tf.int64),
tf.math.argmax(tf.cast(input_ids == self.eos_token_id, dtype=tf.int8), axis=-1),
),
axis=1,
),
)
# 如果不返回字典形式的结果,则返回元组形式的输出
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
# 返回 TFBaseModelOutputWithPooling 对象,包含序列输出、池化输出、编码器的隐藏状态和注意力权重
return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# Build the causal attention mask used by the self-attention layers
def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
# If seq_length is a runtime value it cannot be handled by tf.constant. Per the TensorFlow
# docs, tf.fill handles dynamic shapes: https://www.tensorflow.org/api_docs/python/tf/fill
diag = tf.cast(tf.fill((seq_length,), 0.0), dtype)
# set an additive 2D attention mask with all positions masked
to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype)
# set the diagonal and the lower triangular part to 0 (i.e. the positions that are not masked)
# TIP: think of the 2D matrix as the space of (query_seq, key_seq)
to_mask = tf.linalg.band_part(to_mask, 0, -1)
# to_mask = tf.linalg.band_part(to_mask, -1, 0)  # commented-out alternative
to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)
return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))
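# Illustrative sketch, not part of the original file: for seq_length = 3 the mask built
# above is broadcast to a (batch, 1, 3, 3) tensor whose strict upper triangle is -10000
# and whose diagonal and lower triangle are 0, i.e. each position may only attend to
# itself and to earlier positions:
#   [[    0., -10000., -10000.],
#    [    0.,      0., -10000.],
#    [    0.,      0.,      0.]]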
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPVisionTransformer
class TFGroupViTVisionTransformer(keras.layers.Layer):
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
# 初始化视觉嵌入层对象,使用GroupViTVisionEmbeddings处理视觉嵌入
self.embeddings = TFGroupViTVisionEmbeddings(config, name="embeddings")
# 初始化视觉编码器对象,使用GroupViTVisionEncoder处理视觉编码
self.encoder = TFGroupViTVisionEncoder(config, name="encoder")
# 初始化层归一化对象,设置epsilon值为config中定义的层归一化epsilon值
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# 设置嵌入维度为config中定义的隐藏层大小
self.embed_dim = config.hidden_size
# 定义模型调用方法,接收像素值和其他配置参数,并返回模型输出
def call(
self,
pixel_values: TFModelInputType,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[Tuple, TFBaseModelOutputWithPooling]:
# 获取嵌入层的输出,即像素值的嵌入表示
embedding_output = self.embeddings(pixel_values)
# 将嵌入输出传入编码器,获取编码器的输出
encoder_outputs = self.encoder(
hidden_states=embedding_output,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
# 获取编码器输出的最后隐藏状态
last_hidden_state = encoder_outputs[0]
# 对最后隐藏状态进行层归一化处理
last_hidden_state = self.layernorm(last_hidden_state)
# 计算池化输出,通过对最后隐藏状态在第1维度上求均值得到
pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1)
# 如果不需要返回字典形式的结果,则返回元组形式的输出
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
# 如果需要返回字典形式的结果,则构建TFBaseModelOutputWithPooling对象返回
return TFBaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# 构建方法,在首次调用时构建嵌入层、编码器和层归一化对象
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果已定义嵌入层,则使用tf.name_scope构建嵌入层
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# 如果已定义编码器,则使用tf.name_scope构建编码器
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# 如果已定义层归一化对象,则使用tf.name_scope构建层归一化对象
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.embed_dim])
@keras_serializable
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextMainLayer with CLIP->GroupViT
class TFGroupViTTextMainLayer(keras.layers.Layer):
config_class = GroupViTTextConfig
def __init__(self, config: GroupViTTextConfig, **kwargs):
super().__init__(**kwargs)
# 初始化配置对象
self.config = config
# 初始化文本模型对象,使用TFGroupViTTextTransformer处理文本信息
self.text_model = TFGroupViTTextTransformer(config, name="text_model")
# 获取输入嵌入层对象的方法,返回文本模型的嵌入层对象
def get_input_embeddings(self) -> keras.layers.Layer:
return self.text_model.embeddings
# 设置输入嵌入层对象的方法,设置文本模型的嵌入层权重和词汇大小
def set_input_embeddings(self, value: tf.Variable):
self.text_model.embeddings.weight = value
self.text_model.embeddings.vocab_size = shape_list(value)[0]
# 对输入参数进行解包处理的装饰器
@unpack_inputs
# 定义一个方法 `call`,用于执行模型的前向传播
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
# 如果 `input_ids` 为空,则抛出数值错误
if input_ids is None:
raise ValueError("You have to specify input_ids")
# 获取 `input_ids` 的形状
input_shape = shape_list(input_ids)
# 如果 `attention_mask` 为空,则创建一个形状与 `input_shape` 相同的张量,填充值为 1
if attention_mask is None:
attention_mask = tf.fill(dims=input_shape, value=1)
# 调用 `text_model` 进行文本模型的前向传播,并传递相应的参数
text_model_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 返回文本模型的输出结果
return text_model_outputs
# 定义一个方法 `build`,用于构建模型
def build(self, input_shape=None):
# 如果模型已经构建过,则直接返回
if self.built:
return
# 标记模型已构建
self.built = True
# 如果存在 `text_model` 属性,则在其命名空间内构建模型
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
@keras_serializable
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPVisionMainLayer with CLIP->GroupViT
class TFGroupViTVisionMainLayer(keras.layers.Layer):
# 指定配置类为 GroupViTVisionConfig
config_class = GroupViTVisionConfig
def __init__(self, config: GroupViTVisionConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
# 创建 TFGroupViTVisionTransformer 模型,并命名为 vision_model
self.vision_model = TFGroupViTVisionTransformer(config, name="vision_model")
# 返回 vision_model 的 embeddings 层作为输入嵌入
def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings
# 对输入进行解包,并调用 vision_model 进行前向传播
@unpack_inputs
def call(
self,
pixel_values: TFModelInputType | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
# 如果 pixel_values 为 None,则抛出数值错误
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# 调用 vision_model 进行前向传播,并返回其输出
vision_model_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return vision_model_outputs
# 构建层次结构,如果已经构建过,则直接返回
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果 vision_model 存在,则在其命名空间下构建模型
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
@keras_serializable
# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPMainLayer
class TFGroupViTMainLayer(keras.layers.Layer):
# 指定配置类为 GroupViTConfig
config_class = GroupViTConfig
# 初始化方法,接受一个配置对象 config 和其他关键字参数
def __init__(self, config: GroupViTConfig, **kwargs):
# 调用父类的初始化方法
super().__init__(**kwargs)
# 检查 config.text_config 是否为 GroupViTTextConfig 类型,否则引发 ValueError 异常
if not isinstance(config.text_config, GroupViTTextConfig):
raise ValueError(
"config.text_config is expected to be of type GroupViTTextConfig but is of type"
f" {type(config.text_config)}."
)
# 检查 config.vision_config 是否为 GroupViTVisionConfig 类型,否则引发 ValueError 异常
if not isinstance(config.vision_config, GroupViTVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type GroupViTVisionConfig but is of type"
f" {type(config.vision_config)}."
)
# 将传入的 config 对象赋值给实例变量 self.config
self.config = config
# 从 config 对象中获取 text_config 和 vision_config 对象,并分别赋值给 text_config 和 vision_config 变量
text_config = config.text_config
vision_config = config.vision_config
# 设置实例变量,分别为投影维度和投影中间维度
self.projection_dim = config.projection_dim
self.projection_intermediate_dim = config.projection_intermediate_dim
# 设置文本嵌入维度和视觉嵌入维度
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
# 创建 TFGroupViTTextTransformer 对象,用于文本模型的转换,命名为 "text_model"
self.text_model = TFGroupViTTextTransformer(text_config, name="text_model")
# 创建 TFGroupViTVisionTransformer 对象,用于视觉模型的转换,命名为 "vision_model"
self.vision_model = TFGroupViTVisionTransformer(vision_config, name="vision_model")
# 定义视觉投影层,包括 Dense 层、BatchNormalization 层和 ReLU 激活函数层
self.visual_projection = [
keras.layers.Dense(self.projection_intermediate_dim, name="visual_projection.0"),
keras.layers.BatchNormalization(name="visual_projection.1", momentum=0.9, epsilon=1e-5),
keras.layers.ReLU(name="visual_projection.2"),
keras.layers.Dense(self.projection_dim, name="visual_projection.3"),
]
# 定义文本投影层,包括 Dense 层、BatchNormalization 层和 ReLU 激活函数层
self.text_projection = [
keras.layers.Dense(self.projection_intermediate_dim, name="text_projection.0"),
keras.layers.BatchNormalization(name="text_projection.1", momentum=0.9, epsilon=1e-5),
keras.layers.ReLU(name="text_projection.2"),
keras.layers.Dense(self.projection_dim, name="text_projection.3"),
]
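Both projection heads are plain Python lists of Keras layers (Dense → BatchNorm → ReLU → Dense) rather than `keras.Sequential` objects, so the forward methods below apply them one by one. A minimal sketch of that pattern with made-up dimensions (the real values come from `projection_intermediate_dim` and `projection_dim` in the config):

```python
import tensorflow as tf
from tensorflow import keras

hidden_size, intermediate_dim, projection_dim = 768, 4096, 256  # hypothetical sizes

projection = [
    keras.layers.Dense(intermediate_dim),
    keras.layers.BatchNormalization(momentum=0.9, epsilon=1e-5),
    keras.layers.ReLU(),
    keras.layers.Dense(projection_dim),
]

pooled = tf.random.normal((2, hidden_size))  # stand-in for a pooled encoder output
for layer in projection:                     # same loop used by get_text_features / get_image_features below
    pooled = layer(pooled)
print(pooled.shape)                          # (2, 256)
```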
def build(self, input_shape=None):
# 添加一个可训练的名为logit_scale的权重,初始值为config中的logit_scale_init_value
self.logit_scale = self.add_weight(
shape=(1,),
initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True,
name="logit_scale",
)
# 如果模型已经建立,则直接返回
if self.built:
return
# 标记模型已经建立
self.built = True
# 如果存在text_model,则构建text_model
if getattr(self, "text_model", None) is not None:
with tf.name_scope(self.text_model.name):
self.text_model.build(None)
# 如果存在vision_model,则构建vision_model
if getattr(self, "vision_model", None) is not None:
with tf.name_scope(self.vision_model.name):
self.vision_model.build(None)
# 如果存在visual_projection,则分别构建其各个层
if getattr(self, "visual_projection", None) is not None:
with tf.name_scope(self.visual_projection[0].name):
self.visual_projection[0].build([None, None, None, self.vision_embed_dim])
with tf.name_scope(self.visual_projection[1].name):
self.visual_projection[1].build((None, self.projection_intermediate_dim))
with tf.name_scope(self.visual_projection[3].name):
self.visual_projection[3].build([None, None, None, self.projection_intermediate_dim])
# 如果存在text_projection,则分别构建其各个层
if getattr(self, "text_projection", None) is not None:
with tf.name_scope(self.text_projection[0].name):
self.text_projection[0].build([None, None, None, self.text_embed_dim])
with tf.name_scope(self.text_projection[1].name):
self.text_projection[1].build((None, self.projection_intermediate_dim))
with tf.name_scope(self.text_projection[3].name):
self.text_projection[3].build([None, None, None, self.projection_intermediate_dim])
@unpack_inputs
def get_text_features(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> tf.Tensor:
# 如果未提供input_ids,则抛出数值错误异常
if input_ids is None:
raise ValueError("You have to specify either input_ids")
# 获取input_ids的形状
input_shape = shape_list(input_ids)
# 如果未提供attention_mask,则使用全1填充
if attention_mask is None:
attention_mask = tf.fill(dims=input_shape, value=1)
# 使用text_model处理输入,获取文本输出
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 从文本输出中获取汇总的输出
pooled_output = text_outputs[1]
# 将汇总的输出依次经过text_projection的每一层处理
for layer in self.text_projection:
pooled_output = layer(pooled_output)
# 返回文本特征
text_features = pooled_output
return text_features
@unpack_inputs
# 定义一个方法,用于获取图像特征
def get_image_features(
self,
pixel_values: TFModelInputType | None = None, # 像素值输入,可以为 None
output_attentions: Optional[bool] = None, # 是否输出注意力权重,默认为 None
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,默认为 None
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,默认为 None
training: bool = False, # 是否处于训练模式,默认为 False
) -> tf.Tensor: # 返回类型为 TensorFlow 的张量
# 如果像素值为 None,则抛出 ValueError 异常
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# 使用视觉模型处理像素值,根据参数选择是否返回注意力权重和隐藏状态
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取视觉模型输出的汇总特征(一般是第二个输出)
pooled_output = vision_outputs[1]
# 通过每层的可调用层对汇总特征进行变换
for layer in self.visual_projection:
pooled_output = layer(pooled_output)
# 将处理后的图像特征赋给变量 image_features
image_features = pooled_output
# 返回图像特征
return image_features
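The `call` method (whose body is truncated in this walkthrough) combines the two feature types CLIP-style: both embeddings are L2-normalized and their cosine similarities are scaled by the exponentiated `logit_scale` weight created in `build`. A hedged sketch of that computation with random stand-in embeddings:

```python
import tensorflow as tf

text_embeds = tf.math.l2_normalize(tf.random.normal((4, 256)), axis=-1)   # stand-in text features
image_embeds = tf.math.l2_normalize(tf.random.normal((4, 256)), axis=-1)  # stand-in image features

logit_scale = tf.Variable([2.6592])  # e.g. log(1 / 0.07), a common contrastive init value

# One row per text, one column per image; higher logits mean more similar pairs.
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * tf.math.exp(logit_scale)
logits_per_image = tf.transpose(logits_per_text)
```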
# 使用装饰器 unpack_inputs 定义一个方法,用于调用模型
def call(
self,
input_ids: TFModelInputType | None = None, # 输入的 token IDs,可以为 None
pixel_values: TFModelInputType | None = None, # 像素值输入,可以为 None
attention_mask: np.ndarray | tf.Tensor | None = None, # 注意力掩码,可以为 None
position_ids: np.ndarray | tf.Tensor | None = None, # 位置 IDs,可以为 None
return_loss: Optional[bool] = None, # 是否返回损失,默认为 None
output_attentions: Optional[bool] = None, # 是否输出注意力权重,默认为 None
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,默认为 None
output_segmentation: Optional[bool] = None, # 是否输出分割结果,默认为 None
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,默认为 None
training: bool = False, # 是否处于训练模式,默认为 False
# GROUPVIT_TEXT_INPUTS_DOCSTRING 变量,包含了关于输入格式的文档字符串,用于说明 TF 2.0 模型接受的输入格式。
GROUPVIT_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
training (`bool`, *optional*, defaults to `False`):
"""
GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
GROUPVIT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
[`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`CLIPImageProcessor.__call__`] for details.
attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range
`[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
# TFGroupViTTextModel defines a text model built on top of TFGroupViTPreTrainedModel.
class TFGroupViTTextModel(TFGroupViTPreTrainedModel):
# 设置配置类
config_class = GroupViTTextConfig
# 主输入名称
main_input_name = "input_ids"
def __init__(self, config: GroupViTTextConfig, *inputs, **kwargs):
# 调用父类构造函数
super().__init__(config, *inputs, **kwargs)
# 初始化 TFGroupViTTextMainLayer 实例作为模型的主要组件
self.groupvit = TFGroupViTTextMainLayer(config, name="groupvit")
@unpack_inputs
# 将输入参数解包后,添加文档字符串到模型的前向传播方法
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 替换模型前向传播方法的返回文档字符串
@replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=GroupViTTextConfig)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
r"""
模型的前向传播函数,接受输入参数并返回模型的输出。
Returns:
TFBaseModelOutputWithPooling 或者包含 tf.Tensor 的元组
Examples:
示例用法,展示了如何使用模型进行推理。
```
>>> from transformers import CLIPTokenizer, TFGroupViTTextModel
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = TFGroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
```
"""
# 调用 self.groupvit 的前向传播方法并返回结果
outputs = self.groupvit(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
# 如果已经构建完成,则直接返回
if self.built:
return
# 设置构建完成标志
self.built = True
# 如果 self.groupvit 存在,则在 TensorFlow 的命名空间内构建组件
if getattr(self, "groupvit", None) is not None:
with tf.name_scope(self.groupvit.name):
self.groupvit.build(None)
class TFGroupViTVisionModel(TFGroupViTPreTrainedModel):
# 设置配置类
config_class = GroupViTVisionConfig
# 主输入名称
main_input_name = "pixel_values"
def __init__(self, config: GroupViTVisionConfig, *inputs, **kwargs):
# 调用父类构造函数
super().__init__(config, *inputs, **kwargs)
# 初始化 TFGroupViTVisionMainLayer 实例作为模型的主要组件
self.groupvit = TFGroupViTVisionMainLayer(config, name="groupvit")
@unpack_inputs
# 添加文档字符串到模型的前向传播方法
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
# 替换模型前向传播方法的返回文档字符串
@replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
def call(
self,
pixel_values: TFModelInputType | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
r"""
返回模型的输出结果。
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFGroupViTVisionModel
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = TFGroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output
```
"""
outputs = self.groupvit(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "groupvit", None) is not None:
with tf.name_scope(self.groupvit.name):
self.groupvit.build(None)
# 使用装饰器添加文档字符串,指定类的起始文档字符串
@add_start_docstrings(GROUPVIT_START_DOCSTRING)
# 定义 TFGroupViTModel 类,继承自 TFGroupViTPreTrainedModel 类
class TFGroupViTModel(TFGroupViTPreTrainedModel):
# 指定配置类为 GroupViTConfig
config_class = GroupViTConfig
# 初始化方法,接受 GroupViTConfig 类型的配置对象和其他参数
def __init__(self, config: GroupViTConfig, *inputs, **kwargs):
# 调用父类的初始化方法
super().__init__(config, *inputs, **kwargs)
# 创建 TFGroupViTMainLayer 实例,命名为 groupvit
self.groupvit = TFGroupViTMainLayer(config, name="groupvit")
# 使用装饰器添加文档字符串到模型前向传播方法
@unpack_inputs
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def get_text_features(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> tf.Tensor:
r"""
Returns:
text_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by applying
the projection layer to the pooled output of [`TFGroupViTTextModel`].
Examples:
```
>>> from transformers import CLIPTokenizer, TFGroupViTModel
>>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
>>> text_features = model.get_text_features(**inputs)
```"""
# 调用 TFGroupViTMainLayer 的 get_text_features 方法,返回文本特征张量
text_features = self.groupvit.get_text_features(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 返回文本特征张量
return text_features
# 使用装饰器添加文档字符串到模型前向传播方法
@unpack_inputs
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: TFModelInputType | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> tf.Tensor:
r"""
"""
) -> tf.Tensor:
r"""
Returns:
image_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by applying
the projection layer to the pooled output of [`TFGroupViTVisionModel`].
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFGroupViTModel
>>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="tf")
>>> image_features = model.get_image_features(**inputs)
```"""
# 调用 TFGroupViTVisionModel 的方法获取图像特征
image_features = self.groupvit.get_image_features(
pixel_values=pixel_values, # 图像像素值
output_attentions=output_attentions, # 是否输出注意力权重
output_hidden_states=output_hidden_states, # 是否输出隐藏状态
return_dict=return_dict, # 是否以字典形式返回结果
training=training, # 是否处于训练模式
)
return image_features
@unpack_inputs
@add_start_docstrings_to_model_forward(GROUPVIT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFGroupViTModelOutput, config_class=GroupViTConfig)
def call(
self,
input_ids: TFModelInputType | None = None,
pixel_values: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_segmentation: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFGroupViTModelOutput, Tuple[tf.Tensor]]:
r"""
Returns:
Examples:
```
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, TFGroupViTModel
>>> import tensorflow as tf
>>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(
... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
... )
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = tf.math.softmax(logits_per_image, axis=1)
```"""
# 调用模型的 forward 方法,传递输入参数进行推理
outputs = self.groupvit(
input_ids=input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
position_ids=position_ids,
return_loss=return_loss,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
output_segmentation=output_segmentation,
return_dict=return_dict,
training=training,
)
return outputs
def serving_output(self, output: TFGroupViTModelOutput) -> TFGroupViTModelOutput:
# TODO: As is this currently fails with saved_model=True, because
# TensorFlow cannot trace through nested dataclasses. Reference:
# https://github.com/huggingface/transformers/pull/16886
# 返回模型输出作为服务端输出
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果模型已经构建,则直接返回
if getattr(self, "groupvit", None) is not None:
# 使用 TensorFlow 的命名空间来构建模型组件
with tf.name_scope(self.groupvit.name):
self.groupvit.build(None)
.\models\groupvit\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {
"configuration_groupvit": [
"GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"GroupViTConfig",
"GroupViTOnnxConfig",
"GroupViTTextConfig",
"GroupViTVisionConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_groupvit"] = [
"GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"GroupViTModel",
"GroupViTPreTrainedModel",
"GroupViTTextModel",
"GroupViTVisionModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_groupvit"] = [
"TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFGroupViTModel",
"TFGroupViTPreTrainedModel",
"TFGroupViTTextModel",
"TFGroupViTVisionModel",
]
if TYPE_CHECKING:
from .configuration_groupvit import (
GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
GroupViTConfig,
GroupViTOnnxConfig,
GroupViTTextConfig,
GroupViTVisionConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_groupvit import (
GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
GroupViTModel,
GroupViTPreTrainedModel,
GroupViTTextModel,
GroupViTVisionModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_groupvit import (
TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFGroupViTModel,
TFGroupViTPreTrainedModel,
TFGroupViTTextModel,
TFGroupViTVisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
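Because of this `_LazyModule` indirection, nothing heavier than the configuration module is imported until an attribute is actually requested, so the config classes stay usable even when only one backend is installed. A short usage note (assumes `transformers` is installed; the model import additionally needs TensorFlow):

```python
from transformers import GroupViTConfig      # configurations never need a DL backend

config = GroupViTConfig()

from transformers import TFGroupViTModel     # the lazy import of modeling_tf_groupvit happens here
```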
.\models\herbert\tokenization_herbert.py
import json
import os
import re
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/vocab.json"
},
"merges_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/merges.txt"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
PRETRAINED_INIT_CONFIGURATION = {}
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
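For a word already split into symbols (with the usual `</w>` end-of-word marker), this simply returns every adjacent pair; a quick illustration:

```python
print(get_pairs(("l", "o", "w", "er</w>")))
# {('l', 'o'), ('o', 'w'), ('w', 'er</w>')}   (set order is arbitrary)
```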
def replace_unicode_punct(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
"""
text = text.replace(",", ",")
text = re.sub(r"。\s*", ". ", text)
text = text.replace("、", ",")
text = text.replace("”", '"')
text = text.replace("“", '"')
text = text.replace("∶", ":")
text = text.replace(":", ":")
text = text.replace("?", "?")
text = text.replace("《", '"')
text = text.replace("》", '"')
text = text.replace(")", ")")
text = text.replace("!", "!")
text = text.replace("(", "(")
text = text.replace(";", ";")
text = text.replace("1", "1")
text = text.replace("」", '"')
text = text.replace("「", '"')
text = text.replace("0", "0")
text = text.replace("3", "3")
text = text.replace("2", "2")
text = text.replace("5", "5")
text = text.replace("6", "6")
text = text.replace("9", "9")
text = text.replace("7", "7")
text = text.replace("8", "8")
text = text.replace("4", "4")
text = re.sub(r".\s*", ". ", text)
text = text.replace("~", "~")
text = text.replace("’", "'")
text = text.replace("…", "...")
text = text.replace("━", "-")
text = text.replace("〈", "<")
text = text.replace("〉", ">")
text = text.replace("【", "[")
text = text.replace("】", "]")
text = text.replace("%", "%")
return text
def remove_non_printing_char(text):
"""
这个函数用于移除文本中的非打印字符。
"""
output = []
for char in text:
cat = unicodedata.category(char)
if cat.startswith("C"):
continue
output.append(char)
return "".join(output)
def whitespace_tokenize(text):
"""对文本进行基本的空白符号清理和分割。"""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BasicTokenizer(object):
"""
构造一个BasicTokenizer对象,用于运行基本的分词(标点符号分割、小写化等)。
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
是否在分词时将输入转换为小写。
never_split (`Iterable`, *optional*):
在分词时不会被拆分的token集合。仅在`do_basic_tokenize=True`时生效。
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
是否对中文字符进行分词。
对于日语,应该将其停用(见此问题)。
strip_accents (`bool`, *optional*):
是否去除所有的重音符号。如果未指定此选项,则将根据`lowercase`的值确定(与原始BERT相同)。
do_split_on_punc (`bool`, *optional*, defaults to `True`):
在某些情况下,我们希望跳过基本的标点符号分割,以便稍后的分词可以捕捉到单词的完整上下文,如缩写。
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
"""
Construct a BPE tokenizer for HerBERT.
Peculiarities:
- uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
punctuation character will be treated separately.
- Such pretokenized input is BPE subtokenized
This tokenizer inherits from [`XLMTokenizer`] which contains most of the methods. Users should refer to the
superclass for more information regarding methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
tokenizer_file=None,
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
sep_token="</s>",
bos_token="<s>",
do_lowercase_and_remove_accent=False,
additional_special_tokens=[
"<special0>",
"<special1>",
"<special2>",
"<special3>",
"<special4>",
"<special5>",
"<special6>",
"<special7>",
"<special8>",
"<special9>",
],
lang2id=None,
id2lang=None,
**kwargs,
):
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use HerbertTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses
self.cache_moses_punct_normalizer = {}
self.cache_moses_tokenizer = {}
self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
self.lang2id = lang2id
self.id2lang = id2lang
if lang2id is not None and id2lang is not None:
assert len(lang2id) == len(id2lang)
self.ja_word_tokenizer = None
self.zh_word_tokenizer = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
lang2id=lang2id,
id2lang=id2lang,
do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
tokenizer_file=None,
**kwargs,
)
self.bert_pre_tokenizer = BasicTokenizer(
do_lower_case=False,
never_split=self.all_special_tokens,
tokenize_chinese_chars=False,
strip_accents=False,
)
@property
def do_lower_case(self):
return self.do_lowercase_and_remove_accent
def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
self.cache_moses_punct_normalizer[lang] = punct_normalizer
else:
punct_normalizer = self.cache_moses_punct_normalizer[lang]
return punct_normalizer.normalize(text)
def moses_tokenize(self, text, lang):
if lang not in self.cache_moses_tokenizer:
moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
self.cache_moses_tokenizer[lang] = moses_tokenizer
else:
moses_tokenizer = self.cache_moses_tokenizer[lang]
return moses_tokenizer.tokenize(text, return_str=False, escape=False)
def moses_pipeline(self, text, lang):
text = replace_unicode_punct(text)
text = self.moses_punct_norm(text, lang)
text = remove_non_printing_char(text)
return text
def ja_tokenize(self, text):
if self.ja_word_tokenizer is None:
try:
import Mykytea
self.ja_word_tokenizer = Mykytea.Mykytea(
f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
)
except (AttributeError, ImportError):
logger.error(
"Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
" (https://github.com/chezou/Mykytea-python) with the following steps"
)
logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
logger.error("2. autoreconf -i")
logger.error("3. ./configure --prefix=$HOME/local")
logger.error("4. make && make install")
logger.error("5. pip install kytea")
raise
return list(self.ja_word_tokenizer.getWS(text))
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + "</w>",)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
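How `bpe` consumes the merge ranks can be seen on a toy instance. The sketch below is hedged: the merge table is invented (real ranks come from `merges.txt`), and the duck-typed stand-in only carries the two attributes `bpe` actually reads, so the unbound method can be called directly (assuming the surrounding class is `HerbertTokenizer`, as it is imported elsewhere in this package):

```python
class _Toy:
    # Invented merge ranks; a lower rank is merged first.
    bpe_ranks = {("e", "r</w>"): 0, ("l", "o"): 1, ("lo", "w"): 2}
    cache = {}

print(HerbertTokenizer.bpe(_Toy(), "lower"))
# 'low er</w>'  -- merges applied in rank order: (e, r</w>), then (l, o), then (lo, w)
```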
def _tokenize(self, text):
pre_tokens = self.bert_pre_tokenizer.tokenize(text)
split_tokens = []
for token in pre_tokens:
if token:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. Like BERT, a HerBERT sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
cls = [self.cls_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
else:
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
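Concretely, with made-up ID lists (and assuming `tok` is a `HerbertTokenizer` instance loaded via `from_pretrained`), the segment IDs cover `<s> A </s>` with zeros and `B </s>` with ones:

```python
ids_a = [11, 12, 13]   # hypothetical IDs of the first sentence
ids_b = [21, 22]       # hypothetical IDs of the second sentence

print(tok.create_token_type_ids_from_sequences(ids_a, ids_b))
# [0, 0, 0, 0, 0, 1, 1, 1]  -> len(<s> + ids_a + </s>) zeros, len(ids_b + </s>) ones
```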
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def __getstate__(self):
state = self.__dict__.copy()
state["sm"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses
.\models\herbert\tokenization_herbert_fast.py
from typing import List, Optional, Tuple
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_herbert import HerbertTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/vocab.json"
},
"merges_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/merges.txt"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
PRETRAINED_INIT_CONFIGURATION = {}
class HerbertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library).
Peculiarities:
- uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
a punctuation character will be treated separately.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the methods. Users should refer to the
superclass for more information regarding methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = HerbertTokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
sep_token="</s>",
**kwargs,
):
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
sep_token=sep_token,
**kwargs,
)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
从一个序列或者一对序列构建模型输入,用于序列分类任务,通过连接和添加特殊标记。像BERT和HerBERT序列有如下格式:
- 单个序列: `<s> X </s>`
- 一对序列: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
要添加特殊标记的ID列表。
token_ids_1 (`List[int]`, *可选*):
第二个序列的ID列表,用于序列对。
Returns:
`List[int]`: 包含适当特殊标记的输入ID列表。
"""
cls = [self.cls_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
从没有添加特殊标记的标记列表中提取序列ID。当使用分词器的 `prepare_for_model` 方法添加特殊标记时调用此方法。
Args:
token_ids_0 (`List[int]`):
ID列表。
token_ids_1 (`List[int]`, *可选*):
第二个序列的ID列表,用于序列对。
already_has_special_tokens (`bool`, *可选*, 默认为 `False`):
标记列表是否已经按模型的要求格式化为特殊标记。
Returns:
`List[int]`: 一个整数列表,范围为 [0, 1]:1 表示特殊标记,0 表示序列标记。
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
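For reference, with hypothetical ID lists (and `tok` a loaded `HerbertTokenizerFast`), the mask flags only the `<s>`/`</s>` positions that `build_inputs_with_special_tokens` would add:

```python
print(tok.get_special_tokens_mask([11, 12, 13]))
# [1, 0, 0, 0, 1]
print(tok.get_special_tokens_mask([11, 12], [21, 22, 23]))
# [1, 0, 0, 1, 0, 0, 0, 1]
```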
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. HerBERT, like
BERT sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\herbert\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available
_import_structure = {"tokenization_herbert": ["HerbertTokenizer"]}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_herbert_fast"] = ["HerbertTokenizerFast"]
if TYPE_CHECKING:
from .tokenization_herbert import HerbertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_herbert_fast import HerbertTokenizerFast
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\hubert\configuration_hubert.py
""" Hubert model configuration"""
import functools
import operator
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/hubert-base-ls960": "https://huggingface.co/facebook/hubert-base-ls960/resolve/main/config.json",
}
class HubertConfig(PretrainedConfig):
r"""
这是用于存储 [`HubertModel`] 配置的类。用于根据指定的参数实例化 Hubert 模型,
定义模型架构。使用默认值实例化配置将产生类似于 Hubert
[facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) 架构的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。更多信息请阅读
[`PretrainedConfig`] 的文档。
Example:
```
>>> from transformers import HubertModel, HubertConfig
>>> # 初始化一个 Hubert facebook/hubert-base-ls960 风格的配置
>>> configuration = HubertConfig()
>>> # 使用配置初始化一个模型,其模型风格为 facebook/hubert-base-ls960
>>> model = HubertModel(configuration)
>>> # 访问模型配置
>>> configuration = model.config
```
"""
model_type = "hubert"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_layer_norm=True,
feat_proj_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
feat_extract_norm="group",
feat_extract_activation="gelu",
conv_dim=(512, 512, 512, 512, 512, 512, 512),
conv_stride=(5, 2, 2, 2, 2, 2, 2),
conv_kernel=(10, 3, 3, 3, 3, 2, 2),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
do_stable_layer_norm=False,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="sum",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.hidden_size = hidden_size
self.feat_extract_norm = feat_extract_norm
self.feat_extract_activation = feat_extract_activation
self.conv_dim = list(conv_dim)
self.conv_stride = list(conv_stride)
self.conv_kernel = list(conv_kernel)
self.conv_bias = conv_bias
self.num_conv_pos_embeddings = num_conv_pos_embeddings
self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
self.num_feat_extract_layers = len(self.conv_dim)
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.num_attention_heads = num_attention_heads
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.feat_proj_layer_norm = feat_proj_layer_norm
self.feat_proj_dropout = feat_proj_dropout
self.final_dropout = final_dropout
self.layerdrop = layerdrop
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.vocab_size = vocab_size
self.do_stable_layer_norm = do_stable_layer_norm
self.use_weighted_layer_sum = use_weighted_layer_sum
self.classifier_proj_size = classifier_proj_size
if (
(len(self.conv_stride) != self.num_feat_extract_layers)
or (len(self.conv_kernel) != self.num_feat_extract_layers)
or (len(self.conv_dim) != self.num_feat_extract_layers)
):
raise ValueError(
"Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` =="
" `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) ="
f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`,"
f" `len(config.conv_kernel) = {len(self.conv_kernel)}`."
)
self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
@property
def inputs_to_logits_ratio(self):
return functools.reduce(operator.mul, self.conv_stride, 1)
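With the default strides `(5, 2, 2, 2, 2, 2, 2)` this product is 5 · 2⁶ = 320, i.e. the feature encoder emits one frame per 320 input samples, which is 20 ms (50 frames per second) of 16 kHz audio. A quick check:

```python
from transformers import HubertConfig

config = HubertConfig()                         # default, hubert-base-ls960-style configuration
print(config.inputs_to_logits_ratio)            # 320
print(16000 / config.inputs_to_logits_ratio)    # 50.0 output frames per second of audio
```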
.\models\hubert\convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py
"""Convert Hubert checkpoint."""
import argparse
import torch
from s3prl.hub import distilhubert
from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"mask_emb": "masked_spec_embed",
}
def set_recursively(hf_pointer, key, value, full_name, weight_type):
"""
递归设置模型参数的函数。
Args:
hf_pointer (object): 要设置的模型参数的指针对象。
key (str): 参数的名称路径。
value (torch.Tensor): 要设置的参数值。
full_name (str): 参数的完整名称。
weight_type (str): 参数类型,如'weight', 'bias'等。
Raises:
AssertionError: 如果要设置的参数形状与预期不符合,抛出异常。
"""
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model):
"""
递归加载权重函数。
Args:
fairseq_model: Fairseq模型对象。
hf_model: HuggingFace模型对象。
"""
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = mapped_key
if key in name:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "weight" in name:
weight_type = "weight"
elif "bias" in name:
weight_type = "bias"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def convert_config(model):
config = HubertConfig()
fs_config = model.config
config.activation_dropout = fs_config.activation_dropout
config.apply_spec_augment = False
config.attention_dropout = fs_config.attention_dropout
config.conv_bias = False
conv_layers = eval(fs_config.extractor_conv_feature_layers)
config.conv_dim = [x[0] for x in conv_layers]
config.conv_kernel = [x[1] for x in conv_layers]
config.conv_stride = [x[2] for x in conv_layers]
config.feat_extract_activation = "gelu"
config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
config.feat_proj_layer_norm = False
config.feat_proj_dropout = 0.0
config.final_dropout = 0.0
config.hidden_act = fs_config.activation_fn
config.hidden_dropout = fs_config.dropout
config.hidden_size = fs_config.encoder_embed_dim
config.initializer_range = 0.02
config.intermediate_size = fs_config.encoder_ffn_embed_dim
config.layer_norm_eps = 1e-5
config.layerdrop = 0.0
config.num_attention_heads = fs_config.encoder_attention_heads
config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
config.num_conv_pos_embeddings = fs_config.conv_pos
config.num_feat_extract_layers = len(conv_layers)
config.num_hidden_layers = fs_config.encoder_layers
return config
@torch.no_grad()
def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None):
model = distilhubert().model.model
if config_path is not None:
config = HubertConfig.from_pretrained(config_path)
else:
config = convert_config(model)
model = model.eval()
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=False,
return_attention_mask=False,
)
hf_model = HubertModel(config)
recursively_load_weights(model, hf_model)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
args = parser.parse_args()
convert_hubert_checkpoint(args.pytorch_dump_folder_path, args.config_path)
.\models\hubert\convert_hubert_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
import os
import fairseq
import torch
from fairseq.data import Dictionary
from transformers import (
HubertConfig,
HubertForCTC,
HubertModel,
Wav2Vec2CTCTokenizer,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
def set_recursively(hf_pointer, key, value, full_name, weight_type):
"""
递归设置指针指向的属性值,并记录日志。
Args:
hf_pointer (object): Transformers 模型中的属性指针
key (str): 属性名称,用点分隔表示层次结构
value (torch.Tensor): 设置的值
full_name (str): 完整名称,用于日志记录
weight_type (str): 权重类型,如 'weight', 'bias' 等
Raises:
AssertionError: 如果设置的值的形状与预期不符合
"""
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
"""
递归加载 Fairseq 模型的权重到 Transformers 模型中。
Args:
fairseq_model (FairseqModel): Fairseq 模型对象
hf_model (PreTrainedModel): Transformers 模型对象
is_finetuned (bool): 是否为微调模型
Returns:
None
"""
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.hubert.feature_extractor if is_finetuned else hf_model.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = "hubert." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key
if key in name or (key.split("w2v_model.")[-1] == name.split(".")[0] and not is_finetuned):
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "weight" in name:
weight_type = "weight"
elif "bias" in name:
weight_type = "bias"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
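The sketch below shows how `load_conv_layer` splits a feature-extractor parameter name into `layer_id` and `type_id` (the name is a typical fairseq convention, shown here only for illustration):

```python
# type_id 0 -> conv weights/biases, type_id 2 -> (group/layer) norm weights/biases.
full_name = "feature_extractor.conv_layers.0.2.weight"   # illustrative name
name = full_name.split("conv_layers.")[-1]               # "0.2.weight"
items = name.split(".")
layer_id, type_id = int(items[0]), int(items[1])
print(layer_id, type_id, "weight" in name)               # 0 2 True
```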
@torch.no_grad()
def convert_hubert_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if config_path is not None:
config = HubertConfig.from_pretrained(config_path)
else:
config = HubertConfig()
if is_finetuned:
if dict_path:
target_dict = Dictionary.load(dict_path)
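# Note: bos and pad token ids are deliberately swapped here because the CTC blank symbol is fairseq's <pad>, not <s>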
config.bos_token_id = target_dict.pad_index
config.pad_token_id = target_dict.bos_index
config.eos_token_id = target_dict.eos_index
config.vocab_size = len(target_dict.symbols)
vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
if not os.path.isdir(pytorch_dump_folder_path):
logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
return
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
json.dump(target_dict.indices, vocab_handle)
tokenizer = Wav2Vec2CTCTokenizer(
vocab_path,
unk_token=target_dict.unk_word,
pad_token=target_dict.pad_word,
bos_token=target_dict.bos_word,
eos_token=target_dict.eos_word,
word_delimiter_token="|",
do_lower_case=False,
)
return_attention_mask = True if config.feat_extract_norm == "layer" else False
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=True,
return_attention_mask=return_attention_mask,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(pytorch_dump_folder_path)
hf_wav2vec = HubertForCTC(config)
else:
hf_wav2vec = HubertModel(config)
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
model = model[0].eval()
recursively_load_weights(model, hf_wav2vec, is_finetuned)
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_hubert_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
)
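For reference, the converter can also be invoked directly from Python instead of via the CLI; the paths below are placeholders, not real files:

```python
# Placeholder paths; adjust to your local fairseq checkpoint and fine-tuning dictionary.
convert_hubert_checkpoint(
    checkpoint_path="/path/to/hubert_ft.pt",
    pytorch_dump_folder_path="./hubert-converted",
    config_path=None,               # fall back to the default HubertConfig
    dict_path="/path/to/dict.ltr.txt",
    is_finetuned=True,              # builds tokenizer/processor and a HubertForCTC head
)
```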
.\models\hubert\convert_hubert_original_s3prl_checkpoint_to_pytorch.py
"""Convert Hubert checkpoint."""
import argparse
import torch
from transformers import HubertConfig, HubertForSequenceClassification, Wav2Vec2FeatureExtractor, logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SUPPORTED_MODELS = ["UtteranceLevel"]
@torch.no_grad()
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path):
"""
Copy/paste/tweak model's weights to transformers design.
"""
checkpoint = torch.load(checkpoint_path, map_location="cpu")
if checkpoint["Config"]["downstream_expert"]["modelrc"]["select"] not in SUPPORTED_MODELS:
raise NotImplementedError(f"The supported s3prl models are {SUPPORTED_MODELS}")
downstream_dict = checkpoint["Downstream"]
hf_config = HubertConfig.from_pretrained(config_path)
hf_model = HubertForSequenceClassification.from_pretrained(base_model_name, config=hf_config)
hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
base_model_name, return_attention_mask=True, do_normalize=False
)
if hf_config.use_weighted_layer_sum:
hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]
hf_model.projector.weight.data = downstream_dict["projector.weight"]
hf_model.projector.bias.data = downstream_dict["projector.bias"]
hf_model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"]
hf_model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"]
hf_feature_extractor.save_pretrained(model_dump_path)
hf_model.save_pretrained(model_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model."
)
parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.")
parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.")
args = parser.parse_args()
convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path)
.\models\hubert\modeling_hubert.py
""" PyTorch Hubert model."""
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_hubert import HubertConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "HubertConfig"
_CHECKPOINT_FOR_DOC = "facebook/hubert-large-ls960-ft"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]
_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
_CTC_EXPECTED_LOSS = 22.68
_SEQ_CLASS_CHECKPOINT = "superb/hubert-base-superb-ks"
_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'"
_SEQ_CLASS_EXPECTED_LOSS = 8.53
HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/hubert-base-ls960",
]
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
计算给定形状的随机掩码跨度。用于实现 ASR(自动语音识别)中的 SpecAugment 数据增强方法。
注意,此方法未经过优化,应在 CPU 上作为训练期间的预处理的一部分运行,而不是在 TPU 上运行。
"""
return np.ndarray
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
# 解包形状参数
batch_size, sequence_length = shape
# 检查是否小于1
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
# 检查是否大于序列长度
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
# epsilon 用于概率舍入
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
# 计算应该被遮罩的 span 数量
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
# 确保遮罩的 span 数量不低于最小要求
num_masked_span = max(num_masked_span, min_masks)
# 确保遮罩的 span 不超过序列长度
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
# 确保遮罩的 span 不超过 input_length - (mask_length - 1)
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
# 计算每个 batch 中的遮罩 span 的数量
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
# 创建用于 SpecAugment 的遮罩数组
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
# 计算最大允许的遮罩 span 数量
max_num_masked_span = compute_num_masked_span(sequence_length)
# 如果最大允许的遮罩 span 数量为 0,则直接返回空的遮罩数组
if max_num_masked_span == 0:
return spec_aug_mask
# 对于每个输入长度进行循环处理
for input_length in input_lengths:
# 计算当前输入的被遮挡(masked)span的数量
num_masked_span = compute_num_masked_span(input_length)
# 随机选择要遮挡的索引位置
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
# 选择第一个被抽样的索引,用作填充向量的虚拟索引
# 确保所有批次的维度一致,因为可能存在概率舍入
# 选择第一个样本只是将这些向量填充两次。
if len(spec_aug_mask_idx) == 0:
# 这种情况只能发生在`input_length`严格小于`sequence_length`的情况下,
# 此时最后一个标记必须是填充标记,我们可以将其用作虚拟掩码ID
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
# 将虚拟索引添加到`spec_aug_mask_idx`数组末尾,使其达到最大遮挡span数量
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
# 将列表转换为NumPy数组
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
# 将遮挡的索引扩展为遮挡span
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
# 将形状重新整理为(batch_size, max_num_masked_span * mask_length)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# 添加偏移量到起始索引,以创建遮挡span
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
# 确保索引不会超过序列长度
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
# 将1散布到遮挡的索引位置
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
# 返回生成的遮挡mask
return spec_aug_mask
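A quick toy run of `_compute_mask_indices` (numbers chosen arbitrarily) makes the output format obvious: a boolean array in which masked positions are `True`:

```python
import numpy as np

np.random.seed(0)                         # only so the toy run is repeatable
mask = _compute_mask_indices(shape=(2, 10), mask_prob=0.4, mask_length=2)
print(mask.dtype, mask.shape)             # bool (2, 10)
print(mask.sum(axis=-1))                  # ~mask_prob * 10 masked steps per row (an upper bound, spans may overlap)
```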
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertNoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果 layer_id 大于 0,则使用前一个卷积层的输出维度作为输入维度,否则使用 1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 使用当前层的卷积维度作为输出维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 根据配置选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 前向传播函数
def forward(self, hidden_states):
# 对输入的隐藏状态应用卷积操作
hidden_states = self.conv(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果 layer_id 大于 0,则使用前一个卷积层的输出维度作为输入维度,否则使用 1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 使用当前层的卷积维度作为输出维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 创建一个 LayerNorm 层,对输出维度进行归一化
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
# 根据配置选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 前向传播函数
def forward(self, hidden_states):
# 对输入的隐藏状态应用卷积操作
hidden_states = self.conv(hidden_states)
# 将卷积输出的维度换位,以便于 LayerNorm 的应用
hidden_states = hidden_states.transpose(-2, -1)
# 应用 LayerNorm
hidden_states = self.layer_norm(hidden_states)
# 恢复维度的排列顺序
hidden_states = hidden_states.transpose(-2, -1)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertGroupNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 如果 layer_id 大于 0,则使用前一个卷积层的输出维度作为输入维度,否则使用 1
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 使用当前层的卷积维度作为输出维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 根据配置选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 创建一个 GroupNorm 层,分组数为输出维度,通道数为输出维度
self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
# 前向传播函数
def forward(self, hidden_states):
# 对输入的隐藏状态应用卷积操作
hidden_states = self.conv(hidden_states)
# 应用 GroupNorm
hidden_states = self.layer_norm(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding 复制代码,并将 Wav2Vec2 替换为 Hubert
class HubertPositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
# 定义一维卷积层,用于位置编码
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
padding=config.num_conv_pos_embeddings // 2,
groups=config.num_conv_pos_embedding_groups,
)
# 初始化权重归一化函数
weight_norm = nn.utils.weight_norm
if hasattr(nn.utils.parametrizations, "weight_norm"):
weight_norm = nn.utils.parametrizations.weight_norm
# 如果启用了 DeepSpeed zero3 加速
if is_deepspeed_zero3_enabled():
import deepspeed
# 使用 GatheredParameters 将权重进行分组
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
self.conv = weight_norm(self.conv, name="weight", dim=2)
# 注册外部参数以便 DeepSpeed 管理
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
# 否则正常进行权重归一化
self.conv = weight_norm(self.conv, name="weight", dim=2)
# 创建用于填充的层
self.padding = HubertSamePadLayer(config.num_conv_pos_embeddings)
# 选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
# 转置隐藏状态张量的维度
hidden_states = hidden_states.transpose(1, 2)
# 进行卷积操作
hidden_states = self.conv(hidden_states)
# 进行填充操作
hidden_states = self.padding(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
# 再次转置隐藏状态张量的维度
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer 复制代码,并将 Wav2Vec2 替换为 Hubert
class HubertSamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
# 根据卷积位置编码的数量确定是否需要移除填充
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
# 如果需要移除填充,则进行切片操作
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, : -self.num_pad_remove]
return hidden_states
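The padding trim above exists because an even kernel with `padding=k//2` produces one extra time step; a minimal check (kernel size 128 is the usual `num_conv_pos_embeddings` default, assumed here):

```python
import torch
import torch.nn as nn

conv = nn.Conv1d(4, 4, kernel_size=128, padding=64)   # even kernel, padding = k // 2
x = torch.randn(1, 4, 50)
print(conv(x).shape)                                   # torch.Size([1, 4, 51]) -> one step too long
print(conv(x)[:, :, :-1].shape)                        # torch.Size([1, 4, 50]) -> what HubertSamePadLayer restores
```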
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder 复制代码,并将 Wav2Vec2 替换为 Hubert
class HubertFeatureEncoder(nn.Module):
"""从原始音频波形构建特征"""
# 构造函数留空,直接继承 nn.Module 的构造函数
# 初始化方法,接受一个配置对象作为参数
def __init__(self, config):
# 调用父类(superclass)的初始化方法
super().__init__()
# 根据配置文件中的特征提取归一化方式进行不同处理
if config.feat_extract_norm == "group":
# 如果是"group"方式,创建一组卷积层,第一个使用组归一化
conv_layers = [HubertGroupNormConvLayer(config, layer_id=0)] + [
HubertNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
# 如果是"layer"方式,创建一组卷积层,全部使用层归一化
conv_layers = [HubertLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
else:
# 如果归一化方式不是合法值,抛出数值错误异常
raise ValueError(
f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
)
# 将创建的卷积层列表转换为 nn.ModuleList,使其成为 nn.Module 的一部分
self.conv_layers = nn.ModuleList(conv_layers)
# 梯度检查点技术默认关闭
self.gradient_checkpointing = False
# 默认所有参数需要梯度计算
self._requires_grad = True
# 冻结模型参数,使其不再计算梯度
def _freeze_parameters(self):
# 遍历所有参数,设置其 requires_grad 属性为 False
for param in self.parameters():
param.requires_grad = False
# 同时设置模型的 _requires_grad 属性为 False
self._requires_grad = False
# 前向传播方法
def forward(self, input_values):
# 将输入数据转换为二维张量
hidden_states = input_values[:, None]
# 如果需要梯度并且当前处于训练模式,确保 hidden_states 的 requires_grad 属性为 True
if self._requires_grad and self.training:
hidden_states.requires_grad = True
# 遍历所有卷积层并逐层进行前向传播计算
for conv_layer in self.conv_layers:
# 如果需要梯度、开启了梯度检查点并且处于训练模式,则使用梯度检查点技术
if self._requires_grad and self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__, # 调用当前卷积层的前向传播方法
hidden_states, # 当前隐藏状态作为输入
)
else:
# 否则,直接调用当前卷积层的前向传播方法
hidden_states = conv_layer(hidden_states)
# 返回最终的隐藏状态作为输出
return hidden_states
class HubertFeatureExtractor(HubertFeatureEncoder):
# 继承自HubertFeatureEncoder类的特征提取器类
def __init__(self, config):
super().__init__(config)
# 警告:该类已被弃用,将在Transformers v5中移除,请使用`HubertFeatureEncoder`代替。
warnings.warn(
f"The class `{self.__class__.__name__}` has been depreciated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
class HubertFeatureProjection(nn.Module):
# Hubert特征投影模块
def __init__(self, config):
super().__init__()
self.feat_proj_layer_norm = config.feat_proj_layer_norm
if self.feat_proj_layer_norm:
# 如果配置中包含特征投影层标准化,则初始化LayerNorm
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
# 线性映射投影到隐藏层大小
self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
# 随机失活层
self.dropout = nn.Dropout(config.feat_proj_dropout)
def forward(self, hidden_states):
# 非投影的隐藏状态用于量化
if self.feat_proj_layer_norm:
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.projection(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
# 从transformers.models.bart.modeling_bart.BartAttention复制到HubertAttention,将Bart->Hubert
class HubertAttention(nn.Module):
"""来自'Attention Is All You Need'论文的多头注意力机制"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[HubertConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
# 线性映射层,用于查询、键、值和输出投影
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# 重新塑造张量形状,以便进行多头注意力计算
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
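The `_shape` helper above is the key reshaping step of the multi-head attention; a small check with arbitrary sizes shows the layout it produces:

```python
import torch

bsz, seq_len, embed_dim, num_heads = 2, 7, 768, 12
head_dim = embed_dim // num_heads
x = torch.randn(bsz, seq_len, embed_dim)
# Same view/transpose as HubertAttention._shape: (bsz, seq, embed) -> (bsz, heads, seq, head_dim)
per_head = x.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).contiguous()
print(per_head.shape)  # torch.Size([2, 12, 7, 64])
```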
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward复制到HubertFeedForward,用Hubert替换Wav2Vec2
class HubertFeedForward(nn.Module):
def __init__(self, config):
super().__init__()
self.intermediate_dropout = nn.Dropout(config.activation_dropout)
# 定义中间层全连接层,将隐藏大小转换为中间大小
self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
# 根据配置选择隐藏层激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# 定义输出全连接层,将中间大小转换回隐藏大小
self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.output_dropout = nn.Dropout(config.hidden_dropout)
def forward(self, hidden_states):
# 中间全连接层和激活函数
hidden_states = self.intermediate_dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.intermediate_dropout(hidden_states)
# 输出全连接层和dropout
hidden_states = self.output_dense(hidden_states)
hidden_states = self.output_dropout(hidden_states)
return hidden_states
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer复制到HubertEncoderLayer,用Hubert替换Wav2Vec2
class HubertEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 定义注意力层,使用HubertAttention
self.attention = HubertAttention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
self.dropout = nn.Dropout(config.hidden_dropout)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义FeedForward层,使用HubertFeedForward
self.feed_forward = HubertFeedForward(config)
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
# 记录注意力残差
attn_residual = hidden_states
# 执行注意力计算
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = self.dropout(hidden_states)
# 添加注意力残差到隐藏状态
hidden_states = attn_residual + hidden_states
# 层归一化
hidden_states = self.layer_norm(hidden_states)
# 使用FeedForward层处理隐藏状态
hidden_states = hidden_states + self.feed_forward(hidden_states)
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AttnAdapterLayer复制到HubertAttnAdapterLayer,用Hubert替换Wav2Vec2
class HubertAttnAdapterLayer(nn.Module):
def __init__(self, config):
"""
Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
up training throughput.
"""
# 调用父类的初始化方法
super().__init__()
# 设置输入维度为配置文件中的适配器注意力维度
self.input_dim = config.adapter_attn_dim
# 设置隐藏维度为配置文件中的隐藏大小
self.hidden_dim = config.hidden_size
# 使用LayerNorm对隐藏状态进行归一化
self.norm = nn.LayerNorm(self.hidden_dim)
# 第一个线性层,将隐藏状态映射到适配器注意力维度
self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim)
# 激活函数ReLU
self.act_fn = nn.ReLU()
# 第二个线性层,将适配器注意力维度映射回隐藏维度
self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim)
def forward(self, hidden_states: torch.FloatTensor):
# 对输入的隐藏状态进行LayerNorm归一化
hidden_states = self.norm(hidden_states)
# 第一个线性层的前向传播
hidden_states = self.linear_1(hidden_states)
# 应用ReLU激活函数
hidden_states = self.act_fn(hidden_states)
# 第二个线性层的前向传播
hidden_states = self.linear_2(hidden_states)
# 返回处理后的隐藏状态
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayerStableLayerNorm 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertEncoderLayerStableLayerNorm(nn.Module):
def __init__(self, config):
super().__init__()
# 定义自注意力层 HubertAttention,使用配置中的隐藏尺寸、注意力头数和注意力丢弃率,作为编码器而非解码器
self.attention = HubertAttention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
# 定义 Dropout 层,使用配置中的隐藏层丢弃率
self.dropout = nn.Dropout(config.hidden_dropout)
# 定义 LayerNorm 层,使用配置中的隐藏尺寸和层标准化系数
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义前馈神经网络 HubertFeedForward
self.feed_forward = HubertFeedForward(config)
# 定义最终的 LayerNorm 层,使用配置中的隐藏尺寸和层标准化系数
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 如果配置中有 adapter_attn_dim 属性,则定义 HubertAttnAdapterLayer,否则为 None
if getattr(config, "adapter_attn_dim", None) is not None:
self.adapter_layer = HubertAttnAdapterLayer(config)
else:
self.adapter_layer = None
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# 保存注意力残差
attn_residual = hidden_states
# 应用 LayerNorm 层
hidden_states = self.layer_norm(hidden_states)
# 应用自注意力层 HubertAttention,获取注意力权重(如果需要),输出新的隐藏状态
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
# 应用 Dropout
hidden_states = self.dropout(hidden_states)
# 加上注意力残差,形成新的隐藏状态
hidden_states = attn_residual + hidden_states
# 应用前馈神经网络,并加上最终的 LayerNorm
hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
# 如果存在 adapter_layer,则应用它
if self.adapter_layer is not None:
hidden_states = hidden_states + self.adapter_layer(hidden_states)
# 输出包含最终隐藏状态的元组 outputs
outputs = (hidden_states,)
# 如果需要输出注意力权重,则将注意力权重加入 outputs 元组
if output_attentions:
outputs += (attn_weights,)
return outputs
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder 复制过来,将 Wav2Vec2 替换为 Hubert
class HubertEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# 定义位置卷积嵌入层 HubertPositionalConvEmbedding
self.pos_conv_embed = HubertPositionalConvEmbedding(config)
# 定义 LayerNorm 层,使用配置中的隐藏尺寸和层标准化系数
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义 Dropout 层,使用配置中的隐藏层丢弃率
self.dropout = nn.Dropout(config.hidden_dropout)
# 定义多层 HubertEncoderLayer 层,并放入 nn.ModuleList 中
self.layers = nn.ModuleList([HubertEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# 是否启用渐变检查点
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 初始化隐藏状态输出,根据需要创建空元组或者None
all_hidden_states = () if output_hidden_states else None
# 初始化自注意力输出,根据需要创建空元组或者None
all_self_attentions = () if output_attentions else None
if attention_mask is not None:
# 确保填充的标记输出为0
expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
hidden_states[~expand_attention_mask] = 0
# 扩展注意力掩码
attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
attention_mask = attention_mask.expand(
attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
)
# 计算位置嵌入
position_embeddings = self.pos_conv_embed(hidden_states)
# 将位置嵌入加到隐藏状态上
hidden_states = hidden_states + position_embeddings
# LayerNorm 归一化
hidden_states = self.layer_norm(hidden_states)
# Dropout
hidden_states = self.dropout(hidden_states)
# 检查是否启用了deepspeed zero3
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
# 遍历所有层进行处理
for layer in self.layers:
if output_hidden_states:
# 如果需要输出隐藏状态,则将当前层的隐藏状态添加到all_hidden_states中
all_hidden_states = all_hidden_states + (hidden_states,)
# 添加LayerDrop(参见https://arxiv.org/abs/1909.11556 进行描述)
dropout_probability = torch.rand([])
# 根据LayerDrop的概率决定是否跳过当前层
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
if not skip_the_layer or deepspeed_zero3_is_enabled:
# 如果不跳过当前层或者启用了deepspeed zero3,则进行前向传播
if self.gradient_checkpointing and self.training:
# 使用梯度检查点进行前向传播(checkpointing)
layer_outputs = self._gradient_checkpointing_func(
layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
# 普通的前向传播
layer_outputs = layer(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = layer_outputs[0]
if skip_the_layer:
# 如果跳过当前层,则输出设置为None
layer_outputs = (None, None)
if output_attentions:
# 如果需要输出注意力权重,则将当前层的注意力权重添加到all_self_attentions中
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
# 如果需要输出隐藏状态,则将最终的隐藏状态添加到all_hidden_states中
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
# 如果不需要返回字典形式的输出,则返回一个元组,过滤掉为None的部分
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# 如果需要返回字典形式的输出,则创建BaseModelOutput对象并返回
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
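The LayerDrop rule used in the loop above is easy to see in isolation: during training, each layer is skipped independently with probability `config.layerdrop` (0.1 below is only an illustrative value, not read from a real config):

```python
import torch

layerdrop, num_layers = 0.1, 12            # illustrative values
skip = [bool(torch.rand([]) < layerdrop) for _ in range(num_layers)]
print(skip)                                # on average ~1 of 12 layers is skipped per forward pass
```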
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderStableLayerNorm 复制代码,并将 Wav2Vec2 替换为 Hubert
class HubertEncoderStableLayerNorm(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# 初始化位置编码卷积嵌入层,使用 HubertPositionalConvEmbedding 类
self.pos_conv_embed = HubertPositionalConvEmbedding(config)
# 初始化层归一化层,归一化隐藏状态特征向量
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 初始化丢弃层,以减少隐藏状态特征向量中的部分信息,防止过拟合
self.dropout = nn.Dropout(config.hidden_dropout)
# 初始化层列表,包含 HubertEncoderLayerStableLayerNorm 类的隐藏层,数量由配置中的 num_hidden_layers 决定
self.layers = nn.ModuleList(
[HubertEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]
)
# 梯度检查点设置为关闭状态
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
# 初始化所有隐藏状态为一个空元组,如果不输出隐藏状态则为 None
all_hidden_states = () if output_hidden_states else None
# 初始化所有自注意力权重为一个空元组,如果不输出注意力权重则为 None
all_self_attentions = () if output_attentions else None
# 如果存在注意力遮罩
if attention_mask is not None:
# 确保填充的标记不被注意到
expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
hidden_states[~expand_attention_mask] = 0
# 扩展注意力遮罩
attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
attention_mask = attention_mask.expand(
attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
)
# 计算位置嵌入
position_embeddings = self.pos_conv_embed(hidden_states)
hidden_states = hidden_states + position_embeddings
hidden_states = self.dropout(hidden_states)
# 检查是否启用了 DeepSpeed Zero3
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
# 对每个层进行循环
for layer in self.layers:
# 如果输出隐藏状态,则记录当前隐藏状态
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 添加 LayerDrop(参见 https://arxiv.org/abs/1909.11556)
dropout_probability = torch.rand([])
# 根据 LayerDrop 的概率决定是否跳过当前层
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
if not skip_the_layer or deepspeed_zero3_is_enabled:
# 如果启用了梯度检查点和处于训练模式,则使用梯度检查点来调用当前层
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
# 否则直接调用当前层
layer_outputs = layer(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = layer_outputs[0]
# 如果跳过当前层,则层输出为空
if skip_the_layer:
layer_outputs = (None, None)
# 如果输出注意力权重,则记录当前层的自注意力权重
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# 对最终的隐藏状态进行 Layer Norm 处理
hidden_states = self.layer_norm(hidden_states)
# 如果输出隐藏状态,则记录最终的隐藏状态
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不返回字典形式的结果,则按顺序返回相关结果
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# 否则返回 Base Model Output 对象
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# 指定配置类为 HubertConfig
config_class = HubertConfig
# 模型的前缀名为 "hubert"
base_model_prefix = "hubert"
# 主要输入的名称为 "input_values"
main_input_name = "input_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
# 如果是线性层,则使用正态分布初始化权重
if isinstance(module, nn.Linear):
# 与 TensorFlow 版本略有不同,后者使用截断正态分布进行初始化
# 参考 https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果是 LayerNorm 或 GroupNorm,则初始化偏置为零,权重为1
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
# 如果是 1D 卷积层
elif isinstance(module, nn.Conv1d):
# 检查是否启用了 DeepSpeed 的 Zero3 模式
if is_deepspeed_zero3_enabled():
import deepspeed
# 如果模块有 weight_v 和 weight_g 属性
if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
# 使用 GatheredParameters 进行初始化
with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
nn.init.kaiming_normal_(module.weight.data)
else:
# 使用 GatheredParameters 进行初始化
with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
nn.init.kaiming_normal_(module.weight.data)
else:
# 使用 kaiming_normal_ 方法初始化权重
nn.init.kaiming_normal_(module.weight.data)
# 如果是线性层或 1D 卷积层,并且有偏置,则将偏置初始化为零
if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
module.bias.data.zero_()
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the convolutional layers
"""
def _conv_out_length(input_length, kernel_size, stride):
# 计算 1D 卷积层的输出长度,使用公式来源于 PyTorch 文档
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
# 遍历配置中的卷积核大小和步长
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
# 更新输入长度为卷积层输出长度
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
return input_lengths
# 定义一个方法,用于生成特征向量的注意力掩码
def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
# 根据注意力掩码的长度信息计算输出长度,并转换为长整型
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# 获取当前批次的大小
batch_size = attention_mask.shape[0]
# 初始化一个全零的注意力掩码张量,形状为(batch_size, feature_vector_length)
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
# 设置注意力掩码的部分值为1,确保在输出长度之前的所有位置都被关注
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
# 翻转张量,并对每行进行累积求和,然后再次翻转,并将结果转换为布尔类型
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
# 返回生成的注意力掩码张量
return attention_mask
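Applying the output-length formula above layer by layer shows the overall downsampling of the feature encoder. The kernel/stride values below are the usual HubertConfig defaults (assumed here, not read from an actual config):

```python
def conv_out_length(length, kernel, stride):
    return (length - kernel) // stride + 1            # same formula as _conv_out_length

length = 16000                                        # one second of 16 kHz audio
for kernel, stride in zip((10, 3, 3, 3, 3, 2, 2), (5, 2, 2, 2, 2, 2, 2)):
    length = conv_out_length(length, kernel, stride)
print(length)                                         # 49 frames, i.e. roughly one frame per 20 ms
```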
HUBERT_START_DOCSTRING = r"""
Hubert was proposed in [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden
Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia,
Ruslan Salakhutdinov, Abdelrahman Mohamed.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving etc.).
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
HUBERT_INPUTS_DOCSTRING = r"""
Args:
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of the raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `List[float]` or a `numpy.ndarray`, e.g. via the `soundfile` library (`pip install soundfile`).
To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion into
a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
<Tip warning={true}>
`attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask == True`.
For all models whose processor has `config.return_attention_mask == False`, such as
[hubert-base](https://huggingface.co/facebook/hubert-base-ls960), `attention_mask` should **not** be passed to
avoid degraded performance when doing batched inference. For such models `input_values` should simply be padded
with 0 and passed without `attention_mask`. Be aware that these models also yield slightly different results
depending on whether `input_values` is padded or not.
</Tip>
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more
detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Hubert Model transformer outputting raw hidden-states without any specific head on top.",
HUBERT_START_DOCSTRING,
)
class HubertModel(HubertPreTrainedModel):
def __init__(self, config: HubertConfig):
super().__init__(config)
self.config = config # 初始化模型配置
self.feature_extractor = HubertFeatureEncoder(config) # 使用给定配置创建特征提取器
self.feature_projection = HubertFeatureProjection(config) # 使用给定配置创建特征投影器
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
# 如果配置中有时间或特征掩码概率大于零,则初始化一个可学习的掩码嵌入向量
if config.do_stable_layer_norm:
self.encoder = HubertEncoderStableLayerNorm(config)
# 如果配置要求稳定的层归一化,则使用稳定层归一化版本的编码器
else:
self.encoder = HubertEncoder(config)
# 否则使用普通版本的编码器
# 初始化权重并应用最终处理
self.post_init()
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states复制而来
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
# 检查配置中是否允许应用 SpecAugment,如果不允许,则直接返回隐藏状态
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
# 根据给定的 mask_time_indices 在时间轴上应用 SpecAugment
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
# calculate mask_time_indices if not provided explicitly
# 如果未明确提供 mask_time_indices,则计算它
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
# 生成索引并沿特征轴应用 SpecAugment
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
"""
Returns a tuple containing model outputs or a BaseModelOutput.
Example:
```
>>> from transformers import AutoProcessor, HubertModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values
>>> hidden_states = model(input_values).last_hidden_state
```"""
# Initialize variables with default values if not provided
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Extract features from input_values using feature_extractor
extract_features = self.feature_extractor(input_values)
extract_features = extract_features.transpose(1, 2) # Transpose dimensions for further processing
# Compute attention mask specific to feature vectors if provided
if attention_mask is not None:
attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
# Project features into hidden states
hidden_states = self.feature_projection(extract_features)
# Mask certain time indices in hidden states if specified
hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
# Encode hidden states using the encoder
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract last hidden states from encoder outputs
hidden_states = encoder_outputs[0]
# Return model outputs based on return_dict flag
if not return_dict:
return (hidden_states,) + encoder_outputs[1:] # Return tuple of hidden states and additional outputs
# Return BaseModelOutput object with specified attributes
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"""Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
HUBERT_START_DOCSTRING,
)
# 定义了一个名为 HubertForCTC 的类,继承自 HubertPreTrainedModel
# 此类实现了带有语言建模头部的 Hubert 模型,用于连接主义时间分类(CTC)任务。
class HubertForCTC(HubertPreTrainedModel):
def __init__(self, config, target_lang: Optional[str] = None):
super().__init__(config)
# 初始化 Hubert 模型
self.hubert = HubertModel(config)
# Dropout 层
self.dropout = nn.Dropout(config.final_dropout)
# 可选的目标语言设定
self.target_lang = target_lang
# 检查配置中是否定义了词汇表大小,如果没有则抛出异常
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
# 根据配置定义线性层作为语言建模头部
output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
# 初始化权重并应用最终处理
self.post_init()
# 覆盖 PreTrainedModel 的 tie_weights 方法,以便在通过 from_pretrained(...) 传递 target_lang=... 时能正确加载适配器权重
def tie_weights(self):
"""
This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
passing `target_lang=...` to `from_pretrained(...)`.
This method is **not** supposed to be called by the user and is prone to be changed in the future.
"""
# 注意,tie_weights 通常用于绑定输入和输出嵌入权重。在这里重新定义它的目的是为了正确加载 Hubert 的适配器层,
# 以便不需要引入新的 API 到 PreTrainedModel。虽然有些技巧性,但 Hubert 永远不需要绑定输入和输出嵌入,因此在这里重新用于适配器加载是可以接受的。
# 获取目标语言
target_lang = self.target_lang
# 如果 target_lang 不为 None,且配置中未定义 adapter_attn_dim,则抛出异常
if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
# 如果 target_lang 为 None,且配置中定义了 adapter_attn_dim,则记录日志提示用户默认 target_lang 为 'eng'
elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
logger.info("By default `target_lang` is set to 'eng'.")
# 如果 target_lang 不为 None,则加载适配器
elif target_lang is not None:
self.load_adapter(target_lang, force_load=True)
# 调用此函数将冻结特征编码器的梯度计算,使其在训练过程中不会更新参数。
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 发出警告提示,说明函数 `freeze_feature_extractor` 将在 Transformers v5 中移除,并建议使用 `freeze_feature_encoder` 替代。
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用 `freeze_feature_encoder` 方法来实现特征编码器参数的冻结。
self.freeze_feature_encoder()
# 调用此函数将冻结特征编码器的梯度计算,使其在训练过程中不会更新参数。
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 调用 Hubert 模型中的特征提取器的 `_freeze_parameters` 方法来冻结参数。
self.hubert.feature_extractor._freeze_parameters()
# 调用此函数将冻结基础模型的梯度计算,使其在训练过程中不会更新参数,仅分类头会更新。
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 遍历 Hubert 模型的所有参数,并将其 `requires_grad` 设置为 False,以冻结基础模型的参数。
for param in self.hubert.parameters():
param.requires_grad = False
# 重写 `forward` 方法,将其注解添加到模型的前向传播文档中,并附上代码示例的文档字符串。
@add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
# 初始化返回字典,如果未指定则使用配置中的返回字典设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用 Hubert 模型,获取输出结果
outputs = self.hubert(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从输出中获取隐藏状态,并应用 dropout
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
# 将隐藏状态传入语言模型头部,生成预测 logits
logits = self.lm_head(hidden_states)
# 初始化损失为 None
loss = None
if labels is not None:
# 如果标签存在,检查标签值是否超出词汇表大小,如果是则引发 ValueError
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
# 根据注意力掩码计算输入长度
attention_mask = (
attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# 假设填充的标记用 -100 填充,不被注意到时
# 创建标签掩码以计算目标长度
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# 使用 log_softmax 计算对数概率
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
# 禁用 cuDNN 以确保兼容性
with torch.backends.cudnn.flags(enabled=False):
# 计算 CTC 损失
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
# 如果不要求返回字典,则根据输出格式构建返回结果
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# 如果要求返回字典,则创建 CausalLMOutput 对象并返回
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
# 使用 Hubert 模型进行序列分类,该模型在顶部有一个用于分类的线性层(基于池化输出)
@add_start_docstrings(
"""
Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
SUPERB Keyword Spotting.
""",
HUBERT_START_DOCSTRING,
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification 复制而来,将 Wav2Vec2 改为 Hubert,wav2vec2 改为 hubert,WAV_2_VEC_2 改为 HUBERT
class HubertForSequenceClassification(HubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 如果配置中存在 `add_adapter` 属性且为 True,则抛出异常,因为序列分类不支持使用 Hubert 适配器
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)"
)
# 创建 HubertModel 对象
self.hubert = HubertModel(config)
# 计算层数,包括变换器层和输入嵌入层
num_layers = config.num_hidden_layers + 1
# 如果配置指定使用加权层求和,则初始化层权重
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 创建投影层,将隐藏状态映射到分类器投影空间
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
# 创建分类器层,将投影后的特征映射到类别数量
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
# 初始化权重并应用最终处理
self.post_init()
# 冻结特征提取器,不再更新其参数
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
# 冻结特征编码器,不再更新其参数
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.hubert.feature_extractor._freeze_parameters()
# 冻结基础模型,不再更新其参数,只更新分类头
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
for param in self.hubert.parameters():
param.requires_grad = False
# 将 HUBERT_INPUTS_DOCSTRING 添加到模型前向传播函数的文档字符串中
@add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
# 将代码示例的文档字符串添加到模型前向传播函数的文档字符串中,指定了检查点、输出类型、配置类、模态(audio)、预期输出和预期损失
@add_code_sample_docstrings(
checkpoint=_SEQ_CLASS_CHECKPOINT,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 设置是否返回字典形式的输出结果,默认为模型配置中指定的值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果配置中指定使用加权层求和的隐藏状态,则设置为True,否则使用传入参数
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 使用 Hubert 模型进行前向传播,获取输出结果
outputs = self.hubert(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果配置中指定使用加权层求和的隐藏状态
if self.config.use_weighted_layer_sum:
# 从输出结果中提取隐藏状态
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
# 将隐藏状态堆叠起来,按照加权向量进行加权求和
hidden_states = torch.stack(hidden_states, dim=1)
# 对加权向量进行 softmax 归一化处理
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
# 按加权向量加权求和隐藏状态
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
# 否则直接使用第一个输出的隐藏状态
hidden_states = outputs[0]
# 使用投影层进行映射
hidden_states = self.projector(hidden_states)
# 如果没有传入注意力掩码,则计算平均池化输出
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1)
else:
# 否则根据注意力掩码生成填充掩码,将填充位置的隐藏状态置为0
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
hidden_states[~padding_mask] = 0.0
# 计算填充掩码后的池化输出
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
# 使用分类器计算 logits
logits = self.classifier(pooled_output)
# 初始化损失为 None
loss = None
# 如果传入了标签,则计算交叉熵损失
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
# 如果不要求返回字典形式的输出
if not return_dict:
# 组装输出元组,包括 logits 和隐藏状态
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
# 如果有损失,则将损失加入输出元组
return ((loss,) + output) if loss is not None else output
# 如果要求返回字典形式的输出结果
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
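To close, a toy version of the weighted-layer-sum pooling used by the classification head above (shapes are arbitrary; in the real model `layer_weights` is a learnable `nn.Parameter` and the hidden states come from `output_hidden_states=True`):

```python
import torch
import torch.nn.functional as F

num_layers, batch, frames, hidden = 13, 2, 49, 768    # 12 transformer layers + the input embeddings
hidden_states = torch.stack([torch.randn(batch, frames, hidden) for _ in range(num_layers)], dim=1)
layer_weights = torch.ones(num_layers) / num_layers   # uniform init, learned during fine-tuning

norm_weights = F.softmax(layer_weights, dim=-1)
pooled = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)   # (batch, frames, hidden)
print(pooled.shape)                                                  # torch.Size([2, 49, 768])
```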