Transformers 源码解析（五十八）

`.\models\hubert\modeling_tf_hubert.py`

# 设置编码为 UTF-8
# 版权声明，指明版权归 Fairseq 作者和 HuggingFace Inc. 团队所有
#
# 根据 Apache License, Version 2.0 许可证，除非符合许可证要求，否则不得使用此文件
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按"现状"分发软件
# 没有任何明示或暗示的担保或条件。有关详细信息，请参阅许可证

""" TensorFlow Hubert 模型."""

from __future__ import annotations

# 引入警告模块
import warnings
# 引入类型提示
from typing import Any, Optional, Tuple, Union

# 引入 numpy 库，并命名为 np
import numpy as np
# 引入 TensorFlow 库，并命名为 tf
import tensorflow as tf

# 引入相关模块和函数
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
from ...modeling_tf_utils import (
    TFPreTrainedModel,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# 引入 Hubert 模型的配置文件
from .configuration_hubert import HubertConfig

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# 用于文档的配置名
_CONFIG_FOR_DOC = "HubertConfig"

# 预训练模型存档列表
TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/hubert-base-ls960",
    # 查看所有 Hubert 模型，请访问 https://huggingface.co/models?filter=hubert
]

# 定义一个大负数常量
LARGE_NEGATIVE = -1e8


# 从 transformers.models.wav2vec2.modeling_tf_wav2vec2._sample_without_replacement 复制而来
def _sample_without_replacement(distribution, num_samples):
    """
    未实现的无重复分类抽样。目前可以使用 Gumbel-max 技巧代替 - 参见
    https://github.com/tensorflow/tensorflow/issues/9260 了解更多信息
    """
    # 使用 Gumbel-max 技巧进行抽样
    z = -tf.math.log(tf.random.uniform(shape_list(distribution), 0, 1))
    _, indices = tf.nn.top_k(distribution + z, num_samples)
    return indices


# 从 transformers.models.wav2vec2.modeling_tf_wav2vec2._scatter_values_on_batch_indices 复制而来
def _scatter_values_on_batch_indices(values, batch_indices, output_shape):
    """
    类似于 PyTorch 中的 scatter 函数，使用格式为 (batch_dim, indices) 的索引
    """
    indices_shape = shape_list(batch_indices)
    # 将批次维度广播到 indices_shape
    broad_casted_batch_dims = tf.reshape(
        tf.broadcast_to(tf.expand_dims(tf.range(indices_shape[0]), axis=-1), indices_shape), [1, -1]
    )
    # 将 batch_indices 转换为 pair_indices
    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
    # 将值 values 散布到 pair_indices 上
    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), output_shape)


# 从 transformers.models.wav2vec2.modeling_tf_wav2vec2._compute_mask_indices 复制而来
def _compute_mask_indices(
    shape: Tuple[int, int],
    # 定义一个名为 shape 的变量，其类型为元组，包含两个整数值，分别表示形状的尺寸
    mask_prob: float,
    # 定义一个名为 mask_prob 的变量，其类型为浮点数，表示掩码生成的概率
    mask_length: int,
    # 定义一个名为 mask_length 的变量，其类型为整数，表示每个掩码的长度
    min_masks: int = 0,
    # 定义一个名为 min_masks 的变量，其类型为整数，默认值为 0，表示最少需要的掩码数量
def compute_random_mask_spans(shape: Tuple[int, int],
                              attention_mask: Optional[tf.Tensor] = None,
                              mask_prob: float = 0.15,
                              mask_length: int = 10,
                              min_masks: int = 0) -> tf.Tensor:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        attention_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob:
            probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_length: size of the mask
        min_masks: minimum number of masked spans

    Adapted from fairseq's data_utils.py.
    """

    # Extract batch size and sequence length from the shape tuple
    batch_size, sequence_length = shape

    # Check if mask length is valid
    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    # Assert that mask length is smaller than sequence length
    tf.debugging.assert_less(
        mask_length,
        sequence_length,
        message=(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and"
            f" `sequence_length`: {sequence_length}`"
        ),
    )

    # Compute the number of masked spans in the batch
    num_masked_spans = mask_prob * tf.cast(sequence_length, tf.float32) / mask_length + tf.random.uniform((1,))
    num_masked_spans = tf.maximum(num_masked_spans, min_masks)
    num_masked_spans = tf.cast(num_masked_spans, tf.int32)

    # Ensure num masked indices <= sequence length
    num_masked_spans = tf.math.minimum(sequence_length // mask_length, num_masked_spans)
    num_masked_spans = tf.squeeze(num_masked_spans)

    # Initialize the specAugment mask
    spec_aug_mask = tf.zeros((batch_size, sequence_length), dtype=tf.int32)

    # Create a uniform distribution to sample from, ensuring offset samples are < sequence_length
    uniform_dist = tf.ones((batch_size, sequence_length - (mask_length - 1)))

    # Get random indices to mask using _sample_without_replacement function
    spec_aug_mask_idxs = _sample_without_replacement(uniform_dist, num_masked_spans)

    # Expand masked indices to masked spans
    spec_aug_mask_idxs = tf.expand_dims(spec_aug_mask_idxs, -1)
    spec_aug_mask_idxs = tf.tile(spec_aug_mask_idxs, (1, 1, mask_length))
    spec_aug_mask_idxs = tf.reshape(spec_aug_mask_idxs, (batch_size, num_masked_spans * mask_length))

    # Create offsets for each mask span
    offsets = tf.range(mask_length)[tf.newaxis, tf.newaxis, :]
    offsets = tf.tile(offsets, (batch_size, num_masked_spans, 1))
    offsets = tf.reshape(offsets, (batch_size, num_masked_spans * mask_length))

    # Apply offsets to the mask indices
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # Scatter indices to mask using _scatter_values_on_batch_indices function
    spec_aug_mask = _scatter_values_on_batch_indices(
        tf.ones_like(spec_aug_mask_idxs), spec_aug_mask_idxs, tf.shape(spec_aug_mask)
    )

    return spec_aug_mask
# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    # 获取输入张量的第二维长度，即序列长度
    src_len = shape_list(mask)[1]
    # 如果未提供目标长度，则默认使用源长度
    tgt_len = tgt_len if tgt_len is not None else src_len
    # 创建常量张量，数值为1.0
    one_cst = tf.constant(1.0)
    # 将输入的 mask 转换为浮点型张量
    mask = tf.cast(mask, dtype=one_cst.dtype)
    # 在第二维和第三维上复制 mask 张量，扩展为 `[bsz, 1, tgt_len, src_len]`
    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (one_cst - expanded_mask) * LARGE_NEGATIVE


# Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNorm with Wav2Vec2->Hubert
class TFHubertGroupNorm(keras.layers.Layer):
    """
    From tensorflow-addons https://www.tensorflow.org/addons/api_docs/python/tfa/layers/GroupNormalization
    """

    def __init__(
        self,
        groups: int = 32,
        axis: int = -1,
        epsilon: float = 1e-3,
        center: bool = True,
        scale: bool = True,
        beta_initializer: keras.initializers.Initializer = "zeros",
        gamma_initializer: keras.initializers.Initializer = "ones",
        beta_regularizer: keras.regularizers.Regularizer = None,
        gamma_regularizer: keras.regularizers.Regularizer = None,
        beta_constraint: keras.constraints.Constraint = None,
        gamma_constraint: keras.constraints.Constraint = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.supports_masking = True
        # 设置 GroupNormalization 的参数
        self.groups = groups
        self.axis = axis
        self.epsilon = epsilon
        self.center = center
        self.scale = scale
        self.beta_initializer = keras.initializers.get(beta_initializer)
        self.gamma_initializer = keras.initializers.get(gamma_initializer)
        self.beta_regularizer = keras.regularizers.get(beta_regularizer)
        self.gamma_regularizer = keras.regularizers.get(gamma_regularizer)
        self.beta_constraint = keras.constraints.get(beta_constraint)
        self.gamma_constraint = keras.constraints.get(gamma_constraint)
        self._check_axis()

    def build(self, input_shape):
        # 检查输入张量的形状是否为 None
        self._check_if_input_shape_is_none(input_shape)
        # 设置实例标准化中的组数
        self._set_number_of_groups_for_instance_norm(input_shape)
        # 检查维度大小
        self._check_size_of_dimensions(input_shape)
        # 创建输入规范
        self._create_input_spec(input_shape)

        # 添加 gamma 权重
        self._add_gamma_weight(input_shape)
        # 添加 beta 权重
        self._add_beta_weight(input_shape)
        self.built = True
        super().build(input_shape)
    # 定义一个方法，用于处理输入数据
    def call(self, inputs):
        # 获取输入数据的静态形状
        input_shape = keras.backend.int_shape(inputs)
        # 获取输入数据的动态形状
        tensor_input_shape = tf.shape(inputs)

        # 调用内部方法对输入数据进行分组重塑操作
        reshaped_inputs, group_shape = self._reshape_into_groups(inputs, input_shape, tensor_input_shape)

        # 对重塑后的数据应用规范化操作
        normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape)

        # 判断是否为实例规范化
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            # 如果不是实例规范化，将规范化后的数据重新整形为原始输入数据的形状
            outputs = tf.reshape(normalized_inputs, tensor_input_shape)
        else:
            # 如果是实例规范化，则直接使用规范化后的数据作为输出
            outputs = normalized_inputs

        # 返回处理后的输出数据
        return outputs

    # 获取当前层的配置信息，用于模型保存和加载时使用
    def get_config(self):
        config = {
            "groups": self.groups,
            "axis": self.axis,
            "epsilon": self.epsilon,
            "center": self.center,
            "scale": self.scale,
            "beta_initializer": keras.initializers.serialize(self.beta_initializer),
            "gamma_initializer": keras.initializers.serialize(self.gamma_initializer),
            "beta_regularizer": keras.regularizers.serialize(self.beta_regularizer),
            "gamma_regularizer": keras.regularizers.serialize(self.gamma_regularizer),
            "beta_constraint": keras.constraints.serialize(self.beta_constraint),
            "gamma_constraint": keras.constraints.serialize(self.gamma_constraint),
        }
        # 调用父类方法获取基础配置信息，并合并当前层的配置信息
        base_config = super().get_config()
        return {**base_config, **config}

    # 计算输出形状，这里直接返回输入形状
    def compute_output_shape(self, input_shape):
        return input_shape

    # 内部方法：将输入数据重塑为分组形式
    def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape):
        # 复制输入数据的形状作为分组形状的基础
        group_shape = [tensor_input_shape[i] for i in range(len(input_shape))]
        # 判断是否为实例规范化
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            # 如果不是实例规范化，根据分组数调整分组形状
            group_shape[self.axis] = input_shape[self.axis] // self.groups
            group_shape.insert(self.axis, self.groups)
            group_shape = tf.stack(group_shape)
            # 对输入数据进行形状重塑操作
            reshaped_inputs = tf.reshape(inputs, group_shape)
            return reshaped_inputs, group_shape
        else:
            # 如果是实例规范化，则直接返回原始输入数据和分组形状
            return inputs, group_shape

    # 内部方法：对重塑后的数据应用规范化操作
    def _apply_normalization(self, reshaped_inputs, input_shape):
        # 获取分组后数据的形状
        group_shape = keras.backend.int_shape(reshaped_inputs)
        # 确定规范化操作的约简轴
        group_reduction_axes = list(range(1, len(group_shape)))
        # 判断是否为实例规范化
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        if not is_instance_norm:
            # 如果不是实例规范化，调整约简轴的位置
            axis = -2 if self.axis == -1 else self.axis - 1
        else:
            axis = -1 if self.axis == -1 else self.axis - 1
        group_reduction_axes.pop(axis)

        # 计算分组均值和方差
        mean, variance = tf.nn.moments(reshaped_inputs, group_reduction_axes, keepdims=True)

        # 获取调整后的权重参数
        gamma, beta = self._get_reshaped_weights(input_shape)

        # 对重塑后的数据应用批量规范化操作
        normalized_inputs = tf.nn.batch_normalization(
            reshaped_inputs,
            mean=mean,
            variance=variance,
            scale=gamma,
            offset=beta,
            variance_epsilon=self.epsilon,
        )
        return normalized_inputs
    # 获取重塑后的权重，根据输入形状创建广播形状
    def _get_reshaped_weights(self, input_shape):
        broadcast_shape = self._create_broadcast_shape(input_shape)
        gamma = None
        beta = None
        # 如果启用了标准化参数，将 gamma 重塑为广播形状
        if self.scale:
            gamma = tf.reshape(self.gamma, broadcast_shape)

        # 如果启用了中心化参数，将 beta 重塑为广播形状
        if self.center:
            beta = tf.reshape(self.beta, broadcast_shape)
        return gamma, beta

    # 检查输入形状是否有未定义的维度
    def _check_if_input_shape_is_none(self, input_shape):
        dim = input_shape[self.axis]
        if dim is None:
            raise ValueError(
                "Axis "
                + str(self.axis)
                + " of input tensor should have a defined dimension but the layer received an input with shape "
                + str(input_shape)
                + "."
            )

    # 为实例标准化设置组数
    def _set_number_of_groups_for_instance_norm(self, input_shape):
        dim = input_shape[self.axis]

        # 如果未指定组数，将组数设置为输入张量的维度
        if self.groups == -1:
            self.groups = dim

    # 检查维度的大小是否符合要求
    def _check_size_of_dimensions(self, input_shape):
        dim = input_shape[self.axis]
        # 检查组数是否超过通道数
        if dim < self.groups:
            raise ValueError(
                "Number of groups ("
                + str(self.groups)
                + ") cannot be more than the number of channels ("
                + str(dim)
                + ")."
            )

        # 检查组数是否是通道数的倍数
        if dim % self.groups != 0:
            raise ValueError(
                "Number of groups ("
                + str(self.groups)
                + ") must be a multiple of the number of channels ("
                + str(dim)
                + ")."
            )

    # 检查是否尝试标准化批处理轴
    def _check_axis(self):
        if self.axis == 0:
            raise ValueError(
                "You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead"
            )

    # 创建输入规范
    def _create_input_spec(self, input_shape):
        dim = input_shape[self.axis]
        # 根据输入形状创建输入规范
        self.input_spec = keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim})

    # 添加 gamma 权重
    def _add_gamma_weight(self, input_shape):
        dim = input_shape[self.axis]
        shape = (dim,)
        
        # 如果启用了标准化，添加 gamma 权重
        if self.scale:
            self.gamma = self.add_weight(
                shape=shape,
                name="gamma",
                initializer=self.gamma_initializer,
                regularizer=self.gamma_regularizer,
                constraint=self.gamma_constraint,
            )
        else:
            self.gamma = None

    # 添加 beta 权重
    def _add_beta_weight(self, input_shape):
        dim = input_shape[self.axis]
        shape = (dim,)

        # 如果启用了中心化，添加 beta 权重
        if self.center:
            self.beta = self.add_weight(
                shape=shape,
                name="beta",
                initializer=self.beta_initializer,
                regularizer=self.beta_regularizer,
                constraint=self.beta_constraint,
            )
        else:
            self.beta = None
    # 定义一个方法用于创建广播形状，根据输入的形状来确定广播后的形状
    def _create_broadcast_shape(self, input_shape):
        # 创建一个与输入形状长度相同的列表，初始值全部为1，用于构建广播形状
        broadcast_shape = [1] * len(input_shape)
        # 判断是否是实例归一化，这里通过检查特定轴上的尺寸是否等于组数来确定
        is_instance_norm = (input_shape[self.axis] // self.groups) == 1
        # 如果不是实例归一化
        if not is_instance_norm:
            # 将广播形状中特定轴的尺寸设置为输入形状中特定轴的尺寸除以组数的结果
            broadcast_shape[self.axis] = input_shape[self.axis] // self.groups
            # 在特定轴前插入组数，以便于构建正确的广播形状
            broadcast_shape.insert(self.axis, self.groups)
        else:
            # 如果是实例归一化，则直接将广播形状中特定轴的尺寸设置为组数
            broadcast_shape[self.axis] = self.groups
        # 返回构建好的广播形状
        return broadcast_shape
# 从 transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2WeightNormConv1D 复制而来，将 Wav2Vec2 改为 Hubert
class TFHubertWeightNormConv1D(keras.layers.Conv1D):
    """从 https://www.tensorflow.org/probability/api_docs/python/tfp/layers/weight_norm/WeightNorm 改编"""

    def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs):
        # 调用 Conv1D 的初始化方法，设定卷积核参数
        super().__init__(
            filters=filters,
            kernel_size=kernel_size,
            groups=groups,
            padding="valid",  # 使用有效填充方式
            use_bias=True,  # 使用偏置
            bias_initializer="he_normal",  # 偏置初始化方式为 he_normal
            **kwargs,
        )
        # 设置显式填充和卷积的通道方向
        self.explicit_padding = explicit_padding
        self.filter_axis = 2  # 卷积核的轴数
        self.kernel_norm_axes = tf.constant([0, 1])  # 卷积核的归一化轴

    def _init_norm(self):
        """设置权重向量的范数。"""
        kernel_norm = tf.sqrt(tf.reduce_sum(tf.square(self.weight_v), axis=self.kernel_norm_axes))
        self.weight_g.assign(kernel_norm[:, tf.newaxis, tf.newaxis])

    def _normalize_kernel(self):
        """生成归一化的权重。"""
        kernel = tf.nn.l2_normalize(self.weight_v, axis=self.kernel_norm_axes) * tf.transpose(self.weight_g)
        self.kernel = tf.transpose(kernel)

    def build(self, input_shape):
        if not self.built:
            super().build(input_shape)

            # 初始化权重向量并设为可训练
            self.kernel = tf.Variable(tf.transpose(self.kernel), name="weight_v", trainable=True)
            self.weight_v = self.kernel

            # 添加权重 g，初始化为全1，设为可训练
            self.weight_g = self.add_weight(
                name="weight_g",
                shape=(int(self.weight_v.shape[self.filter_axis]), 1, 1),
                initializer="ones",
                dtype=self.weight_v.dtype,
                trainable=True,
            )
            # 初始化权重向量的范数
            self._init_norm()
            # 添加偏置，并初始化为0，设为可训练
            self.bias = self.add_weight(name="bias", shape=(self.filters,), initializer="zeros", trainable=True)

    def call(self, inputs):
        # TODO Matt: 在 call() 中对属性进行赋值在 TensorFlow 中是不正确的，应该保持幂等性。
        #            这整个层应该被替换为一个不继承 Conv1D 的层，而是调用一个生成归一化权重的函数性1D卷积。
        self._normalize_kernel()

        # 对输入进行显式填充
        padded_inputs = tf.pad(inputs, ((0, 0), (self.explicit_padding, self.explicit_padding), (0, 0)))
        # 调用父类 Conv1D 的 call 方法进行卷积运算
        output = super().call(padded_inputs)

        return output
    # 初始化方法，用于设置对象的初始状态
    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
        # 调用父类的初始化方法，传递额外的关键字参数
        super().__init__(**kwargs)
        
        # 设置输入卷积维度为配置对象中的 conv_dim[layer_id]，若 layer_id > 0 则取对应的值，否则设为 1
        self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
        
        # 设置输出卷积维度为配置对象中的 conv_dim[layer_id]
        self.out_conv_dim = config.conv_dim[layer_id]

        # 创建一个 1D 卷积层对象，设置滤波器数量、卷积核大小、步长、是否使用偏置，并命名为 "conv"
        self.conv = keras.layers.Conv1D(
            filters=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            strides=config.conv_stride[layer_id],
            use_bias=config.conv_bias,
            name="conv",
        )
        
        # 获取激活函数，根据配置中的 feat_extract_activation 来选择
        self.activation = get_tf_activation(config.feat_extract_activation)

    # 调用方法，用于执行前向传播计算
    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        # 对输入张量进行一维卷积操作
        hidden_states = self.conv(hidden_states)
        
        # 应用激活函数到卷积后的张量
        hidden_states = self.activation(hidden_states)
        
        # 返回处理后的张量作为输出
        return hidden_states

    # 构建方法，用于构建层的变量和权重，确保在首次调用 call 方法时已经构建
    def build(self, input_shape=None):
        # 如果已经构建过，则直接返回
        if self.built:
            return
        
        # 标记为已构建状态
        self.built = True
        
        # 如果存在卷积层对象，则在名称作用域下构建卷积层，指定输入形状为 [None, None, self.in_conv_dim]
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.in_conv_dim])
# 从transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2LayerNormConvLayer复制代码，将Wav2Vec2改为Hubert
class TFHubertLayerNormConvLayer(keras.layers.Layer):
    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # 初始化卷积层的输入维度和输出维度
        self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        # 创建一个一维卷积层对象
        self.conv = keras.layers.Conv1D(
            filters=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            strides=config.conv_stride[layer_id],
            use_bias=config.conv_bias,
            name="conv",
        )
        # 创建一个层归一化层对象
        self.layer_norm = keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps)
        # 获取激活函数对象
        self.activation = get_tf_activation(config.feat_extract_activation)

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        # 执行一维卷积操作
        hidden_states = self.conv(hidden_states)
        # 执行层归一化操作
        hidden_states = self.layer_norm(hidden_states)
        # 执行激活函数操作
        hidden_states = self.activation(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        # 如果已经构建则直接返回
        if self.built:
            return
        self.built = True
        # 构建卷积层对象
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.in_conv_dim])
        # 构建层归一化层对象
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.out_conv_dim])


# 从transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNormConvLayer复制代码，将Wav2Vec2改为Hubert
class TFHubertGroupNormConvLayer(keras.layers.Layer):
    def __init__(self, config: HubertConfig, layer_id: int = 0, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # 初始化卷积层的输入维度和输出维度
        self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        # 创建一个一维卷积层对象
        self.conv = keras.layers.Conv1D(
            filters=self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            strides=config.conv_stride[layer_id],
            use_bias=config.conv_bias,
            name="conv",
        )
        # 获取激活函数对象
        self.activation = get_tf_activation(config.feat_extract_activation)
        # 创建一个组归一化层对象
        self.layer_norm = TFHubertGroupNorm(groups=self.out_conv_dim, epsilon=config.layer_norm_eps, name="layer_norm")

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        # 执行一维卷积操作
        hidden_states = self.conv(hidden_states)
        # 执行组归一化操作
        hidden_states = self.layer_norm(hidden_states)
        # 执行激活函数操作
        hidden_states = self.activation(hidden_states)
        return hidden_states
    # 定义一个方法 `build`，用于构建神经网络层的结构
    def build(self, input_shape=None):
        # 如果已经构建过，直接返回，不重复构建
        if self.built:
            return
        # 将标记设置为已构建
        self.built = True
        
        # 如果存在 `conv` 属性，执行以下操作
        if getattr(self, "conv", None) is not None:
            # 使用 `tf.name_scope` 创建名为 `self.conv.name` 的命名空间
            with tf.name_scope(self.conv.name):
                # 使用 `self.in_conv_dim` 参数构建 `conv` 层
                self.conv.build([None, None, self.in_conv_dim])
        
        # 如果存在 `layer_norm` 属性，执行以下操作
        if getattr(self, "layer_norm", None) is not None:
            # 使用 `tf.name_scope` 创建名为 `self.layer_norm.name` 的命名空间
            with tf.name_scope(self.layer_norm.name):
                # 使用 `self.out_conv_dim` 参数构建 `layer_norm` 层
                self.layer_norm.build([None, None, self.out_conv_dim])
# 定义一个名为 TFHubertPositionalConvEmbedding 的自定义层，继承自 keras 的 Layer 类
class TFHubertPositionalConvEmbedding(keras.layers.Layer):
    # 初始化方法，接受一个 HubertConfig 对象作为参数
    def __init__(self, config: HubertConfig, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # 创建一个 TFHubertWeightNormConv1D 类对象，用于卷积操作
        self.conv = TFHubertWeightNormConv1D(
            filters=config.hidden_size,  # 卷积输出的维度大小
            kernel_size=config.num_conv_pos_embeddings,  # 卷积核的大小
            groups=config.num_conv_pos_embedding_groups,  # 卷积操作时的组数
            explicit_padding=config.num_conv_pos_embeddings // 2,  # 明确的填充大小
            name="conv",  # 层的名称
        )
        # 创建一个 TFHubertSamePadLayer 类对象，用于进行相同的填充操作
        self.padding = TFHubertSamePadLayer(config.num_conv_pos_embeddings)
        # 获取激活函数，根据配置参数中的 feat_extract_activation 设置
        self.activation = get_tf_activation(config.feat_extract_activation)
        self.config = config  # 保存配置对象

    # 定义 call 方法，接受输入的 hidden_states 张量，返回处理后的张量
    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        hidden_states = self.conv(hidden_states)  # 进行卷积操作
        hidden_states = self.padding(hidden_states)  # 进行填充操作
        hidden_states = self.activation(hidden_states)  # 应用激活函数
        return hidden_states  # 返回处理后的张量

    # build 方法，用于构建层，根据输入形状 input_shape 构建 conv 层
    def build(self, input_shape=None):
        if self.built:  # 如果已经构建过，则直接返回
            return
        self.built = True  # 将 built 标记为 True，表示已构建
        if getattr(self, "conv", None) is not None:
            with tf.name_scope(self.conv.name):
                self.conv.build([None, None, self.config.hidden_size])
                # 使用配置中的 hidden_size 构建 conv 层的形状
    # 定义模型的构建方法，用于构建模型的层次结构
    def build(self, input_shape=None):
        # 如果模型已经构建完成，则直接返回，不进行重复构建
        if self.built:
            return
        # 将模型标记为已构建状态
        self.built = True
        # 遍历模型中的每个卷积层
        for conv_layer in self.conv_layers:
            # 使用 TensorFlow 的命名空间为当前卷积层命名
            with tf.name_scope(conv_layer.name):
                # 构建当前卷积层，input_shape=None 表示使用默认输入形状
                conv_layer.build(None)
# 定义 TFHubertFeatureExtractor 类，继承自 TFHubertFeatureEncoder 类
class TFHubertFeatureExtractor(TFHubertFeatureEncoder):
    def __init__(self, config, **kwargs):
        # 调用父类 TFHubertFeatureEncoder 的构造函数
        super().__init__(config, **kwargs)
        # 发出警告，提醒该类已被弃用，并将在 Transformers v5 版本中移除，建议使用其基类代替
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been depreciated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )


# 定义 TFHubertFeatureProjection 类，继承自 keras.layers.Layer 类
class TFHubertFeatureProjection(keras.layers.Layer):
    def __init__(self, config: HubertConfig, **kwargs):
        # 调用父类的构造函数
        super().__init__(**kwargs)

        # 初始化层归一化模块，使用给定的 epsilon 值
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        
        # 初始化全连接层，用于特征投影
        self.projection = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="projection",
        )
        
        # 初始化 Dropout 层，用于在训练时进行随机失活
        self.dropout = keras.layers.Dropout(rate=config.feat_proj_dropout)
        
        # 保存配置信息
        self.config = config

    # 定义调用方法，实现特征投影过程
    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        # 应用层归一化
        hidden_states = self.layer_norm(hidden_states)
        
        # 应用特征投影
        hidden_states = self.projection(hidden_states)
        
        # 应用 Dropout
        hidden_states = self.dropout(hidden_states, training=training)
        
        return hidden_states

    # 定义构建方法，用于构建层对象
    def build(self, input_shape=None):
        # 如果已经构建过，直接返回
        if self.built:
            return
        
        # 标记为已构建
        self.built = True
        
        # 构建层归一化模块，使用输入形状和配置的最后一个维度
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.conv_dim[-1]])
        
        # 构建特征投影层，使用输入形状和配置的最后一个维度
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, self.config.conv_dim[-1]])


# 从 transformers.models.bart.modeling_tf_bart.TFBartAttention 复制并改名为 TFHubertAttention
class TFHubertAttention(keras.layers.Layer):
    """Multi-headed attention from "Attention Is All You Need"""

    # 初始化多头注意力层
    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

        self.num_heads = num_heads
        self.dropout = keras.layers.Dropout(dropout)
        self.head_dim = embed_dim // num_heads
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")


        # 初始化函数，用于初始化模型的参数和属性
        super().__init__(**kwargs)
        # 设置嵌入维度
        self.embed_dim = embed_dim

        # 设置注意力头的数量
        self.num_heads = num_heads
        # 设置 dropout 层，用于在训练过程中随机丢弃部分神经元，防止过拟合
        self.dropout = keras.layers.Dropout(dropout)
        # 计算每个注意力头的维度
        self.head_dim = embed_dim // num_heads
        # 检查 embed_dim 是否可以被 num_heads 整除
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # 缩放因子，用于缩放注意力分数
        self.scaling = self.head_dim**-0.5
        # 是否为解码器的标志位
        self.is_decoder = is_decoder

        # 初始化键、查询、值以及输出的投影层
        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")


    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))


        # 重新塑造张量的形状，以适应多头注意力的需求
        def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
            return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))


    def call(
        self,
        hidden_states: tf.Tensor,
        key_value_states: tf.Tensor | None = None,
        past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
        attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        training: Optional[bool] = False,
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "k_proj", None) is not None:
            with tf.name_scope(self.k_proj.name):
                self.k_proj.build([None, None, self.embed_dim])
        if getattr(self, "q_proj", None) is not None:
            with tf.name_scope(self.q_proj.name):
                self.q_proj.build([None, None, self.embed_dim])
        if getattr(self, "v_proj", None) is not None:
            with tf.name_scope(self.v_proj.name):
                self.v_proj.build([None, None, self.embed_dim])
        if getattr(self, "out_proj", None) is not None:
            with tf.name_scope(self.out_proj.name):
                self.out_proj.build([None, None, self.embed_dim])


        # 模型的调用方法，定义了模型的前向传播逻辑
        def call(
            self,
            hidden_states: tf.Tensor,
            key_value_states: tf.Tensor | None = None,
            past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
            attention_mask: tf.Tensor | None = None,
            layer_head_mask: tf.Tensor | None = None,
            training: Optional[bool] = False,
        # 模型的构建方法，用于构建模型的层次结构
        def build(self, input_shape=None):
            if self.built:
                return
            self.built = True
            # 构建键、查询、值以及输出的投影层
            if getattr(self, "k_proj", None) is not None:
                with tf.name_scope(self.k_proj.name):
                    self.k_proj.build([None, None, self.embed_dim])
            if getattr(self, "q_proj", None) is not None:
                with tf.name_scope(self.q_proj.name):
                    self.q_proj.build([None, None, self.embed_dim])
            if getattr(self, "v_proj", None) is not None:
                with tf.name_scope(self.v_proj.name):
                    self.v_proj.build([None, None, self.embed_dim])
            if getattr(self, "out_proj", None) is not None:
                with tf.name_scope(self.out_proj.name):
                    self.out_proj.build([None, None, self.embed_dim])
# 从 transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2FeedForward 复制代码，将 Wav2Vec2 替换为 Hubert
class TFHubertFeedForward(keras.layers.Layer):
    def __init__(self, config: HubertConfig, **kwargs):
        super().__init__(**kwargs)

        # 中间层的 Dropout，使用给定的激活 dropout 率
        self.intermediate_dropout = keras.layers.Dropout(config.activation_dropout)

        # 中间层的全连接层，设置单元数、权重和偏置的初始化方式，并命名为 "intermediate_dense"
        self.intermediate_dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="intermediate_dense",
        )
        # 中间层的激活函数，根据配置选择 Tensorflow 的激活函数
        self.intermediate_act_fn = get_tf_activation(config.hidden_act)

        # 输出层的全连接层，设置单元数、权重和偏置的初始化方式，并命名为 "output_dense"
        self.output_dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            bias_initializer="zeros",
            name="output_dense",
        )
        # 输出层的 Dropout，使用给定的隐藏 dropout 率
        self.output_dropout = keras.layers.Dropout(config.hidden_dropout)
        self.config = config

    # 调用函数，实现前向传播
    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        # 中间层全连接操作
        hidden_states = self.intermediate_dense(hidden_states)
        # 中间层激活函数操作
        hidden_states = self.intermediate_act_fn(hidden_states)
        # 中间层 Dropout 操作，根据训练模式决定是否启用
        hidden_states = self.intermediate_dropout(hidden_states, training=training)

        # 输出层全连接操作
        hidden_states = self.output_dense(hidden_states)
        # 输出层 Dropout 操作，根据训练模式决定是否启用
        hidden_states = self.output_dropout(hidden_states, training=training)
        return hidden_states

    # 构建层，初始化中间层和输出层的权重和偏置
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # 如果中间层已存在，则构建中间层
        if getattr(self, "intermediate_dense", None) is not None:
            with tf.name_scope(self.intermediate_dense.name):
                self.intermediate_dense.build([None, None, self.config.hidden_size])
        # 如果输出层已存在，则构建输出层
        if getattr(self, "output_dense", None) is not None:
            with tf.name_scope(self.output_dense.name):
                self.output_dense.build([None, None, self.config.intermediate_size])


# 从 transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayer 复制代码，将 Wav2Vec2 替换为 Hubert
class TFHubertEncoderLayer(keras.layers.Layer):
    def __init__(self, config: HubertConfig, **kwargs):
        super().__init__(**kwargs)

        # 使用 HubertConfig 初始化注意力机制层
        self.attention = TFHubertAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
            name="attention",
        )
        # dropout 层，使用给定的隐藏 dropout 率
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        # 层归一化，设置 epsilon 值并命名为 "layer_norm"
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        # 前馈神经网络层，使用给定的 HubertConfig 配置并命名为 "feed_forward"
        self.feed_forward = TFHubertFeedForward(config, name="feed_forward")
        # 最终层归一化，设置 epsilon 值并命名为 "final_layer_norm"
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
        self.config = config
    # 定义一个方法 `call`，用于执行 Transformer 层的前向传播
    def call(
        self,
        hidden_states: tf.Tensor,  # 输入张量 hidden_states，表示输入的隐藏状态
        attention_mask: tf.Tensor | None = None,  # 注意力掩码张量，默认为 None
        output_attentions: Optional[bool] = False,  # 是否输出注意力权重，默认为 False
        training: bool = False,  # 是否处于训练模式，默认为 False
    ) -> Tuple[tf.Tensor]:  # 返回一个元组，包含类型为 tf.Tensor 的 hidden_states

        # 复制隐藏状态作为注意力残差
        attn_residual = hidden_states
        # 调用 self.attention 对象的前向传播方法，获取更新后的 hidden_states、注意力权重 attn_weights 和一个占位符 _
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, training=training
        )
        # 在训练中使用 dropout 处理 hidden_states
        hidden_states = self.dropout(hidden_states, training=training)
        # 将注意力残差与更新后的 hidden_states 相加，得到新的 hidden_states
        hidden_states = attn_residual + hidden_states

        # 使用层归一化层处理 hidden_states
        hidden_states = self.layer_norm(hidden_states)
        # 将隐藏状态输入到 feed_forward 网络中，再将结果与原始 hidden_states 相加
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        # 最终再次进行层归一化处理
        hidden_states = self.final_layer_norm(hidden_states)

        # 构建输出元组，初始包含更新后的 hidden_states
        outputs = (hidden_states,)

        # 如果设置输出注意力权重，则将 attn_weights 加入到输出元组中
        if output_attentions:
            outputs += (attn_weights,)

        # 返回输出元组
        return outputs

    # 定义 build 方法，用于构建层次结构
    def build(self, input_shape=None):
        # 如果已经构建过，则直接返回
        if self.built:
            return
        # 标记当前对象为已构建状态
        self.built = True
        
        # 如果 self.attention 存在，则构建 self.attention 层
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        
        # 如果 self.layer_norm 存在，则构建 self.layer_norm 层
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])
        
        # 如果 self.feed_forward 存在，则构建 self.feed_forward 层
        if getattr(self, "feed_forward", None) is not None:
            with tf.name_scope(self.feed_forward.name):
                self.feed_forward.build(None)
        
        # 如果 self.final_layer_norm 存在，则构建 self.final_layer_norm 层
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.config.hidden_size])
# 从 transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayerStableLayerNorm 复制过来，将 Wav2Vec2 替换为 Hubert
class TFHubertEncoderLayerStableLayerNorm(keras.layers.Layer):
    def __init__(self, config: HubertConfig, **kwargs):
        super().__init__(**kwargs)
        # 初始化自注意力层，使用 HubertConfig 中定义的参数
        self.attention = TFHubertAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
            name="attention",
        )
        # 随机失活层，使用隐藏层失活率来初始化
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        # 层归一化，使用 HubertConfig 中定义的 epsilon 来初始化
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        # 前馈网络，使用 HubertConfig 初始化
        self.feed_forward = TFHubertFeedForward(config, name="feed_forward")
        # 最终的层归一化，使用 HubertConfig 中定义的 epsilon 来初始化
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")
        self.config = config

    # 定义前向传播函数，接受隐藏状态、注意力掩码等输入，并返回一个元组
    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        # 复制注意力层之前的隐藏状态，用于残差连接
        attn_residual = hidden_states
        # 应用层归一化到隐藏状态
        hidden_states = self.layer_norm(hidden_states)
        # 调用自注意力层，得到更新的隐藏状态和注意力权重
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, training=training
        )
        # 应用随机失活到更新的隐藏状态
        hidden_states = self.dropout(hidden_states, training=training)
        # 残差连接：原始隐藏状态 + 更新的隐藏状态
        hidden_states = attn_residual + hidden_states
        # 应用前馈网络和最终的层归一化到更新的隐藏状态
        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        # 构建输出元组，包含更新的隐藏状态
        outputs = (hidden_states,)

        # 如果需要输出注意力权重，将注意力权重加入输出元组
        if output_attentions:
            outputs += (attn_weights,)

        return outputs

    # 构建层，确保所有子层都被构建
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # 如果注意力层存在，则构建注意力层
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        # 如果层归一化存在，则根据输入形状构建层归一化
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.hidden_size])
        # 如果前馈网络存在，则构建前馈网络
        if getattr(self, "feed_forward", None) is not None:
            with tf.name_scope(self.feed_forward.name):
                self.feed_forward.build(None)
        # 如果最终的层归一化存在，则根据输入形状构建最终的层归一化
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.config.hidden_size])


# 从 transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2Encoder 复制过来，将 Wav2Vec2 替换为 Hubert
class TFHubertEncoder(keras.layers.Layer):
    # 初始化方法，用于创建一个 Hubert 模型实例
    def __init__(self, config: HubertConfig, **kwargs):
        # 调用父类的初始化方法
        super().__init__(**kwargs)
        # 保存传入的配置对象
        self.config = config
        # 创建位置卷积嵌入层，命名为 pos_conv_embed
        self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed")
        # 创建 LayerNormalization 层，使用给定的 epsilon 值
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        # 创建 Dropout 层，使用给定的 dropout 率
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        # 创建多个 HubertEncoderLayer 层，根据配置中的层数进行命名
        self.layer = [TFHubertEncoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)]

    # 模型调用方法，实现了 Hubert 模型的前向传播
    def call(
        self,
        hidden_states: tf.Tensor,  # 输入的隐藏状态张量
        attention_mask: tf.Tensor | None = None,  # 注意力遮罩张量，默认为 None
        output_attentions: Optional[bool] = False,  # 是否输出注意力权重，默认为 False
        output_hidden_states: Optional[bool] = False,  # 是否输出隐藏状态，默认为 False
        return_dict: Optional[bool] = True,  # 是否以字典形式返回输出，默认为 True
        training: Optional[bool] = False,  # 是否处于训练模式，默认为 False
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        # 如果要输出隐藏状态，则初始化 all_hidden_states 为空元组
        all_hidden_states = () if output_hidden_states else None
        # 如果要输出注意力权重，则初始化 all_self_attentions 为空元组
        all_self_attentions = () if output_attentions else None

        # 如果传入了 attention_mask，则将隐藏状态张量与 attention_mask 进行逐元素乘法
        if attention_mask is not None:
            hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
            # 对 attention_mask 进行扩展，用于后续处理
            attention_mask = _expand_mask(attention_mask)
        else:
            # 否则 attention_mask 为空
            attention_mask = None

        # 使用位置卷积嵌入层处理隐藏状态张量，加上位置嵌入
        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        # 对加和后的隐藏状态进行 LayerNormalization 处理
        hidden_states = self.layer_norm(hidden_states)
        # 对 LayerNormalization 后的隐藏状态应用 Dropout 处理
        hidden_states = self.dropout(hidden_states, training=training)

        # 遍历每一个 HubertEncoderLayer 层
        for i, layer_module in enumerate(self.layer):
            # 如果需要输出隐藏状态，则将当前的隐藏状态添加到 all_hidden_states 中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 添加 LayerDrop（参见 https://arxiv.org/abs/1909.11556 ）
            dropout_probability = np.random.uniform(0, 1)
            # 如果处于训练状态并且随机数小于配置中的 layerdrop 率，则跳过当前层
            if training and (dropout_probability < self.config.layerdrop):
                continue

            # 调用当前层的 forward 方法，得到输出
            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                training=training,
            )
            # 更新隐藏状态为当前层的输出
            hidden_states = layer_outputs[0]

            # 如果需要输出注意力权重，则将当前层的注意力权重输出添加到 all_self_attentions 中
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # 添加最后一层的隐藏状态到 all_hidden_states 中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不需要以字典形式返回结果，则按顺序返回隐藏状态、隐藏状态序列、注意力权重序列中的非空元素
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        # 否则，返回 TFBaseModelOutput 对象，包括最后的隐藏状态、隐藏状态序列和注意力权重序列
        return TFBaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
    # 构建模型的方法，用于定义模型的输入形状和层次结构
    def build(self, input_shape=None):
        # 如果模型已经构建，则直接返回，避免重复构建
        if self.built:
            return
        # 将模型标记为已构建状态
        self.built = True
        
        # 如果存在位置卷积嵌入层，构建该层
        if getattr(self, "pos_conv_embed", None) is not None:
            # 使用位置卷积嵌入层的名称作为命名空间
            with tf.name_scope(self.pos_conv_embed.name):
                # 调用位置卷积嵌入层的构建方法，传入None作为输入形状
                self.pos_conv_embed.build(None)
        
        # 如果存在层归一化层，构建该层
        if getattr(self, "layer_norm", None) is not None:
            # 使用层归一化层的名称作为命名空间
            with tf.name_scope(self.layer_norm.name):
                # 调用层归一化层的构建方法，传入形状为 [None, None, self.config.hidden_size]
                self.layer_norm.build([None, None, self.config.hidden_size])
        
        # 如果存在多个层，依次构建每一层
        if getattr(self, "layer", None) is not None:
            for layer in self.layer:
                # 使用当前层的名称作为命名空间
                with tf.name_scope(layer.name):
                    # 调用当前层的构建方法，传入None作为输入形状
                    layer.build(None)
# 从 transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderStableLayerNorm 复制代码，并将 Wav2Vec2 改为 Hubert
class TFHubertEncoderStableLayerNorm(keras.layers.Layer):
    # 初始化函数，接收 HubertConfig 类型的 config 参数，并调用父类的初始化方法
    def __init__(self, config: HubertConfig, **kwargs):
        super().__init__(**kwargs)
        # 将传入的 config 参数保存为对象的属性
        self.config = config
        # 创建 TFHubertPositionalConvEmbedding 对象，命名为 pos_conv_embed
        self.pos_conv_embed = TFHubertPositionalConvEmbedding(config, name="pos_conv_embed")
        # 创建 LayerNormalization 层，epsilon 参数使用 config 中的 layer_norm_eps，命名为 layer_norm
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        # 创建 Dropout 层，dropout 率使用 config 中的 hidden_dropout
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        # 创建 TFHubertEncoderLayerStableLayerNorm 层列表，命名为 layers，根据 config.num_hidden_layers 数量生成多个层对象
        self.layer = [
            TFHubertEncoderLayerStableLayerNorm(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)
        ]

    # 定义 call 方法，接收多个参数，返回 TFBaseModelOutput 或 Tuple[tf.Tensor] 类型
    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        # 初始化 all_hidden_states 和 all_self_attentions 变量，根据输出标志确定是否初始化空元组
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # 如果 attention_mask 不为 None，则将 hidden_states 加上 attention_mask 的扩展维度乘积
        if attention_mask is not None:
            hidden_states = hidden_states * tf.expand_dims(attention_mask, -1)
            # 调用 _expand_mask 函数扩展 attention_mask
            attention_mask = _expand_mask(attention_mask)
        else:
            attention_mask = None

        # 计算位置编码并将其加到 hidden_states 上
        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        # 使用 dropout 对 hidden_states 进行处理，根据 training 参数确定是否启用训练模式
        hidden_states = self.dropout(hidden_states, training=training)

        # 遍历 self.layer 中的每个层对象
        for i, layer_module in enumerate(self.layer):
            # 如果输出隐藏状态，则将当前 hidden_states 加入 all_hidden_states
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 添加 LayerDrop 功能，根据论文中描述的概率决定是否跳过当前层
            dropout_probability = np.random.uniform(0, 1)
            if training and (dropout_probability < self.config.layerdrop):  # 如果处于训练状态且概率小于 layerdrop 参数，则跳过该层
                continue

            # 调用当前层对象的 call 方法，处理 hidden_states 和 attention_mask 等参数
            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                training=training,
            )
            # 更新 hidden_states 为当前层的输出的第一个元素
            hidden_states = layer_outputs[0]

            # 如果输出注意力权重，则将当前层的注意力权重加入 all_self_attentions
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # 对最终的 hidden_states 应用 layer_norm
        hidden_states = self.layer_norm(hidden_states)

        # 如果输出隐藏状态，则将最终的 hidden_states 加入 all_hidden_states
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果 return_dict 参数为 False，则返回非空值的元组
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        # 否则，返回 TFBaseModelOutput 对象，包含最终的隐藏状态、所有隐藏状态和注意力权重
        return TFBaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
    # 如果已经构建过模型，则直接返回，避免重复构建
    if self.built:
        return

    # 将模型标记为已构建状态
    self.built = True

    # 如果存在位置编码的卷积嵌入层，则构建该层
    if getattr(self, "pos_conv_embed", None) is not None:
        with tf.name_scope(self.pos_conv_embed.name):
            self.pos_conv_embed.build(None)

    # 如果存在 Layer Normalization 层，则构建该层
    if getattr(self, "layer_norm", None) is not None:
        with tf.name_scope(self.layer_norm.name):
            # 构建 Layer Normalization 层，指定输入形状为 [None, None, self.config.hidden_size]
            self.layer_norm.build([None, None, self.config.hidden_size])

    # 如果存在多个层，则逐个构建这些层
    if getattr(self, "layer", None) is not None:
        for layer in self.layer:
            with tf.name_scope(layer.name):
                # 构建当前层，输入形状为 None，表示不限定输入维度
                layer.build(None)
@keras_serializable
class TFHubertMainLayer(keras.layers.Layer):
    # 设置配置类
    config_class = HubertConfig

    def __init__(self, config: HubertConfig, **kwargs):
        # 调用父类初始化方法
        super().__init__(**kwargs)
        # 设置配置属性
        self.config = config
        # 创建特征提取器对象
        self.feature_extractor = TFHubertFeatureEncoder(config, name="feature_extractor")
        # 创建特征投影对象
        self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection")

        # 根据配置选择稳定层归一化编码器或一般编码器
        if config.do_stable_layer_norm:
            self.encoder = TFHubertEncoderStableLayerNorm(config, name="encoder")
        else:
            self.encoder = TFHubertEncoder(config, name="encoder")

    def build(self, input_shape=None):
        # 添加权重，用于掩码特定嵌入
        self.masked_spec_embed = self.add_weight(
            shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
        )

        # 如果已经建立过，直接返回
        if self.built:
            return
        self.built = True

        # 如果存在特征提取器，构建其结构
        if getattr(self, "feature_extractor", None) is not None:
            with tf.name_scope(self.feature_extractor.name):
                self.feature_extractor.build(None)
        
        # 如果存在特征投影器，构建其结构
        if getattr(self, "feature_projection", None) is not None:
            with tf.name_scope(self.feature_projection.name):
                self.feature_projection.build(None)
        
        # 如果存在编码器，构建其结构
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)

    def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
        """
        计算卷积层的输出长度
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 从 https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html 获取的一维卷积层输出长度公式
            return (input_length - kernel_size) // stride + 1

        # 遍历配置中的卷积核大小和步幅，计算每一层的输出长度
        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths
    def _mask_hidden_states(self, hidden_states: tf.Tensor, mask_time_indices: tf.Tensor | None = None):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """
        # 获取 hidden_states 的形状信息：batch_size, sequence_length, hidden_size
        batch_size, sequence_length, hidden_size = shape_list(hidden_states)

        # 检查是否禁用了 SpecAugment 的应用
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        if mask_time_indices is not None:
            # 根据给定的 mask_time_indices 在时间轴上应用 SpecAugment
            hidden_states = tf.where(
                tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
                self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
                hidden_states,
            )

        elif self.config.mask_time_prob > 0:
            # 生成 mask_time_indices 并在时间轴上应用 SpecAugment
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                min_masks=2,
            )
            hidden_states = tf.where(
                tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool),
                self.masked_spec_embed[tf.newaxis, tf.newaxis, :],
                hidden_states,
            )

        # 在特征轴上应用 SpecAugment
        if self.config.mask_feature_prob > 0:
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
            )
            hidden_states = tf.where(mask_feature_indices[:, tf.newaxis, :], hidden_states, 0)

        # 返回经过 SpecAugment 处理后的 hidden_states
        return hidden_states

    @unpack_inputs
    def call(
        self,
        input_values: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: tf.Tensor | None = None,
        output_hidden_states: tf.Tensor | None = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
        **kwargs: Any,
        ):
            # 使用特征提取器提取特征，将输入转换为浮点数类型并进行训练
            hidden_states = self.feature_extractor(tf.cast(input_values, tf.float32), training=training)

            if attention_mask is not None:
                # 根据卷积公式计算真实的输出长度
                output_lengths = self._get_feat_extract_output_lengths(tf.reduce_sum(attention_mask, -1))

                # 根据计算得到的长度创建序列掩码，最大长度为隐藏状态的长度，数据类型与隐藏状态一致
                attention_mask = tf.sequence_mask(
                    output_lengths, maxlen=shape_list(hidden_states)[1], dtype=hidden_states.dtype
                )

            # 使用特征投影器进行特征投影，同时根据是否训练状态进行操作
            hidden_states = self.feature_projection(hidden_states, training=training)

            # 获取参数中的时间索引掩码，如果处于训练状态
            mask_time_indices = kwargs.get("mask_time_indices", None)
            if training:
                # 根据时间索引掩码对隐藏状态进行掩码处理
                hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)

            # 将隐藏状态传入编码器进行编码，同时传递相关参数和是否返回字典
            encoder_outputs = self.encoder(
                hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                training=training,
            )
            # 从编码器输出中获取最后的隐藏状态
            hidden_states = encoder_outputs[0]

            if not return_dict:
                # 如果不返回字典，则返回元组形式的隐藏状态和其他编码器输出
                return (hidden_states,) + encoder_outputs[1:]

            # 如果返回字典，则创建 TFBaseModelOutput 对象，并包含相应的属性
            return TFBaseModelOutput(
                last_hidden_state=hidden_states,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )
class TFHubertPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 指定配置类为 HubertConfig
    config_class = HubertConfig
    # 基础模型前缀为 "hubert"
    base_model_prefix = "hubert"
    # 主输入名称为 "input_values"
    main_input_name = "input_values"

    @property
    def input_signature(self):
        # 定义输入签名，指定输入参数的形状和数据类型
        return {
            "input_values": tf.TensorSpec((None, 16000), tf.float32, name="input_values"),
            "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
            "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
        }

    def __init__(self, config, *inputs, **kwargs):
        # 初始化方法，调用父类的初始化函数
        super().__init__(config, *inputs, **kwargs)
        # 发出警告，说明在 CPU 上不支持后向传播操作
        logger.warning(
            f"\n{self.__class__.__name__} has backpropagation operations that are NOT supported on CPU. If you wish "
            "to train/fine-tune this model, you need a GPU or a TPU"
        )



HUBERT_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_values` only and nothing else: `model(input_values)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_values, attention_mask])` or `model([input_values, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_values": input_values, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

HUBERT_INPUTS_DOCSTRING = r"""
"""


@add_start_docstrings(
    "The bare TFHubert Model transformer outputing raw hidden-states without any specific head on top.",
    HUBERT_START_DOCSTRING,
)
class TFHubertModel(TFHubertPreTrainedModel):
    def __init__(self, config: HubertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.config = config
        # 初始化 TFHubertMainLayer 对象，用于处理 Hubert 模型的主要逻辑
        self.hubert = TFHubertMainLayer(config, name="hubert")

    @add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
    @unpack_inputs
    def call(
        self,
        input_values: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """
        根据给定的输入执行模型的前向传播，返回模型输出。

        Args:
            input_values (tf.Tensor): 输入张量，代表输入特征。
            attention_mask (tf.Tensor, optional): 注意力掩码张量，用于控制注意力分配。默认为 None。
            token_type_ids (tf.Tensor, optional): 标记类型 ID 张量，用于多序列输入。默认为 None。
            position_ids (tf.Tensor, optional): 位置 ID 张量，用于指示输入中每个位置的位置信息。默认为 None。
            head_mask (tf.Tensor, optional): 头部掩码张量，用于控制多头注意力中每个头的重要性。默认为 None。
            inputs_embeds (tf.Tensor, optional): 嵌入输入张量，用于直接提供输入的嵌入表示。默认为 None。
            output_attentions (bool, optional): 是否输出注意力权重。默认为 None。
            output_hidden_states (bool, optional): 是否输出隐藏状态。默认为 None。
            return_dict (bool, optional): 是否以字典形式返回结果。默认为 None。
            training (bool, optional): 是否处于训练模式。默认为 False。

        Returns:
            Union[TFBaseModelOutput, Tuple[tf.Tensor]]: 模型的输出结果，包含隐藏状态和/或注意力权重，具体取决于参数设置。

        Example:

        ```
        >>> from transformers import AutoProcessor, TFHubertModel
        >>> from datasets import load_dataset
        >>> import soundfile as sf

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")

        >>> def map_to_array(batch):
        ...     speech, _ = sf.read(batch["file"])
        ...     batch["speech"] = speech
        ...     return batch

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```
        """

        # 设置输出的隐藏状态、注意力权重和返回字典形式的结果
        output_hidden_states = output_hidden_states if output_hidden_states else self.config.output_hidden_states
        output_attentions = output_attentions if output_attentions else self.config.output_attentions
        return_dict = return_dict if return_dict else self.config.return_dict

        # 调用 TFHubertMainLayer 对象进行前向传播
        outputs = self.hubert(
            input_values=input_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs
    # 构建模型的方法，在此方法中进行模型的初始化和构建
    def build(self, input_shape=None):
        # 如果模型已经构建过，直接返回，避免重复构建
        if self.built:
            return
        # 将模型标记为已构建状态
        self.built = True
        
        # 检查是否存在名为"hubert"的属性，并且该属性不为None
        if getattr(self, "hubert", None) is not None:
            # 使用"hubert"属性的名称作为命名空间
            with tf.name_scope(self.hubert.name):
                # 调用"hubert"对象的build方法，传入None作为输入形状
                self.hubert.build(None)
@add_start_docstrings(
    """TFHubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
    HUBERT_START_DOCSTRING,
)
class TFHubertForCTC(TFHubertPreTrainedModel):
    def __init__(self, config: HubertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # 初始化 TFHubert 主层，使用给定的配置和名称
        self.hubert = TFHubertMainLayer(config, name="hubert")
        # 添加 dropout 层，使用给定的最终 dropout 率
        self.dropout = keras.layers.Dropout(config.final_dropout)
        # 添加全连接层 lm_head，输出大小为词汇表大小
        self.lm_head = keras.layers.Dense(config.vocab_size, name="lm_head")
        # 确定输出隐藏大小，如果配置中存在 `add_adapter` 并且为真，则使用 `output_hidden_size`，否则使用 `hidden_size`
        self.output_hidden_size = (
            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
        )

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        """
        # 发出警告，告知方法即将被弃用，建议使用 `freeze_feature_encoder` 方法
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        # 调用 `freeze_feature_encoder` 方法来冻结特征编码器的梯度计算
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        # 将特征提取器的可训练属性设置为 False，禁止在训练过程中更新其参数
        self.hubert.feature_extractor.trainable = False

    @add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFCausalLMOutput, config_class=_CONFIG_FOR_DOC)
    @unpack_inputs
    def call(
        self,
        input_values: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ):
        """
        Call method to process inputs and return outputs, adhering to Hubert model's forward function.
        """
        # 省略了具体的前向传播逻辑，由装饰器 `add_start_docstrings_to_model_forward` 和 `replace_return_docstrings` 指定
        pass

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "hubert", None) is not None:
            with tf.name_scope(self.hubert.name):
                # 构建 `hubert` 层，输入形状为 None
                self.hubert.build(None)
        if getattr(self, "lm_head", None) is not None:
            with tf.name_scope(self.lm_head.name):
                # 构建 `lm_head` 层，输入形状为 [None, None, self.output_hidden_size]
                self.lm_head.build([None, None, self.output_hidden_size])

`.\models\hubert\init.py`

# 版权声明和许可证信息
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 导入类型检查模块中的 TYPE_CHECKING 类型
from typing import TYPE_CHECKING

# 导入依赖检查函数和 LazyModule 类
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available

# 定义模块的导入结构字典
_import_structure = {"configuration_hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"]}

# 检查是否有 torch 可用，如果不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 torch 可用，则添加对应的 modeling_hubert 模块导入结构
    _import_structure["modeling_hubert"] = [
        "HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "HubertForCTC",
        "HubertForSequenceClassification",
        "HubertModel",
        "HubertPreTrainedModel",
    ]

# 检查是否有 tensorflow 可用，如果不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 tensorflow 可用，则添加对应的 modeling_tf_hubert 模块导入结构
    _import_structure["modeling_tf_hubert"] = [
        "TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFHubertForCTC",
        "TFHubertModel",
        "TFHubertPreTrainedModel",
    ]

# 如果在类型检查模式下
if TYPE_CHECKING:
    # 导入配置和模型类
    from .configuration_hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig

    # 如果 torch 可用，则导入 torch 版的模型类
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_hubert import (
            HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            HubertForCTC,
            HubertForSequenceClassification,
            HubertModel,
            HubertPreTrainedModel,
        )

    # 如果 tensorflow 可用，则导入 tensorflow 版的模型类
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_hubert import (
            TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFHubertForCTC,
            TFHubertModel,
            TFHubertPreTrainedModel,
        )

# 如果不在类型检查模式下
else:
    import sys

    # 将当前模块注册为 LazyModule，延迟导入实现
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\ibert\configuration_ibert.py`

# coding=utf-8
# 声明编码格式为UTF-8
# Copyright 2021 The I-BERT Authors (Sehoon Kim, Amir Gholami, Zhewei Yao,
# Michael Mahoney, Kurt Keutzer - UC Berkeley) and The HuggingFace Inc. team.
# 版权声明，包括作者信息和版权信息
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
# 版权声明，包括年份和版权所有者信息
#
# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 进行许可
# you may not use this file except in compliance with the License.
# 除非遵循 Apache License, Version 2.0，否则不得使用此文件
# You may obtain a copy of the License at
# 可以在以下链接获取许可证副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 没有任何明示或暗示的担保或条件，软件在分发时是基于“按原样”分发的
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# 在许可下限制的特定语言和限制
""" I-BERT configuration"""
# 模块级文档字符串，描述本文件是关于 I-BERT 的配置信息

from collections import OrderedDict
# 导入 OrderedDict 类，用于有序字典
from typing import Mapping
# 导入 Mapping 类型提示

from ...configuration_utils import PretrainedConfig
# 从配置工具中导入预训练配置类 PretrainedConfig
from ...onnx import OnnxConfig
# 从 onnx 模块中导入 OnnxConfig
from ...utils import logging
# 从 utils 模块中导入 logging 模块

logger = logging.get_logger(__name__)
# 获取当前模块的 logger 对象

IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "kssteven/ibert-roberta-base": "https://huggingface.co/kssteven/ibert-roberta-base/resolve/main/config.json",
    "kssteven/ibert-roberta-large": "https://huggingface.co/kssteven/ibert-roberta-large/resolve/main/config.json",
    "kssteven/ibert-roberta-large-mnli": (
        "https://huggingface.co/kssteven/ibert-roberta-large-mnli/resolve/main/config.json"
    ),
}
# 定义一个字典，映射预训练模型名称到其配置文件的 URL

class IBertConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`IBertModel`]. It is used to instantiate a I-BERT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the IBERT
    [kssteven/ibert-roberta-base](https://huggingface.co/kssteven/ibert-roberta-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    """
    # IBertConfig 类，用于存储 IBERT 模型的配置信息

    model_type = "ibert"
    # 模型类型为 ibert

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        position_embedding_type="absolute",
        quant_mode=False,
        force_dequant="none",
        **kwargs,
    ):
        """
        Initializes an IBertConfig object with default values for its parameters.
        构造函数，初始化 IBertConfig 对象，设置各个参数的默认值。
        """
        ):
            # 调用父类的构造函数，设置模型的特定参数和超参数
            super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

            # 设置模型的词汇表大小
            self.vocab_size = vocab_size
            # 设置模型的隐藏层大小
            self.hidden_size = hidden_size
            # 设置模型的隐藏层数量
            self.num_hidden_layers = num_hidden_layers
            # 设置模型的注意力头数量
            self.num_attention_heads = num_attention_heads
            # 设置模型的隐藏层激活函数
            self.hidden_act = hidden_act
            # 设置模型的中间层大小（全连接层）
            self.intermediate_size = intermediate_size
            # 设置模型的隐藏层dropout概率
            self.hidden_dropout_prob = hidden_dropout_prob
            # 设置模型的注意力层dropout概率
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            # 设置模型的最大位置嵌入长度
            self.max_position_embeddings = max_position_embeddings
            # 设置模型的类型词汇表大小
            self.type_vocab_size = type_vocab_size
            # 设置模型的初始化范围
            self.initializer_range = initializer_range
            # 设置模型的层归一化epsilon值
            self.layer_norm_eps = layer_norm_eps
            # 设置模型的位置嵌入类型
            self.position_embedding_type = position_embedding_type
            # 设置模型的量化模式
            self.quant_mode = quant_mode
            # 设置模型的强制去量化标志
            self.force_dequant = force_dequant
# 定义一个名为 IBertOnnxConfig 的类，它继承自 OnnxConfig 类
class IBertOnnxConfig(OnnxConfig):
    
    # 定义一个 inputs 属性，返回一个字典，键为字符串，值为映射（字典，键为整数，值为字符串）
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        
        # 如果任务是多项选择 ("multiple-choice")，则设置动态轴为三维：批量（batch）、选择（choice）、序列（sequence）
        if self.task == "multiple-choice":
            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
        else:
            # 否则，设置动态轴为二维：批量（batch）、序列（sequence）
            dynamic_axis = {0: "batch", 1: "sequence"}
        
        # 返回一个有序字典，包含两个条目：("input_ids", dynamic_axis) 和 ("attention_mask", dynamic_axis)
        return OrderedDict(
            [
                ("input_ids", dynamic_axis),
                ("attention_mask", dynamic_axis),
            ]
        )

`.\models\ibert\modeling_ibert.py`

# coding=utf-8
# 版权声明，版权归作者及 HuggingFace 公司所有，保留一切权利
# Copyright 2021 The I-BERT Authors (Sehoon Kim, Amir Gholami, Zhewei Yao,
# Michael Mahoney, Kurt Keutzer - UC Berkeley) and The HuggingFace Inc. team.
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""PyTorch I-BERT model."""

import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import gelu
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging

# 导入 logging 模块
logger = logging.get_logger(__name__)

# 以下是用于文档的定义
_CHECKPOINT_FOR_DOC = "kssteven/ibert-roberta-base"
_CONFIG_FOR_DOC = "IBertConfig"

# 预训练模型的存档列表
IBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "kssteven/ibert-roberta-base",
    "kssteven/ibert-roberta-large",
    "kssteven/ibert-roberta-large-mnli",
]


class IBertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    
    IBertEmbeddings 类，与 BertEmbeddings 相同，但稍作调整以支持位置嵌入索引。
    """
    # 初始化函数，接受一个配置参数对象作为输入
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        
        # 设置量化模式，从配置对象中获取
        self.quant_mode = config.quant_mode
        
        # 设置嵌入比特位数
        self.embedding_bit = 8
        self.embedding_act_bit = 16
        self.act_bit = 8
        self.ln_input_bit = 22
        self.ln_output_bit = 32
    
        # 创建词嵌入对象，使用QuantEmbedding进行量化
        self.word_embeddings = QuantEmbedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=config.pad_token_id,
            weight_bit=self.embedding_bit,
            quant_mode=self.quant_mode,
        )
        
        # 创建token类型嵌入对象，使用QuantEmbedding进行量化
        self.token_type_embeddings = QuantEmbedding(
            config.type_vocab_size, config.hidden_size, weight_bit=self.embedding_bit, quant_mode=self.quant_mode
        )
    
        # 注册位置ID张量为缓冲区，使用torch.arange生成连续的位置ID
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        
        # 设置位置嵌入的类型，默认为绝对位置嵌入
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
    
        # 设置填充索引，并创建位置嵌入对象，使用QuantEmbedding进行量化
        self.padding_idx = config.pad_token_id
        self.position_embeddings = QuantEmbedding(
            config.max_position_embeddings,
            config.hidden_size,
            padding_idx=self.padding_idx,
            weight_bit=self.embedding_bit,
            quant_mode=self.quant_mode,
        )
    
        # 创建嵌入激活函数对象，使用QuantAct进行量化
        self.embeddings_act1 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)
        self.embeddings_act2 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)
    
        # 创建层归一化对象，使用IntLayerNorm进行量化，保持与TensorFlow模型变量名一致
        self.LayerNorm = IntLayerNorm(
            config.hidden_size,
            eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit,
            quant_mode=self.quant_mode,
            force_dequant=config.force_dequant,
        )
        
        # 创建输出激活函数对象，使用QuantAct进行量化
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        
        # 创建Dropout对象，使用配置中的隐藏层dropout概率
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        ):
            # 如果没有给定位置编码，根据输入的 token ids 创建位置编码，保留任何填充的 token 的填充状态
            if position_ids is None:
                if input_ids is not None:
                    # 从输入的 token ids 创建位置编码
                    position_ids = create_position_ids_from_input_ids(
                        input_ids, self.padding_idx, past_key_values_length
                    ).to(input_ids.device)
                else:
                    # 根据输入的嵌入向量创建位置编码
                    position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

            # 如果给定了 input_ids，则获取其形状；否则获取 inputs_embeds 的形状去掉最后一个维度
            if input_ids is not None:
                input_shape = input_ids.size()
            else:
                input_shape = inputs_embeds.size()[:-1]

            # 如果没有给定 token_type_ids，则创建一个全零张量作为 token_type_ids
            if token_type_ids is None:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

            # 如果没有给定 inputs_embeds，则通过 word_embeddings 获取输入的嵌入向量及其缩放因子
            if inputs_embeds is None:
                inputs_embeds, inputs_embeds_scaling_factor = self.word_embeddings(input_ids)
            else:
                # 否则设定 inputs_embeds_scaling_factor 为 None
                inputs_embeds_scaling_factor = None

            # 根据 token_type_ids 获取 token 类型的嵌入向量及其缩放因子
            token_type_embeddings, token_type_embeddings_scaling_factor = self.token_type_embeddings(token_type_ids)

            # 将 inputs_embeds 和 token_type_embeddings 组合并通过 embeddings_act1 处理得到嵌入向量及其缩放因子
            embeddings, embeddings_scaling_factor = self.embeddings_act1(
                inputs_embeds,
                inputs_embeds_scaling_factor,
                identity=token_type_embeddings,
                identity_scaling_factor=token_type_embeddings_scaling_factor,
            )

            # 如果 position_embedding_type 是 "absolute"，则根据 position_ids 获取位置嵌入向量及其缩放因子
            if self.position_embedding_type == "absolute":
                position_embeddings, position_embeddings_scaling_factor = self.position_embeddings(position_ids)
                # 将 embeddings 和 position_embeddings 组合并通过 embeddings_act1 处理得到最终的嵌入向量及其缩放因子
                embeddings, embeddings_scaling_factor = self.embeddings_act1(
                    embeddings,
                    embeddings_scaling_factor,
                    identity=position_embeddings,
                    identity_scaling_factor=position_embeddings_scaling_factor,
                )

            # 对最终的嵌入向量进行 LayerNorm 处理，并返回处理后的嵌入向量及其缩放因子
            embeddings, embeddings_scaling_factor = self.LayerNorm(embeddings, embeddings_scaling_factor)
            embeddings = self.dropout(embeddings)
            # 对嵌入向量应用 output_activation，并返回处理后的嵌入向量及其缩放因子
            embeddings, embeddings_scaling_factor = self.output_activation(embeddings, embeddings_scaling_factor)
            return embeddings, embeddings_scaling_factor

        def create_position_ids_from_inputs_embeds(self, inputs_embeds):
            """
            We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

            Args:
                inputs_embeds: torch.Tensor

            Returns: torch.Tensor
            """
            # 获取输入嵌入向量的形状，并计算序列长度
            input_shape = inputs_embeds.size()[:-1]
            sequence_length = input_shape[1]

            # 根据序列长度生成从 padding_idx + 1 开始的连续位置编码
            position_ids = torch.arange(
                self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
            )
            return position_ids.unsqueeze(0).expand(input_shape)
# 定义 IBertSelfAttention 类，继承自 nn.Module，实现自注意力机制部分
class IBertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 检查 hidden_size 是否能被 num_attention_heads 整除，同时不应有 embedding_size 属性
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        
        # 初始化量化模式和量化位数设置
        self.quant_mode = config.quant_mode
        self.weight_bit = 8
        self.bias_bit = 32
        self.act_bit = 8

        # 设置注意力头数和每个注意力头的大小
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 初始化 Q、K、V 的线性层，进行量化
        self.query = QuantLinear(
            config.hidden_size,
            self.all_head_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.key = QuantLinear(
            config.hidden_size,
            self.all_head_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.value = QuantLinear(
            config.hidden_size,
            self.all_head_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )

        # 初始化 Q、K、V 的激活函数，进行量化
        self.query_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.key_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.value_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)

        # Dropout 层，用于注意力概率的 dropout
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        
        # 位置嵌入类型设置为绝对位置编码
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type != "absolute":
            raise ValueError("I-BERT only supports 'absolute' for `config.position_embedding_type`")

        # 定义 Softmax 层，用于计算注意力权重
        self.softmax = IntSoftmax(self.act_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant)

    # 将输入张量 x 转换为注意力分数张量的形状
    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    # 前向传播函数，实现自注意力机制的计算过程
    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    # 初始化函数，接收配置参数 config
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 设置量化模式
        self.quant_mode = config.quant_mode
        # 设置激活位数为 8
        self.act_bit = 8
        # 设置权重位数为 8
        self.weight_bit = 8
        # 设置偏置位数为 32
        self.bias_bit = 32
        # 设置输入层归一化的位数为 22
        self.ln_input_bit = 22
        # 设置输出层归一化的位数为 32
        self.ln_output_bit = 32

        # 创建一个量化线性层对象，用于神经网络的量化线性变换
        self.dense = QuantLinear(
            config.hidden_size,
            config.hidden_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        # 创建一个输入层激活函数的量化对象
        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
        # 创建一个整数型层归一化对象，用于神经网络的整数型层次归一化
        self.LayerNorm = IntLayerNorm(
            config.hidden_size,
            eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit,
            quant_mode=self.quant_mode,
            force_dequant=config.force_dequant,
        )
        # 创建一个输出激活函数的量化对象
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        # 创建一个 Dropout 层，用于随机置零输入张量的元素，防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数，接收隐藏状态、缩放因子、输入张量和其缩放因子作为输入
    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
        # 使用量化线性层进行隐藏状态的线性变换
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        # 对变换后的隐藏状态应用 Dropout 操作
        hidden_states = self.dropout(hidden_states)
        # 使用输入层激活函数的量化对象，对隐藏状态进行激活函数操作
        hidden_states, hidden_states_scaling_factor = self.ln_input_act(
            hidden_states,
            hidden_states_scaling_factor,
            identity=input_tensor,
            identity_scaling_factor=input_tensor_scaling_factor,
        )
        # 使用整数型层归一化对象，对处理后的隐藏状态进行归一化操作
        hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)

        # 使用输出激活函数的量化对象，对归一化后的隐藏状态进行激活函数操作
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        # 返回处理后的隐藏状态和相应的缩放因子
        return hidden_states, hidden_states_scaling_factor
# 定义 IBertAttention 类，继承自 nn.Module，实现自注意力机制
class IBertAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 从配置中获取量化模式
        self.quant_mode = config.quant_mode
        # 初始化 IBertSelfAttention 层和 IBertSelfOutput 层
        self.self = IBertSelfAttention(config)
        self.output = IBertSelfOutput(config)
        # 初始化头部剪枝集合
        self.pruned_heads = set()

    # 剪枝指定的注意力头
    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 调用辅助函数找到可剪枝的头部索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 对自注意力机制的查询、键、值进行剪枝
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        # 对输出层的稠密层进行剪枝
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并记录剪枝的头部
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # 前向传播函数
    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        # 调用自注意力层的前向传播
        self_outputs, self_outputs_scaling_factor = self.self(
            hidden_states,
            hidden_states_scaling_factor,
            attention_mask,
            head_mask,
            output_attentions,
        )
        # 调用自注意力输出层的前向传播
        attention_output, attention_output_scaling_factor = self.output(
            self_outputs[0], self_outputs_scaling_factor[0], hidden_states, hidden_states_scaling_factor
        )
        # 如果输出注意力矩阵，添加到输出中
        outputs = (attention_output,) + self_outputs[1:]
        outputs_scaling_factor = (attention_output_scaling_factor,) + self_outputs_scaling_factor[1:]
        return outputs, outputs_scaling_factor


# 定义 IBertIntermediate 类，继承自 nn.Module，实现中间层的量化操作
class IBertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 从配置中获取量化模式
        self.quant_mode = config.quant_mode
        # 设置激活位数和权重位数
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        # 创建量化线性层
        self.dense = QuantLinear(
            config.hidden_size,
            config.intermediate_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        # 检查隐藏激活函数是否为 "gelu"
        if config.hidden_act != "gelu":
            raise ValueError("I-BERT only supports 'gelu' for `config.hidden_act`")
        # 初始化中间激活函数为 IntGELU
        self.intermediate_act_fn = IntGELU(quant_mode=self.quant_mode, force_dequant=config.force_dequant)
        # 初始化输出激活函数为 QuantAct
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
    # 前向传播函数，接受隐藏状态和隐藏状态缩放因子作为输入参数
    def forward(self, hidden_states, hidden_states_scaling_factor):
        # 将隐藏状态和缩放因子传递给稠密层进行处理
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        # 将稠密层输出的隐藏状态和缩放因子传递给中间激活函数进行处理
        hidden_states, hidden_states_scaling_factor = self.intermediate_act_fn(
            hidden_states, hidden_states_scaling_factor
        )

        # 重新量化步骤：从32位转换为8位
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        # 返回处理后的隐藏状态和缩放因子
        return hidden_states, hidden_states_scaling_factor
class IBertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode  # 从配置中获取量化模式
        self.act_bit = 8  # 激活函数的位数设定为8位
        self.weight_bit = 8  # 权重的位数设定为8位
        self.bias_bit = 32  # 偏置的位数设定为32位
        self.ln_input_bit = 22  # LayerNorm输入的位数设定为22位
        self.ln_output_bit = 32  # LayerNorm输出的位数设定为32位

        # 创建量化线性层，指定输入大小、输出大小，并设定权重、偏置的位数，使用量化模式
        self.dense = QuantLinear(
            config.intermediate_size,
            config.hidden_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )

        # 创建量化激活函数，设定输入位数和量化模式
        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)

        # 创建整数化LayerNorm，指定输入大小、输出位数、量化模式和是否强制反量化
        self.LayerNorm = IntLayerNorm(
            config.hidden_size,
            eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit,
            quant_mode=self.quant_mode,
            force_dequant=config.force_dequant,
        )

        # 创建量化激活函数，设定激活位数和量化模式
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)

        # 创建Dropout层，设定丢弃率为配置中的隐藏层dropout概率
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
        # 应用量化线性层，处理隐藏状态和其缩放因子
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        
        # 应用Dropout层，处理隐藏状态
        hidden_states = self.dropout(hidden_states)
        
        # 应用量化激活函数，处理隐藏状态，同时传入输入张量和其缩放因子作为辅助信息
        hidden_states, hidden_states_scaling_factor = self.ln_input_act(
            hidden_states,
            hidden_states_scaling_factor,
            identity=input_tensor,
            identity_scaling_factor=input_tensor_scaling_factor,
        )
        
        # 应用整数化LayerNorm，处理隐藏状态和其缩放因子
        hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)

        # 应用输出激活函数，处理隐藏状态和其缩放因子
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        
        # 返回处理后的隐藏状态和其缩放因子
        return hidden_states, hidden_states_scaling_factor


class IBertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode  # 从配置中获取量化模式
        self.act_bit = 8  # 激活函数的位数设定为8位

        self.seq_len_dim = 1  # 序列长度维度设定为1

        # 创建IBertAttention层，使用给定的配置
        self.attention = IBertAttention(config)
        
        # 创建IBertIntermediate层，使用给定的配置
        self.intermediate = IBertIntermediate(config)
        
        # 创建IBertOutput层，使用给定的配置
        self.output = IBertOutput(config)

        # 创建量化激活函数，设定输入位数和量化模式
        self.pre_intermediate_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        
        # 创建量化激活函数，设定输入位数和量化模式
        self.pre_output_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        # 继续编写其他参数
    ):
        self_attention_outputs, self_attention_outputs_scaling_factor = self.attention(
            hidden_states,
            hidden_states_scaling_factor,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        # 获取自注意力机制的输出和相应的缩放因子
        attention_output = self_attention_outputs[0]
        attention_output_scaling_factor = self_attention_outputs_scaling_factor[0]

        outputs = self_attention_outputs[1:]  # 如果输出注意力权重，则添加自注意力权重

        # 将注意力输出作为输入，应用前馈网络
        layer_output, layer_output_scaling_factor = self.feed_forward_chunk(
            attention_output, attention_output_scaling_factor
        )
        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output, attention_output_scaling_factor):
        # 应用预激活函数到注意力输出和缩放因子
        attention_output, attention_output_scaling_factor = self.pre_intermediate_act(
            attention_output, attention_output_scaling_factor
        )
        # 将注意力输出传递给中间层前馈网络
        intermediate_output, intermediate_output_scaling_factor = self.intermediate(
            attention_output, attention_output_scaling_factor
        )

        # 应用预输出激活函数到中间层输出和缩放因子
        intermediate_output, intermediate_output_scaling_factor = self.pre_output_act(
            intermediate_output, intermediate_output_scaling_factor
        )
        # 应用输出层到中间层输出和相应的注意力输出及缩放因子
        layer_output, layer_output_scaling_factor = self.output(
            intermediate_output, intermediate_output_scaling_factor, attention_output, attention_output_scaling_factor
        )
        return layer_output, layer_output_scaling_factor
# 定义一个名为 IBertEncoder 的类，继承自 nn.Module 类，用于实现 BERT 编码器模型
class IBertEncoder(nn.Module):
    # 初始化方法，接收一个配置参数 config
    def __init__(self, config):
        super().__init__()  # 调用父类的初始化方法
        self.config = config  # 将传入的配置参数保存到对象的属性中
        self.quant_mode = config.quant_mode  # 从配置中获取量化模式设置
        # 创建一个由多个 IBertLayer 实例组成的模块列表，列表长度由配置中的 num_hidden_layers 决定
        self.layer = nn.ModuleList([IBertLayer(config) for _ in range(config.num_hidden_layers)])

    # 前向传播方法定义
    def forward(
        self,
        hidden_states,  # 输入的隐藏状态张量
        hidden_states_scaling_factor,  # 隐藏状态的缩放因子
        attention_mask=None,  # 注意力掩码，默认为 None
        head_mask=None,  # 头部掩码，默认为 None
        output_attentions=False,  # 是否输出注意力矩阵，默认为 False
        output_hidden_states=False,  # 是否输出所有隐藏状态，默认为 False
        return_dict=True,  # 是否以字典形式返回，默认为 True
    ):
        # 如果需要输出隐藏状态，则初始化一个空元组用于存储所有的隐藏状态张量
        all_hidden_states = () if output_hidden_states else None
        # 如果需要输出注意力矩阵，则初始化一个空元组用于存储所有的自注意力矩阵
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = None  # 不支持交叉注意力，置为 None
        next_decoder_cache = None  # 不支持缓存，置为 None

        # 遍历每一个 IBertLayer 模块进行处理
        for i, layer_module in enumerate(self.layer):
            # 如果需要输出隐藏状态，则将当前的隐藏状态张量添加到 all_hidden_states 元组中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 如果存在头部掩码，则获取当前层的头部掩码
            layer_head_mask = head_mask[i] if head_mask is not None else None

            # 调用当前层的前向传播方法，得到该层的输出
            layer_outputs = layer_module(
                hidden_states,
                hidden_states_scaling_factor,
                attention_mask,
                layer_head_mask,
                output_attentions,
            )

            # 更新隐藏状态张量为当前层的输出的第一个元素
            hidden_states = layer_outputs[0]

            # 如果需要输出注意力矩阵，则将当前层的自注意力矩阵添加到 all_self_attentions 元组中
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # 如果需要输出隐藏状态，则将最终的隐藏状态张量添加到 all_hidden_states 元组中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不需要以字典形式返回结果，则返回一个元组，包含所有非 None 的值
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )

        # 如果需要以字典形式返回结果，则创建一个 BaseModelOutputWithPastAndCrossAttentions 实例作为返回值
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


# 定义一个名为 IBertPooler 的类，继承自 nn.Module 类，用于实现 BERT 池化器模型
class IBertPooler(nn.Module):
    # 初始化方法，接收一个配置参数 config
    def __init__(self, config):
        super().__init__()  # 调用父类的初始化方法
        self.quant_mode = config.quant_mode  # 从配置中获取量化模式设置
        # 创建一个线性层，将输入特征大小映射到相同的输出特征大小
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()  # 定义 Tanh 激活函数

    # 前向传播方法定义
    def forward(self, hidden_states):
        # 只取第一个 token 对应的隐藏状态张量作为池化输出
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)  # 通过线性层映射
        pooled_output = self.activation(pooled_output)  # 应用 Tanh 激活函数
        return pooled_output


# 定义一个名为 IBertPreTrainedModel 的类，继承自 PreTrainedModel 类
class IBertPreTrainedModel(PreTrainedModel):
    """
    一个抽象类，用于处理权重初始化和简单的接口，用于下载和加载预训练模型。
    """
    # 定义配置类为 IBertConfig
    config_class = IBertConfig
    # 定义基础模型前缀为 "ibert"
    base_model_prefix = "ibert"

    def _init_weights(self, module):
        """初始化权重"""
        # 如果模块是 QuantLinear 或 nn.Linear 类型
        if isinstance(module, (QuantLinear, nn.Linear)):
            # 使用正态分布初始化权重数据，均值为 0.0，标准差为配置中的初始化范围
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果模块有偏置，则将偏置数据初始化为零
            if module.bias is not None:
                module.bias.data.zero_()
        # 如果模块是 QuantEmbedding 或 nn.Embedding 类型
        elif isinstance(module, (QuantEmbedding, nn.Embedding)):
            # 使用正态分布初始化权重数据，均值为 0.0，标准差为配置中的初始化范围
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果模块有填充索引，则将填充索引处的权重数据初始化为零
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        # 如果模块是 IntLayerNorm 或 nn.LayerNorm 类型
        elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
            # 将模块的偏置数据初始化为零
            module.bias.data.zero_()
            # 将模块的权重数据填充为 1.0
            module.weight.data.fill_(1.0)

    def resize_token_embeddings(self, new_num_tokens=None):
        # 抛出未实现错误，因为 I-BERT 不支持调整 token embeddings
        raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")
IBERT_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`IBertConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""



IBERT_INPUTS_DOCSTRING = r"""
    This string contains the docstring for explaining the inputs accepted by the IBERT model.

    This docstring should describe the expected inputs for the model, such as input tensors or data structures,
    their types, shapes, and any preprocessing requirements.

    It provides guidance on how to format and prepare data for the model's forward pass, ensuring compatibility
    with the model's architecture and requirements.

    This documentation helps users understand how to correctly interface with the model, ensuring inputs are
    correctly formatted to achieve expected results.
"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            # 输入序列标记在词汇表中的索引。
            # 可以使用 `AutoTokenizer` 获取这些索引。参见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__`。
            # [什么是输入 ID？](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            # 遮罩，避免对填充的标记索引执行注意力操作。
            # 遮罩值在 `[0, 1]` 范围内：
            # - 1 表示**未遮罩**的标记，
            # - 0 表示**已遮罩**的标记。
            # [什么是注意力遮罩？](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            # 分段标记索引，指示输入的第一和第二部分。
            # 索引选在 `[0, 1]` 范围内：
            # - 0 对应*句子 A* 的标记，
            # - 1 对应*句子 B* 的标记。
            # [什么是分段标记 ID？](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            # 每个输入序列标记在位置嵌入中的位置索引。
            # 索引选在 `[0, config.max_position_embeddings - 1]` 范围内。
            # [什么是位置 ID？](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            # 用于空置自注意力模块中选定头部的遮罩。
            # 遮罩值在 `[0, 1]` 范围内：
            # - 1 表示**未遮罩**的头部，
            # - 0 表示**已遮罩**的头部。
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            # 可选参数，可以直接传递嵌入表示，而不是传递 `input_ids`。
            # 如果要控制如何将 `input_ids` 索引转换为相关联的向量，这很有用。
            # 这比模型内部的嵌入查找矩阵更灵活。
        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。
            # 查看返回的张量中的 `attentions` 以获取更多详细信息。
        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。
            # 查看返回的张量中的 `hidden_states` 以获取更多详细信息。
        return_dict (`bool`, *optional*):
            # 是否返回 [`~utils.ModelOutput`] 而不是普通元组。
"""
@add_start_docstrings(
    "The bare I-BERT Model transformer outputting raw hidden-states without any specific head on top.",
    IBERT_START_DOCSTRING,
)
class IBertModel(IBertPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.quant_mode = config.quant_mode

        # Initialize the embeddings layer for the IBERT model
        self.embeddings = IBertEmbeddings(config)
        
        # Initialize the encoder layer for the IBERT model
        self.encoder = IBertEncoder(config)

        # Initialize the pooling layer if specified
        self.pooler = IBertPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # Return the word embeddings from the embeddings layer
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # Set new word embeddings to the embeddings layer
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # Iterate over layers and prune specific attention heads in each layer
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Forward pass through the IBERT model
        # Detailed arguments are passed to handle different configurations
        pass


@add_start_docstrings(
    "I-BERT Model with a `language modeling` head on top.",
    IBERT_START_DOCSTRING
)
class IBertForMaskedLM(IBertPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        # Initialize the IBERT model without a pooling layer
        self.ibert = IBertModel(config, add_pooling_layer=False)
        
        # Initialize the language modeling head for IBERT
        self.lm_head = IBertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        # Return the decoder weights from the language modeling head
        return self.lm_head.decoder
    def set_output_embeddings(self, new_embeddings):
        # 将语言模型头部的解码器层替换为新的嵌入层
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="<mask>",
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
            Used to hide legacy arguments that have been deprecated.
        """
        # 确定是否返回字典格式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用iBERT模型进行前向传播
        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 获取序列输出
        sequence_output = outputs[0]
        # 将序列输出传递给语言模型头部以获取预测分数
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        # 如果存在标签，则计算掩码语言建模损失
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        # 如果不需要返回字典格式的输出，则组装最终输出
        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        # 如果需要返回字典格式的输出，则创建MaskedLMOutput对象
        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
class IBertLMHead(nn.Module):
    """I-BERT Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        # 初始化一个全连接层，输入和输出维度都是 config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 初始化 LayerNorm 层，对隐藏层进行归一化，eps 是归一化过程中的小数值
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # 初始化一个全连接层，输入维度是 config.hidden_size，输出维度是 config.vocab_size
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
        # 初始化一个偏置参数，大小是 config.vocab_size
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        # 将偏置参数赋给 decoder 层的偏置
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        # 将输入 features 输入全连接层 dense
        x = self.dense(features)
        # 使用 GELU 激活函数处理全连接层输出
        x = gelu(x)
        # 对处理后的结果进行 LayerNorm 归一化
        x = self.layer_norm(x)

        # 使用全连接层 decoder 将结果映射回词汇表大小，加上偏置
        x = self.decoder(x)

        return x

    def _tie_weights(self):
        # 如果两个权重被分离（在TPU上或者当偏置被重新调整大小时），将偏置与 decoder 的偏置相连
        self.bias = self.decoder.bias


@add_start_docstrings(
    """
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    IBERT_START_DOCSTRING,
)
class IBertForSequenceClassification(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # 设置分类任务的类别数
        self.num_labels = config.num_labels

        # 初始化 IBertModel，不添加池化层
        self.ibert = IBertModel(config, add_pooling_layer=False)
        # 初始化 IBertClassificationHead
        self.classifier = IBertClassificationHead(config)

        # 初始化权重并进行最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 如果 return_dict 不为 None，则使用指定的 return_dict 值；否则使用 self.config.use_return_dict 的设定
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用预训练模型 `ibert` 进行处理，获取输出结果
        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 从模型输出中提取序列输出（一般是经过分类器之前的最后一层隐藏状态）
        sequence_output = outputs[0]
        # 将序列输出传入分类器，得到 logits（预测的分类/回归结果）
        logits = self.classifier(sequence_output)

        # 初始化损失为 None
        loss = None
        # 如果有提供标签 labels
        if labels is not None:
            # 如果问题类型未定义
            if self.config.problem_type is None:
                # 根据 num_labels 的情况设置问题类型
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            # 根据问题类型计算损失
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()  # 使用均方误差损失函数
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()  # 使用交叉熵损失函数
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()  # 使用带 logits 的二元交叉熵损失函数
                loss = loss_fct(logits, labels)

        # 如果 return_dict 为 False，返回带有 logits 和其他输出的元组
        if not return_dict:
            output = (logits,) + outputs[2:]  # 将 logits 和额外的输出合并为元组
            return ((loss,) + output) if loss is not None else output

        # 如果 return_dict 为 True，返回 SequenceClassifierOutput 对象
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    I-BERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    IBERT_START_DOCSTRING,
)
class IBertForMultipleChoice(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # 初始化 I-BERT 模型
        self.ibert = IBertModel(config)
        # Dropout 层，用于随机失活
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # 分类器，线性层，将隐藏状态映射到单个输出值
        self.classifier = nn.Linear(config.hidden_size, 1)

        # 初始化权重并进行最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
):
        ) -> Union[MultipleChoiceModelOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # 根据 `return_dict` 是否为 `None` 确定是否使用配置中的 `use_return_dict`
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 获取输入 `input_ids` 的第二维度大小作为 `num_choices`
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # 将 `input_ids`, `position_ids`, `token_type_ids`, `attention_mask`, `inputs_embeds` 扁平化处理
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        # 调用 `ibert` 模型，传入扁平化的参数，返回模型的输出结果
        outputs = self.ibert(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 获取汇聚的输出
        pooled_output = outputs[1]

        # 对汇聚的输出应用 dropout
        pooled_output = self.dropout(pooled_output)
        # 使用分类器得出 logits
        logits = self.classifier(pooled_output)
        # 重塑 logits 的形状，以适应多项选择的结构
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        # 如果存在 `labels`，计算交叉熵损失
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        # 如果 `return_dict` 为 False，则返回扁平化后的输出和额外的隐藏状态
        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 否则，返回一个包含损失、重塑后的 logits、隐藏状态和注意力的 `MultipleChoiceModelOutput` 对象
        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    I-BERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    IBERT_START_DOCSTRING,
)
class IBertForTokenClassification(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels  # 从配置中获取标签数量

        self.ibert = IBertModel(config, add_pooling_layer=False)  # 初始化基于IBert的模型，不包含池化层
        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # 使用配置中的dropout概率初始化dropout层
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # 使用隐藏层大小和标签数量初始化分类器线性层

        # 初始化权重并进行最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用IBert模型的forward方法，传递参数并获取输出
        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]  # 获取IBert模型的输出序列

        sequence_output = self.dropout(sequence_output)  # 应用dropout层到序列输出上
        logits = self.classifier(sequence_output)  # 应用分类器线性层到序列输出上，得到logits

        loss = None
        if labels is not None:
            # 如果提供了标签，计算交叉熵损失
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            # 如果不要求返回字典形式的输出，按原始格式输出
            output = (logits,) + outputs[2:]  # 将logits和其他输出状态组合起来
            return ((loss,) + output) if loss is not None else output

        # 如果要求返回字典形式的输出，构建TokenClassifierOutput对象并返回
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
class IBertClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        # 定义一个全连接层，输入和输出维度都是 config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 定义一个 dropout 层，用于防止过拟合，dropout 概率为 config.hidden_dropout_prob
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # 定义一个全连接层，输入维度为 config.hidden_size，输出维度为 config.num_labels
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        # 从 features 中获取每个样本的第一个 token 的隐藏状态，相当于取 [CLS] token
        hidden_states = features[:, 0, :]
        # 对隐藏状态进行 dropout
        hidden_states = self.dropout(hidden_states)
        # 将 dropout 后的隐藏状态输入全连接层进行线性变换
        hidden_states = self.dense(hidden_states)
        # 对全连接层的输出应用 tanh 激活函数
        hidden_states = torch.tanh(hidden_states)
        # 再次对隐藏状态进行 dropout
        hidden_states = self.dropout(hidden_states)
        # 将 dropout 后的隐藏状态输入最终的全连接层进行线性变换，得到模型的输出
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


@add_start_docstrings(
    """
    I-BERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    IBERT_START_DOCSTRING,
)
class IBertForQuestionAnswering(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # 保存标签数量
        self.num_labels = config.num_labels

        # 初始化 I-BERT 模型，不加入 pooling 层
        self.ibert = IBertModel(config, add_pooling_layer=False)
        # 定义一个全连接层，用于生成问题回答的输出
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # 初始化模型权重并应用最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(IBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[QuestionAnsweringModelOutput, Tuple[torch.FloatTensor]]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # 设置返回字典是否已经指定，如果未指定则使用模型配置中的设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用模型的前向传播，获取模型输出
        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从模型输出中获取序列输出
        sequence_output = outputs[0]

        # 将序列输出传入问答头部，获取起始和结束 logits
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # 如果 start_positions 或 end_positions 是多维的，在第一个维度上进行压缩
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # 忽略超出模型输入的起始/结束位置
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # 定义交叉熵损失函数，忽略指定的索引
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            # 计算起始和结束位置的平均损失
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            # 如果不返回字典，则输出损失和 logits 等信息
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        # 返回一个 QuestionAnsweringModelOutput 对象，包括损失、起始和结束 logits，以及其他隐藏状态和注意力信息
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# 根据输入的 `input_ids` 生成对应的位置标识符。非填充符号被替换为它们的位置数字，位置数字从 `padding_idx+1` 开始计数。
# 填充符号被忽略。此函数改编自 fairseq 的 *utils.make_positions*。

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's *utils.make_positions*.

    Args:
    input_ids (`torch.LongTensor`):
           Indices of input sequence tokens in the vocabulary.

    Returns: torch.Tensor
    """
    # 使用 input_ids.ne(padding_idx) 生成一个 mask，标记非填充符号为 1，填充符号为 0
    mask = input_ids.ne(padding_idx).int()
    # 在每行中计算累积的非填充符号数量，类型转换为与 mask 相同的类型，然后加上 past_key_values_length
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    # 将 incremental_indices 转换为长整型（torch.long），然后加上 padding_idx 得到最终的位置标识符
    return incremental_indices.long() + padding_idx

`.\models\ibert\quant_modules.py`

# 设置文件编码为 UTF-8
# 版权声明，包括作者信息和版权信息
# 版权所有 (c) 2021, NVIDIA CORPORATION. 保留所有权利。
#
# 根据 Apache 许可证 2.0 版本使用本文件
# 除非符合许可证规定，否则不得使用本文件
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“原样”分发本软件
# 没有任何明示或暗示的担保或条件
# 有关特定语言的权限，请参阅许可证

import decimal  # 导入 decimal 库

import numpy as np  # 导入 numpy 库
import torch  # 导入 torch 库
from torch import nn  # 从 torch 中导入 nn 模块
from torch.autograd import Function  # 从 torch.autograd 中导入 Function 类

from ...utils import logging  # 从相对路径中导入 logging 模块

# 获取 logger 对象，用于记录日志信息
logger = logging.get_logger(__name__)


class QuantEmbedding(nn.Module):
    """
    `torch.nn.Embedding` 的量化版本。在 `torch.nn.Embedding` 的基础上增加了量化特定的参数。

    Args:
        weight_bit (`int`, *optional*, defaults to `8`):
            权重的量化位宽。
        momentum (`float`, *optional*, defaults to `0.95`):
            更新激活量化范围的动量。
        quant_mode (`bool`, *optional*, defaults to `False`):
            是否对该层进行量化。
    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        padding_idx=None,
        max_norm=None,
        norm_type=2.0,
        scale_grad_by_freq=False,
        sparse=False,
        _weight=None,
        weight_bit=8,
        momentum=0.95,
        quant_mode=False,
    ):
        super().__init__()
        self.num_ = num_embeddings  # 设置 num_ 属性为 num_embeddings
        self.dim = embedding_dim  # 设置 dim 属性为 embedding_dim
        self.padding_idx = padding_idx  # 设置 padding_idx 属性
        self.max_norm = max_norm  # 设置 max_norm 属性
        self.norm_type = norm_type  # 设置 norm_type 属性
        self.scale_grad_by_freq = scale_grad_by_freq  # 设置 scale_grad_by_freq 属性
        self.sparse = sparse  # 设置 sparse 属性

        self.weight = nn.Parameter(torch.zeros([num_embeddings, embedding_dim]))  # 初始化权重参数
        self.register_buffer("weight_scaling_factor", torch.zeros(1))  # 注册缓冲区 weight_scaling_factor
        self.register_buffer("weight_integer", torch.zeros_like(self.weight))  # 注册缓冲区 weight_integer

        self.weight_bit = weight_bit  # 设置 weight_bit 属性
        self.momentum = momentum  # 设置 momentum 属性
        self.quant_mode = quant_mode  # 设置 quant_mode 属性
        self.percentile_mode = False  # 设置 percentile_mode 属性为 False
        self.weight_function = SymmetricQuantFunction.apply  # 设置 weight_function 属性为 SymmetricQuantFunction.apply
    # 定义前向传播函数，用于模型的正向计算
    def forward(self, x, positions=None, incremental_state=None):
        # 如果不处于量化模式，则直接返回原始的嵌入结果和空的状态
        if not self.quant_mode:
            return (
                nn.functional.embedding(
                    x,
                    self.weight,
                    self.padding_idx,
                    self.max_norm,
                    self.norm_type,
                    self.scale_grad_by_freq,
                    self.sparse,
                ),
                None,
            )

        # 获取模型的权重
        w = self.weight
        # 分离权重数据并进行转换
        w_transform = w.data.detach()
        # 计算权重数据的最小值，并扩展为1维张量
        w_min = w_transform.min().expand(1)
        # 计算权重数据的最大值，并扩展为1维张量
        w_max = w_transform.max().expand(1)

        # 计算权重的对称线性量化参数
        self.weight_scaling_factor = symmetric_linear_quantization_params(self.weight_bit, w_min, w_max, False)
        # 使用量化函数将浮点权重转换为整数权重
        self.weight_integer = self.weight_function(
            self.weight, self.weight_bit, self.percentile_mode, self.weight_scaling_factor
        )

        # 使用整数权重进行嵌入操作
        emb_int = nn.functional.embedding(
            x,
            self.weight_integer,
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )
        # 返回量化后的嵌入结果乘以权重的缩放因子，以及权重的缩放因子本身
        return emb_int * self.weight_scaling_factor, self.weight_scaling_factor
class QuantAct(nn.Module):
    """
    Quantizes the given activation.

    Args:
        activation_bit (`int`):
            Bitwidth for the quantized activation.
        act_range_momentum (`float`, *optional*, defaults to `0.95`):
            Momentum for updating the activation quantization range.
        per_channel (`bool`, *optional*, defaults to `False`):
            Whether to or not use channel-wise quantization.
        channel_len (`int`, *optional*):
            Specify the channel length when set the *per_channel* True.
        quant_mode (`bool`, *optional`, defaults to `False`):
            Whether or not the layer is quantized.
    """

    def __init__(self, activation_bit, act_range_momentum=0.95, per_channel=False, channel_len=None, quant_mode=False):
        super().__init__()

        self.activation_bit = activation_bit  # 设置激活量化的位宽
        self.act_range_momentum = act_range_momentum  # 激活量化范围动量更新的动量
        self.quant_mode = quant_mode  # 层是否量化的标志
        self.per_channel = per_channel  # 是否进行通道-wise的量化
        self.percentile = False  # 百分位数是否激活的标志
        self.act_function = SymmetricQuantFunction.apply  # 使用的量化函数

        if not self.per_channel:
            # 如果不是每个通道独立量化，则注册缓冲区
            self.register_buffer("x_min", torch.zeros(1))
            self.register_buffer("x_max", torch.zeros(1))
            self.register_buffer("act_scaling_factor", torch.zeros(1))
            self.x_min -= 1e-5  # 调整最小值的初始化偏移
            self.x_max += 1e-5  # 调整最大值的初始化偏移
        else:
            # 目前不支持通道-wise模式的量化
            raise NotImplementedError("per-channel mode is not currently supported for activation.")

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(activation_bit={self.activation_bit}, "
            f"quant_mode: {self.quant_mode}, Act_min: {self.x_min.item():.2f}, "
            f"Act_max: {self.x_max.item():.2f})"
        )

    def forward(
        self,
        x,
        pre_act_scaling_factor=None,
        identity=None,
        identity_scaling_factor=None,
        specified_min=None,
        specified_max=None,
        ):
            # 根据标识(identity)是否为空来确定是否对输入进行偏移操作
            x_act = x if identity is None else identity + x
            # 如果处于训练模式，则收集运行时的统计信息
            if self.training:
                # 断言检查，确保激活量化模式下不支持百分位模式和按通道模式
                assert not self.percentile, "percentile mode is not currently supported for activation."
                assert not self.per_channel, "per-channel mode is not currently supported for activation."
                # 计算激活值张量的最小值和最大值
                x_min = x_act.data.min()
                x_max = x_act.data.max()

                # 断言检查，确保计算激活值的最小和最大时未检测到NaN值
                assert (
                    x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0
                ), "NaN detected when computing min/max of the activation"

                # 初始化过程
                if self.x_min.min() > -1.1e-5 and self.x_max.max() < 1.1e-5:
                    # 更新活动范围的最小值和最大值
                    self.x_min = self.x_min + x_min
                    self.x_max = self.x_max + x_max

                # 指数移动平均 (EMA)
                # 使用动量以防止量化值在每次迭代中发生显著变化
                elif self.act_range_momentum == -1:
                    self.x_min = torch.min(self.x_min, x_min)
                    self.x_max = torch.max(self.x_max, x_max)
                else:
                    self.x_min = self.x_min * self.act_range_momentum + x_min * (1 - self.act_range_momentum)
                    self.x_max = self.x_max * self.act_range_momentum + x_max * (1 - self.act_range_momentum)

        if not self.quant_mode:
            # 如果不处于量化模式，则直接返回经过激活函数处理后的值和空的量化参数
            return x_act, None

        # 根据指定的最小值和最大值或者默认的活动范围来计算活动缩放因子
        x_min = self.x_min if specified_min is None else specified_min
        x_max = self.x_max if specified_max is None else specified_max

        # 计算对应的对称线性量化参数
        self.act_scaling_factor = symmetric_linear_quantization_params(
            self.activation_bit, x_min, x_max, per_channel=self.per_channel
        )

        if pre_act_scaling_factor is None:
            # 如果没有预先计算的激活值缩放因子，则进行输入的量化操作
            quant_act_int = self.act_function(x, self.activation_bit, self.percentile, self.act_scaling_factor)
        else:
            # 否则，使用固定点乘法进行量化操作
            quant_act_int = FixedPointMul.apply(
                x,
                pre_act_scaling_factor,
                self.activation_bit,
                self.act_scaling_factor,
                identity,
                identity_scaling_factor,
            )

        # 计算正确的输出缩放因子，用于量化后的激活值
        correct_output_scale = self.act_scaling_factor.view(-1)

        return quant_act_int * correct_output_scale, self.act_scaling_factor
# 定义一个自定义的量化线性层，继承自 `torch.nn.Module`
class QuantLinear(nn.Module):
    """
    Quantized version of `torch.nn.Linear`. Adds quantization-specific arguments on top of `torch.nn.Linear`.

    Args:
        weight_bit (`int`, *optional*, defaults to `8`):
            Bitwidth for the quantized weight.
        bias_bit (`int`, *optional*, defaults to `32`):
            Bitwidth for the quantized bias.
        per_channel (`bool`, *optional*, defaults to `False`):
            Whether or not to use channel-wise quantization.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
    """

    # 初始化函数，设置量化线性层的参数和缓冲区
    def __init__(
        self, in_features, out_features, bias=True, weight_bit=8, bias_bit=32, per_channel=False, quant_mode=False
    ):
        super().__init__()
        # 设置输入和输出特征数
        self.in_features = in_features
        self.out_features = out_features

        # 初始化权重参数，并注册缓冲区 weight_integer 用于量化后的权重存储
        self.weight = nn.Parameter(torch.zeros([out_features, in_features]))
        self.register_buffer("weight_integer", torch.zeros_like(self.weight))
        # 初始化缩放因子，对每个输出特征都有一个缩放因子
        self.register_buffer("fc_scaling_factor", torch.zeros(self.out_features))
        
        # 如果有偏置项，则初始化偏置参数，并注册缓冲区 bias_integer 用于量化后的偏置存储
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_features))
            self.register_buffer("bias_integer", torch.zeros_like(self.bias))

        # 设置权重和偏置的位宽，量化模式，是否使用通道级量化等属性
        self.weight_bit = weight_bit
        self.quant_mode = quant_mode
        self.per_channel = per_channel
        self.bias_bit = bias_bit
        self.quant_mode = quant_mode  # 设置量化模式
        self.percentile_mode = False  # 百分位模式，这里未启用
        self.weight_function = SymmetricQuantFunction.apply  # 设置权重量化函数

    # 返回对象的字符串表示，包含量化参数信息
    def __repr__(self):
        s = super().__repr__()
        s = f"({s} weight_bit={self.weight_bit}, quant_mode={self.quant_mode})"
        return s
    # 定义前向传播函数，接受输入 x 和可选的前一层激活量缩放因子 prev_act_scaling_factor
    def forward(self, x, prev_act_scaling_factor=None):
        # 如果不处于量化模式下，直接调用 PyTorch 的线性层函数进行前向传播
        if not self.quant_mode:
            return nn.functional.linear(x, weight=self.weight, bias=self.bias), None

        # 断言 prev_act_scaling_factor 是一个标量张量
        assert prev_act_scaling_factor is not None and prev_act_scaling_factor.shape == (1,), (
            "Input activation to the QuantLinear layer should be globally (non-channel-wise) quantized. "
            "Please add a QuantAct layer with `per_channel = True` before this QuantAct layer"
        )

        # 获取权重张量
        w = self.weight
        # 分离权重数据，并且不再追踪计算图
        w_transform = w.data.detach()
        
        # 如果按通道量化
        if self.per_channel:
            # 计算每个通道的最小值和最大值
            w_min, _ = torch.min(w_transform, dim=1, out=None)
            w_max, _ = torch.max(w_transform, dim=1, out=None)
        else:
            # 计算整个权重张量的最小值和最大值，并扩展为包含一个元素的张量
            w_min = w_transform.min().expand(1)
            w_max = w_transform.max().expand(1)

        # 计算量化参数，根据权重位数、最小值、最大值和是否按通道量化
        self.fc_scaling_factor = symmetric_linear_quantization_params(self.weight_bit, w_min, w_max, self.per_channel)
        # 计算量化后的整数权重
        self.weight_integer = self.weight_function(
            self.weight, self.weight_bit, self.percentile_mode, self.fc_scaling_factor
        )

        # 计算偏置项的缩放因子
        bias_scaling_factor = self.fc_scaling_factor * prev_act_scaling_factor

        # 如果存在偏置项
        if self.bias is not None:
            # 计算量化后的整数偏置项
            self.bias_integer = self.weight_function(self.bias, self.bias_bit, False, bias_scaling_factor)

        # 将 prev_act_scaling_factor 重塑为形状为 (1, -1) 的张量，并用它对输入 x 进行缩放
        prev_act_scaling_factor = prev_act_scaling_factor.view(1, -1)
        x_int = x / prev_act_scaling_factor

        # 使用量化后的整数权重和偏置项进行线性变换，并乘以偏置项的缩放因子
        return (
            nn.functional.linear(x_int, weight=self.weight_integer, bias=self.bias_integer) * bias_scaling_factor,
            bias_scaling_factor,
        )
class IntGELU(nn.Module):
    """
    Quantized version of `torch.nn.GELU`. Adds quantization-specific arguments on top of `torch.nn.GELU`.

    Args:
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
        force_dequant (`str`, *optional*, defaults to `"none"`):
            Force dequantize the layer if either "gelu" or "nonlinear" is given.
    """

    def __init__(self, quant_mode=True, force_dequant="none"):
        super().__init__()
        self.quant_mode = quant_mode  # 初始化量化模式标志，默认为 True

        if force_dequant in ["nonlinear", "gelu"]:
            logger.info("Force dequantize gelu")
            self.quant_mode = False  # 如果 force_dequant 参数为 "nonlinear" 或 "gelu"，强制取消量化模式

        if not self.quant_mode:
            self.activation_fn = nn.GELU()  # 如果未使用量化模式，则使用 nn.GELU 激活函数

        self.k = 1.4142  # 常数 k，用于计算缩放因子
        self.const = 14  # 虚拟的整数常数
        self.coeff = [-0.2888, -1.769, 1]  # 系数数组 [a, b, c]，用于计算整数误差函数
        self.coeff[2] /= self.coeff[0]  # 系数归一化处理

    def int_erf(self, x_int, scaling_factor):
        b_int = torch.floor(self.coeff[1] / scaling_factor)  # 计算 b 的整数值
        c_int = torch.floor(self.coeff[2] / scaling_factor**2)  # 计算 c 的整数值
        sign = torch.sign(x_int)  # 计算 x_int 的符号

        abs_int = torch.min(torch.abs(x_int), -b_int)  # 取绝对值并截断到 -b_int
        y_int = sign * ((abs_int + b_int) ** 2 + c_int)  # 计算整数误差函数

        scaling_factor = scaling_factor**2 * self.coeff[0]  # 更新缩放因子的平方乘以系数 a

        # 避免溢出，通过右移操作
        y_int = floor_ste.apply(y_int / 2**self.const)  # 使用 floor_ste 函数进行右移处理
        scaling_factor = scaling_factor * 2**self.const  # 更新缩放因子

        return y_int, scaling_factor  # 返回整数误差函数值和更新后的缩放因子

    def forward(self, x, scaling_factor=None):
        if not self.quant_mode:
            return self.activation_fn(x), None  # 如果未使用量化模式，直接返回激活函数处理后的结果

        x_int = x / scaling_factor  # 计算 x 的整数值
        sigmoid_int, sigmoid_scaling_factor = self.int_erf(x_int, scaling_factor / self.k)  # 计算整数误差函数

        shift_int = 1.0 // sigmoid_scaling_factor  # 计算整数误差函数的偏移量

        x_int = x_int * (sigmoid_int + shift_int)  # 应用整数误差函数和偏移量对 x_int 进行处理
        scaling_factor = scaling_factor * sigmoid_scaling_factor / 2  # 更新缩放因子

        return x_int * scaling_factor, scaling_factor  # 返回处理后的整数值和更新后的缩放因子


class IntSoftmax(nn.Module):
    """
    Quantized version of `torch.nn.Softmax`. Adds quantization-specific arguments on top of `torch.nn.Softmax`.

    Args:
        output_bit (`int`):
            Bitwidth for the layer output activation.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
        force_dequant (`str`, *optional*, defaults to `"none"`):
            Force dequantize the layer if either "softmax" or "nonlinear" is given.
    """
    # 初始化函数，设置输出位数、量化模式和强制去量化模式
    def __init__(self, output_bit, quant_mode=False, force_dequant="none"):
        # 调用父类初始化函数
        super().__init__()
        # 设置输出位数
        self.output_bit = output_bit
        # 最大位数设为32
        self.max_bit = 32
        # 设置量化模式
        self.quant_mode = quant_mode

        # 如果强制去量化模式为"nonlinear"或"softmax"
        if force_dequant in ["nonlinear", "softmax"]:
            # 输出日志信息
            logger.info("Force dequantize softmax")
            # 强制取消量化模式设为False
            self.quant_mode = False

        # 初始化量化操作对象，16为输入量化位数
        self.act = QuantAct(16, quant_mode=self.quant_mode)
        # 设置常数x0为-ln2
        self.x0 = -0.6931  # -ln2
        # 设置常数const为30，用作虚拟整数常量
        self.const = 30  # dummy integer constant
        # 设置多项式系数为ax**2 + bx + c，其中a为1.0，b为0.35815147，c为0.96963238
        self.coef = [0.35815147, 0.96963238, 1.0]
        # 根据a对b和c进行归一化处理
        self.coef[1] /= self.coef[0]
        self.coef[2] /= self.coef[0]

    # 整型多项式函数
    def int_polynomial(self, x_int, scaling_factor):
        # 禁用梯度计算
        with torch.no_grad():
            # 计算系数b_int和c_int
            b_int = torch.floor(self.coef[1] / scaling_factor)
            c_int = torch.floor(self.coef[2] / scaling_factor**2)
        # 计算多项式结果z
        z = (x_int + b_int) * x_int + c_int
        # 更新缩放因子为多项式系数乘以原缩放因子的平方
        scaling_factor = self.coef[0] * scaling_factor**2
        return z, scaling_factor

    # 整型指数函数
    def int_exp(self, x_int, scaling_factor):
        # 禁用梯度计算
        with torch.no_grad():
            # 计算整数化的x0_int
            x0_int = torch.floor(self.x0 / scaling_factor)
        # 限制x_int的最小值为常数const乘以x0_int
        x_int = torch.max(x_int, self.const * x0_int)

        # 计算q和r
        q = floor_ste.apply(x_int / x0_int)
        r = x_int - x0_int * q
        # 计算指数整数和缩放因子
        exp_int, exp_scaling_factor = self.int_polynomial(r, scaling_factor)
        # 对指数整数进行修剪并缩放
        exp_int = torch.clamp(floor_ste.apply(exp_int * 2 ** (self.const - q)), min=0)
        scaling_factor = exp_scaling_factor / 2**self.const
        return exp_int, scaling_factor

    # 前向传播函数
    def forward(self, x, scaling_factor):
        # 如果非量化模式，直接返回softmax函数结果和空值
        if not self.quant_mode:
            return nn.functional.softmax(x, dim=-1), None

        # 计算整数化的输入x_int
        x_int = x / scaling_factor

        # 计算x_int的最大值和更新x_int
        x_int_max, _ = x_int.max(dim=-1, keepdim=True)
        x_int = x_int - x_int_max

        # 计算指数整数和指数缩放因子
        exp_int, exp_scaling_factor = self.int_exp(x_int, scaling_factor)

        # 避免溢出
        exp, exp_scaling_factor = self.act(exp_int, exp_scaling_factor)
        exp_int = exp / exp_scaling_factor

        # 计算指数整数的总和
        exp_int_sum = exp_int.sum(dim=-1, keepdim=True)
        # 计算因子
        factor = floor_ste.apply(2**self.max_bit / exp_int_sum)
        # 对指数整数进行修剪并缩放
        exp_int = floor_ste.apply(exp_int * factor / 2 ** (self.max_bit - self.output_bit))
        scaling_factor = 1 / 2**self.output_bit
        return exp_int * scaling_factor, scaling_factor
    """
    Quantized version of `torch.nn.LayerNorm`. Adds quantization-specific arguments on top of `torch.nn.LayerNorm`.

    Args:
        normalized_shape (`int` or `list` or `torch.Size`):
            Shape of the input tensor over which normalization is applied.
        eps (`float`):
            Small value added to the denominator for numerical stability.
        output_bit (`int`, *optional*, defaults to `8`):
            Bitwidth for the layer output activation.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
        force_dequant (`str`, *optional*, defaults to `"none"`):
            If set to `"layernorm"` or `"nonlinear"`, forces dequantization of the layer.

    Attributes:
        weight (`torch.nn.Parameter`):
            Learnable parameter representing the scaling factor.
        bias (`torch.nn.Parameter`):
            Learnable parameter representing the bias.
        shift (`torch.Tensor`):
            Buffer holding the shift value for dynamic adjustment.
        output_bit (`int`):
            Bitwidth for the layer output activation.
        max_bit (`int`):
            Maximum allowable bitwidth for quantization.
        dim_sqrt (`None`):
            Placeholder for the square root of the dimension, initially `None`.
        activation (`QuantAct`):
            Instance of `QuantAct` for quantization-aware activation.

    Methods:
        set_shift(self, y_int):
            Adjusts `self.shift` based on the input tensor `y_int`.
        overflow_fallback(self, y_int):
            Handles overflow during training and adjusts `self.shift` accordingly.

    Notes:
        - This class extends `torch.nn.Module` and integrates quantization-specific features.
        - It manages parameters for scaling and bias, quantization mode, and dynamic shift adjustments.
        - The `QuantAct` instance `activation` handles activation quantization within the layer.
    """

    def __init__(self, normalized_shape, eps, output_bit=8, quant_mode=False, force_dequant="none"):
        super().__init__()
        # Initialize attributes related to normalization
        self.normalized_shape = normalized_shape
        self.eps = eps

        # Initialize learnable parameters
        self.weight = nn.Parameter(torch.zeros(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))

        # Manage quantization mode, with option for forced dequantization
        self.quant_mode = quant_mode
        if force_dequant in ["nonlinear", "layernorm"]:
            logger.info("Force dequantize layernorm")
            self.quant_mode = False

        # Buffer for dynamic shift adjustment
        self.register_buffer("shift", torch.zeros(1))
        
        # Configure output bitwidth and related parameters
        self.output_bit = output_bit
        self.max_bit = 32
        self.dim_sqrt = None
        
        # Quantized activation function
        self.activation = QuantAct(self.output_bit, quant_mode=self.quant_mode)

    def set_shift(self, y_int):
        """
        Adjusts `self.shift` based on the input tensor `y_int`.

        Args:
            y_int (`torch.Tensor`):
                Integer tensor representing the quantized activation values.
        """
        with torch.no_grad():
            y_sq_int = y_int**2
            var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
            shift = (torch.log2(torch.sqrt(var_int / 2**self.max_bit)).ceil()).max()
            shift_old = self.shift
            self.shift = torch.max(self.shift, shift)
            logger.info(f"Dynamic shift adjustment: {int(shift_old)} -> {int(self.shift)}")

    def overflow_fallback(self, y_int):
        """
        Handles overflow during training and adjusts `self.shift` accordingly.

        Args:
            y_int (`torch.Tensor`):
                Integer tensor representing the quantized activation values.

        Returns:
            `torch.Tensor`: Tensor representing the adjusted variance after shift.
        """
        self.set_shift(y_int)  # adjusts `self.shift`
        y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
        y_sq_int = y_int_shifted**2
        var_int = torch.sum(y_sq_int, axis=2, keepdim=True)
        return var_int
    # 定义前向传播函数，接受输入张量 x 和可选的缩放因子 scaling_factor
    def forward(self, x, scaling_factor=None):
        # 如果不是量化模式
        if not self.quant_mode:
            # 计算输入张量 x 沿第二个轴的均值
            mean = x.mean(axis=2, keepdim=True)
            # 对输入张量进行均值中心化
            y = x - mean
            # 计算中心化后的输入张量的方差
            var = torch.mean(y**2, axis=2, keepdim=True)
            # 根据均值和方差进行标准化处理
            x = y / torch.sqrt(self.eps + var)
            # 对标准化后的张量进行加权和偏移处理
            x = x * self.weight + self.bias
            # 返回处理后的张量和空的 scaling_factor
            return x, None

        # 如果是量化模式，并且还未计算过 feature 维度的平方根
        if self.dim_sqrt is None:
            # 计算 feature 维度的平方根并保存到 self.dim_sqrt 中
            n = torch.tensor(x.shape[2], dtype=torch.float)
            self.dim_sqrt = torch.sqrt(n).to(x.device)

        # 对输入张量 x 进行除以缩放因子的量化
        x_int = x / scaling_factor
        # 计算量化后的输入张量沿第二个轴的均值并四舍五入
        mean_int = round_ste.apply(x_int.mean(axis=2, keepdim=True))
        # 对量化后的输入张量进行均值中心化
        y_int = x_int - mean_int
        # 将中心化后的量化张量按照指定的位移因子进行向下取整操作
        y_int_shifted = floor_ste.apply(y_int / 2**self.shift)
        # 计算量化后的输入张量的平方
        y_sq_int = y_int_shifted**2
        # 计算量化后的输入张量的方差
        var_int = torch.sum(y_sq_int, axis=2, keepdim=True)

        # 如果处于训练阶段，并且检测到方差 var_int 存在溢出
        if self.training:
            # 如果方差 var_int 的最大值超过了 self.max_bit 所指定的阈值
            if var_int.max() >= 2**self.max_bit:
                # 执行溢出处理函数以获取修正后的方差 var_int
                var_int = self.overflow_fallback(y_int)
                # 断言确保修正后的方差 var_int 仍然小于 self.max_bit + 0.1
                assert var_int.max() < 2**self.max_bit + 0.1, (
                    "Error detected in overflow handling: "
                    "`var_int` exceeds `self.max_bit` (the maximum possible bit width)"
                )

        # 待替换为生成相同输出的整数平方根核函数
        std_int = floor_ste.apply(torch.sqrt(var_int)) * 2**self.shift
        # 计算因子，用于缩放输入张量 y_int
        factor = floor_ste.apply(2**31 / std_int)
        # 根据计算得到的因子对输入张量 y_int 进行进一步处理
        y_int = floor_ste.apply(y_int * factor / 2)
        # 计算缩放因子 scaling_factor，用于最终的缩放和偏移
        scaling_factor = self.dim_sqrt / 2**30

        # 缩放和偏移处理
        bias = self.bias.data.detach() / (self.weight.data.detach())
        bias_int = floor_ste.apply(bias / scaling_factor)

        y_int = y_int + bias_int
        scaling_factor = scaling_factor * self.weight
        x = y_int * scaling_factor

        # 返回处理后的张量 x 和最终的 scaling_factor
        return x, scaling_factor
# 计算给定张量中百分位数的最大值和最小值
def get_percentile_min_max(input, lower_percentile, upper_percentile, output_tensor=False):
    """
    Calculate the percentile max and min values in a given tensor

    Args:
        input (`torch.Tensor`):
            The target tensor to calculate percentile max and min.
        lower_percentile (`float`):
            If 0.1, means we return the value of the smallest 0.1% value in the tensor as percentile min.
        upper_percentile (`float`):
            If 99.9, means we return the value of the largest 0.1% value in the tensor as percentile max.
        output_tensor (`bool`, *optional*, defaults to `False`):
            If True, this function returns tensors, otherwise it returns values.

    Returns:
        `Tuple(torch.Tensor, torch.Tensor)`: Percentile min and max value of *input*
    """
    # 获取输入张量的长度
    input_length = input.shape[0]

    # 计算下分位数和上分位数的索引
    lower_index = round(input_length * (1 - lower_percentile * 0.01))
    upper_index = round(input_length * upper_percentile * 0.01)

    # 计算上分位数的值
    upper_bound = torch.kthvalue(input, k=upper_index).values

    # 如果 lower_percentile 为 0，则下分位数设为 0，否则计算下分位数的值
    if lower_percentile == 0:
        lower_bound = upper_bound * 0
        # lower_index += 1
    else:
        lower_bound = -torch.kthvalue(-input, k=lower_index).values

    # 如果不需要输出张量，将下分位数和上分位数转换为标量值
    if not output_tensor:
        lower_bound = lower_bound.item()
        upper_bound = upper_bound.item()
    return lower_bound, upper_bound


def linear_quantize(input, scale, zero_point, inplace=False):
    """
    Quantize single-precision input tensor to integers with the given scaling factor and zeropoint.

    Args:
        input (`torch.Tensor`):
            Single-precision input tensor to be quantized.
        scale (`torch.Tensor`):
            Scaling factor for quantization.
        zero_point (`torch.Tensor`):
            Shift for quantization.
        inplace (`bool`, *optional*, defaults to `False`):
            Whether to compute inplace or not.

    Returns:
        `torch.Tensor`: Linearly quantized value of *input* according to *scale* and *zero_point*.
    """
    # 根据张量维度重新调整 scale 和 zero_point，适用于卷积权重和激活函数
    if len(input.shape) == 4:
        scale = scale.view(-1, 1, 1, 1)
        zero_point = zero_point.view(-1, 1, 1, 1)
    # 根据张量维度重新调整 scale 和 zero_point，适用于线性权重
    elif len(input.shape) == 2:
        scale = scale.view(-1, 1)
        zero_point = zero_point.view(-1, 1)
    else:
        scale = scale.view(-1)
        zero_point = zero_point.view(-1)
    # 执行量化操作：input = float / scale + zero_point
    if inplace:
        input.mul_(1.0 / scale).add_(zero_point).round_()
        return input
    return torch.round(1.0 / scale * input + zero_point)


def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_max, per_channel=False):
    """
    Compute the scaling factor with the given quantization range for symmetric quantization.
    """
    # 在对称量化情况下计算缩放因子，根据给定的量化范围
    # 在这部分，我们不需要进行任何梯度计算，
    # 为了确保这一点，我们使用 torch.no_grad() 来包裹代码块

    with torch.no_grad():
        # 计算量化的范围，使用的比特数为 num_bits
        n = 2 ** (num_bits - 1) - 1

        # 如果 per_channel 为 True，执行以下操作
        if per_channel:
            # 计算每个通道的最大饱和度，并取绝对值
            scale, _ = torch.max(torch.stack([saturation_min.abs(), saturation_max.abs()], dim=1), dim=1)
            # 将 scale 限制在最小值为 1e-8，然后进行量化范围的计算
            scale = torch.clamp(scale, min=1e-8) / n

        else:
            # 计算整体的最大饱和度，并取绝对值
            scale = max(saturation_min.abs(), saturation_max.abs())
            # 将 scale 限制在最小值为 1e-8，然后进行量化范围的计算
            scale = torch.clamp(scale, min=1e-8) / n

    # 返回计算得到的量化因子 scale
    return scale
class SymmetricQuantFunction(Function):
    """
    Class to quantize the given floating-point values using symmetric quantization with given range and bitwidth.
    """

    @staticmethod
    def forward(ctx, x, k, percentile_mode, scale):
        """
        Args:
            x (`torch.Tensor`):
                Floating point tensor to be quantized.
            k (`int`):
                Quantization bitwidth.
            percentile_mode (`bool`):
                Whether or not to use percentile calibration.
            scale (`torch.Tensor`):
                Pre-calculated scaling factor for *x*. Note that the current implementation of SymmetricQuantFunction
                requires pre-calculated scaling factor.

        Returns:
            `torch.Tensor`: Symmetric-quantized value of *input*.
        """
        # Define the zero point as a tensor with value 0.0 on the same device as scale
        zero_point = torch.tensor(0.0).to(scale.device)

        # Calculate the maximum representable integer for given bitwidth k
        n = 2 ** (k - 1) - 1
        
        # Perform linear quantization with the given parameters
        new_quant_x = linear_quantize(x, scale, zero_point, inplace=False)
        
        # Clamp the quantized values to ensure they lie within the representable range
        new_quant_x = torch.clamp(new_quant_x, -n, n - 1)

        # Store scaling factor in context for backward pass
        ctx.scale = scale
        
        return new_quant_x

    @staticmethod
    def backward(ctx, grad_output):
        # Retrieve stored scaling factor from context
        scale = ctx.scale
        
        # Adjust scale shape based on gradient output dimensions
        if len(grad_output.shape) == 4:
            scale = scale.view(-1, 1, 1, 1)
        elif len(grad_output.shape) == 2:
            scale = scale.view(-1, 1)
        else:
            scale = scale.view(-1)
        
        # Return gradient scaled by the inverse of the scaling factor, and None for other arguments
        return grad_output.clone() / scale, None, None, None, None


class floor_ste(Function):
    """
    Straight-through Estimator(STE) for torch.floor()
    """

    @staticmethod
    def forward(ctx, x):
        # Forward pass computes the floor of input tensor x
        return torch.floor(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Backward pass returns the gradient unchanged
        return grad_output.clone()


class round_ste(Function):
    """
    Straight-through Estimator(STE) for torch.round()
    """

    @staticmethod
    def forward(ctx, x):
        # Forward pass computes the round of input tensor x
        return torch.round(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Backward pass returns the gradient unchanged
        return grad_output.clone()


def batch_frexp(inputs, max_bit=31):
    """
    Decompose the scaling factor into mantissa and twos exponent.

    Args:
        scaling_factor (`torch.Tensor`):
            Target scaling factor to decompose.

    Returns:
        ``Tuple(torch.Tensor, torch.Tensor)`: mantisa and exponent
    """

    # Get the shape of the input tensor
    shape_of_input = inputs.size()

    # Flatten the input tensor to 1D
    inputs = inputs.view(-1)

    # Use NumPy's frexp function to decompose each element of the tensor into mantissa and exponent
    output_m, output_e = np.frexp(inputs.cpu().numpy())
    
    # Quantize the mantissa and shift it to fit within max_bit range
    tmp_m = []
    for m in output_m:
        int_m_shifted = int(
            decimal.Decimal(m * (2**max_bit)).quantize(decimal.Decimal("1"), rounding=decimal.ROUND_HALF_UP)
        )
        tmp_m.append(int_m_shifted)
    output_m = np.array(tmp_m)

    # Calculate the exponent in terms of max_bit
    output_e = float(max_bit) - output_e

    # Return the quantized mantissa and exponent tensors reshaped to the original input shape
    return (
        torch.from_numpy(output_m).to(inputs.device).view(shape_of_input),
        torch.from_numpy(output_e).to(inputs.device).view(shape_of_input),
    )
class FixedPointMul(Function):
    """
    Function to perform fixed-point arithmetic that can match integer arithmetic on hardware.

    Args:
        pre_act (`torch.Tensor`):
            Input tensor.
        pre_act_scaling_factor (`torch.Tensor`):
            Scaling factor of the input tensor *pre_act*.
        bit_num (`int`):
            Quantization bitwidth.
        z_scaling_factor (`torch.Tensor`):
            Scaling factor of the output tensor.
        identity (`torch.Tensor`, *optional*):
            Identity tensor, if exists.
        identity_scaling_factor (`torch.Tensor`, *optional*):
            Scaling factor of the identity tensor *identity*, if exists.

    Returns:
        `torch.Tensor`: Output tensor(*pre_act* if *identity* is not given, otherwise the addition of *pre_act* and
        *identity*), whose scale is rescaled to *z_scaling_factor*.
    """

    @staticmethod
    def forward(
        ctx,
        pre_act,
        pre_act_scaling_factor,
        bit_num,
        z_scaling_factor,
        identity=None,
        identity_scaling_factor=None,
    ):
        # Lambda function to reshape input tensor if necessary
        if len(pre_act_scaling_factor.shape) == 3:
            reshape = lambda x: x  # noqa: E731
        else:
            reshape = lambda x: x.view(1, 1, -1)  # noqa: E731
        
        # Store identity tensor in the context
        ctx.identity = identity

        # Maximum representable integer in fixed-point representation
        n = 2 ** (bit_num - 1) - 1

        # Perform operations with gradients turned off
        with torch.no_grad():
            # Reshape scaling factors
            pre_act_scaling_factor = reshape(pre_act_scaling_factor)
            if identity is not None:
                identity_scaling_factor = reshape(identity_scaling_factor)

            # Store scaling factor of the output tensor in the context
            ctx.z_scaling_factor = z_scaling_factor

            # Quantize input tensor pre_act
            z_int = torch.round(pre_act / pre_act_scaling_factor)
            _A = pre_act_scaling_factor.type(torch.double)
            _B = (z_scaling_factor.type(torch.float)).type(torch.double)
            new_scale = _A / _B
            new_scale = reshape(new_scale)

            # Compute mantissa and exponent using batch_frexp function
            m, e = batch_frexp(new_scale)

            # Compute the output tensor in fixed-point arithmetic
            output = z_int.type(torch.double) * m.type(torch.double)
            output = torch.round(output / (2.0**e))

            # If identity tensor is provided, perform additional fixed-point arithmetic
            if identity is not None:
                wx_int = torch.round(identity / identity_scaling_factor)

                _A = identity_scaling_factor.type(torch.double)
                _B = (z_scaling_factor.type(torch.float)).type(torch.double)
                new_scale = _A / _B
                new_scale = reshape(new_scale)

                m1, e1 = batch_frexp(new_scale)
                output1 = wx_int.type(torch.double) * m1.type(torch.double)
                output1 = torch.round(output1 / (2.0**e1))

                # Sum the outputs of pre_act and identity tensors
                output = output1 + output

            # Clamp the output tensor within the range of representable integers
            return torch.clamp(output.type(torch.float), -n - 1, n)

    @staticmethod
    # 定义反向传播函数，计算梯度
    def backward(ctx, grad_output):
        # 初始化变量用于存储身份梯度
        identity_grad = None
        # 如果上下文中的身份不为None，则计算身份梯度
        if ctx.identity is not None:
            # 克隆梯度输出并除以上下文中的缩放因子，作为身份梯度
            identity_grad = grad_output.clone() / ctx.z_scaling_factor
        # 返回计算得到的梯度，其他返回值为None
        return grad_output.clone() / ctx.z_scaling_factor, None, None, None, None, identity_grad, None

`.\models\ibert\init.py`

# 版权声明和许可声明，指明此代码的版权和使用许可
# 详细许可信息可以在 Apache License, Version 2.0 网页上找到：http://www.apache.org/licenses/LICENSE-2.0
#
# 如果按照许可证的规定，在没有软件的任何保证或条件的情况下分发此软件
# 请查看许可证以了解更多详细信息。

# 引入类型检查模块
from typing import TYPE_CHECKING

# 引入自定义工具函数和模块
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# 定义导入结构字典，用于延迟加载模块
_import_structure = {"configuration_ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig", "IBertOnnxConfig"]}

# 检查是否存在 Torch 库，如果不存在则抛出自定义的依赖未可用异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 Torch 可用，则添加下列模块到导入结构字典中
    _import_structure["modeling_ibert"] = [
        "IBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "IBertForMaskedLM",
        "IBertForMultipleChoice",
        "IBertForQuestionAnswering",
        "IBertForSequenceClassification",
        "IBertForTokenClassification",
        "IBertModel",
        "IBertPreTrainedModel",
    ]

# 如果在类型检查模式下
if TYPE_CHECKING:
    # 从 configuration_ibert 模块中导入指定的类和变量
    from .configuration_ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig, IBertOnnxConfig

    # 再次检查 Torch 是否可用，如果不可用则抛出自定义的依赖未可用异常
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果 Torch 可用，则从 modeling_ibert 模块中导入指定的类和变量
        from .modeling_ibert import (
            IBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            IBertForMaskedLM,
            IBertForMultipleChoice,
            IBertForQuestionAnswering,
            IBertForSequenceClassification,
            IBertForTokenClassification,
            IBertModel,
            IBertPreTrainedModel,
        )

# 如果不在类型检查模式下
else:
    # 导入 sys 模块
    import sys

    # 将当前模块替换为延迟加载模块 _LazyModule 的实例
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\idefics\configuration_idefics.py`

# coding=utf-8
# 声明文件编码格式为UTF-8

# 版权声明和许可证信息，说明此代码基于EleutherAI的GPT-NeoX库，经过修改以适应与Meta AI团队训练的模型的轻微架构差异
# 详细说明了代码的版权信息和许可证，允许在Apache License, Version 2.0下使用此文件

# 导入必要的模块和库
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取与当前模块关联的日志记录器对象
logger = logging.get_logger(__name__)

# 预训练模型配置文件与存档映射表
IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "HuggingFaceM4/idefics-9b": "https://huggingface.co/HuggingFaceM4/idefics-9b/blob/main/config.json",
    "HuggingFaceM4/idefics-80b": "https://huggingface.co/HuggingFaceM4/idefics-80b/blob/main/config.json",
}

# IdeficsVisionConfig类，继承自PretrainedConfig类，用于存储Idefics模型的配置信息
class IdeficsVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """
    # 定义模型类型为 "idefics"
    model_type = "idefics"
    
    # 创建属性映射字典，将 "hidden_size" 映射为 "embed_dim"
    attribute_map = {
        "hidden_size": "embed_dim",
    }
    
    # 初始化函数，定义了模型的参数和默认取值
    def __init__(
        self,
        embed_dim=768,  # 编码器层和池化层的维度，默认为 768
        image_size=224,  # 每个图像的分辨率大小，默认为 224
        intermediate_size=5120,  # Transformer 编码器中"中间"（即前馈）层的维度，默认为 5120
        patch_size=14,  # 每个补丁的大小（分辨率），默认为 14
        num_hidden_layers=32,  # Transformer 编码器中的隐藏层数量，默认为 32
        num_attention_heads=16,  # 每个注意力层中的注意力头数，默认为 16
        num_channels=3,  # 图像通道数，默认为 3
        hidden_act="gelu",  # 编码器和池化器中的非线性激活函数，默认为 "gelu"
        layer_norm_eps=1e-5,  # 层归一化层使用的 epsilon，默认为 1e-5
        attention_dropout=0.0,  # 注意力概率的 dropout 比率，默认为 0.0
        initializer_range=0.02,  # 用于初始化所有权重矩阵的截断正态分布的标准差，默认为 0.02
        initializer_factor=1.0,  # 用于初始化权重矩阵的因子（通常保持为 1.0，仅用于初始化测试中）
        **kwargs,  # 其他参数，未指定的参数会被捕获在这里
        ):
            # 设置嵌入维度
            self.embed_dim = embed_dim
            # 设置图像尺寸
            self.image_size = image_size
            # 设置中间层大小
            self.intermediate_size = intermediate_size
            # 设置patch大小
            self.patch_size = patch_size
            # 设置隐藏层数量
            self.num_hidden_layers = num_hidden_layers
            # 设置注意力头数量
            self.num_attention_heads = num_attention_heads
            # 设置通道数量
            self.num_channels = num_channels
            # 设置层归一化 epsilon 值
            self.layer_norm_eps = layer_norm_eps
            # 设置注意力机制的 dropout 率
            self.attention_dropout = attention_dropout
            # 设置初始化范围
            self.initializer_range = initializer_range
            # 设置初始化因子
            self.initializer_factor = initializer_factor
            # 设置隐藏层激活函数
            self.hidden_act = hidden_act

            # 调用父类的初始化方法
            super().__init__(**kwargs)
class IdeficsConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import IdeficsModel, IdeficsConfig

    >>> # Initializing a Idefics idefics-9b style configuration
    >>> configuration = IdeficsConfig()
    ```

    注释：
    声明一个名为 IdeficsConfig 的配置类，用于存储 `IdeficsModel` 的配置信息。
    该配置类根据指定的参数实例化一个 Idefics 模型，定义模型的架构。
    使用默认参数实例化配置类会产生类似 Idefics-9B 模型的配置。
    例如，[HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b) 提供了相关的预训练模型。

    Configuration objects 继承自 [`PretrainedConfig`]，可用于控制模型的输出。详细信息请参阅 [`PretrainedConfig`] 的文档。
    ```
    >>> # 从 idefics-9b 风格的配置中初始化一个模型
    >>> model = IdeficsModel(configuration)
    
    >>> # 访问模型的配置信息
    >>> configuration = model.config
        ):
        # 初始化函数，设置模型的各项参数
        self.vocab_size = vocab_size
        # 额外词汇表大小
        self.additional_vocab_size = additional_vocab_size
        # 隐藏层大小
        self.hidden_size = hidden_size
        # 中间层大小
        self.intermediate_size = intermediate_size
        # 隐藏层的数量
        self.num_hidden_layers = num_hidden_layers
        # 注意力头的数量
        self.num_attention_heads = num_attention_heads
        # dropout 概率
        self.dropout = dropout
        # 隐藏层激活函数
        self.hidden_act = hidden_act
        # 初始化范围
        self.initializer_range = initializer_range
        # alpha 初始化器
        self.alpha_initializer = alpha_initializer
        # alpha 初始化范围
        self.alphas_initializer_range = alphas_initializer_range
        # alpha 类型
        self.alpha_type = alpha_type
        # RMS 规范化的 epsilon
        self.rms_norm_eps = rms_norm_eps
        # 是否使用缓存
        self.use_cache = use_cache

        # 交叉层间隔
        self.cross_layer_interval = cross_layer_interval
        # qk 层归一化
        self.qk_layer_norms = qk_layer_norms
        # 冻结视觉层
        self.freeze_vision_layers = freeze_vision_layers

        # 冻结文本层
        self.freeze_text_layers = freeze_text_layers
        # 冻结文本模块例外
        self.freeze_text_module_exceptions = freeze_text_module_exceptions
        # 冻结视觉模块例外
        self.freeze_vision_module_exceptions = freeze_vision_module_exceptions
        # 冻结 LM 头部
        self.freeze_lm_head = freeze_lm_head

        # 是否使用重采样器
        self.use_resampler = use_resampler

        # 如果 perceiver_config 为 None，则使用默认配置
        if perceiver_config is None:
            self.perceiver_config = IdeficsPerceiverConfig()
        # 如果 perceiver_config 是字典类型，则使用给定的配置创建 IdeficsPerceiverConfig 对象
        elif isinstance(perceiver_config, dict):
            self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config)
        # 如果 perceiver_config 已经是 IdeficsPerceiverConfig 类型，则直接使用它
        elif isinstance(perceiver_config, IdeficsPerceiverConfig):
            self.perceiver_config = perceiver_config

        # 如果 vision_config 为 None，则使用默认配置
        if vision_config is None:
            self.vision_config = IdeficsVisionConfig()
        # 如果 vision_config 是字典类型，则使用给定的配置创建 IdeficsVisionConfig 对象
        elif isinstance(vision_config, dict):
            self.vision_config = IdeficsVisionConfig(**vision_config)
        # 如果 vision_config 已经是 IdeficsVisionConfig 类型，则直接使用它
        elif isinstance(vision_config, IdeficsVisionConfig):
            self.vision_config = vision_config

        # 调用父类的初始化方法，设置特殊标记的 token ID 和其他参数
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        # 注意：不要在构造函数中进行任何基于 __init__ 参数的检查，
        # 因为 PretrainedConfig.from_dict 首先使用配置字典实例化类，然后
        # 仅在 from_pretrained 中使用 from_pretrained 的 kwargs 更新配置对象，
        # 所以在实例化此对象时，许多属性具有默认值，尚未被覆盖。
        # 请在运行父类的 from_pretrained 后，在 from_pretrained 中执行任何必要的检查。

`.\models\idefics\image_processing_idefics.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Idefics."""

from typing import Callable, Dict, List, Optional, Union

from PIL import Image  # 导入 PIL 库中的 Image 模块

from ...image_processing_utils import BaseImageProcessor, BatchFeature  # 导入自定义的图像处理工具
from ...image_transforms import resize, to_channel_dimension_format  # 导入自定义的图像转换函数
from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)  # 导入图像处理和转换的实用函数
from ...utils import TensorType, is_torch_available  # 导入通用实用函数和 Torch 相关函数

IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]  # 定义 IDEFICS 标准均值
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]  # 定义 IDEFICS 标准标准差


def convert_to_rgb(image):
    # `image.convert("RGB")` 只对 .jpg 图片有效，因为它会为透明图像创建错误的背景。
    # `alpha_composite` 函数处理带有透明通道的图像。
    if image.mode == "RGB":  # 检查图像是否已经是 RGB 模式
        return image

    image_rgba = image.convert("RGBA")  # 将图像转换为 RGBA 模式
    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))  # 创建白色背景图像
    alpha_composite = Image.alpha_composite(background, image_rgba)  # 使用 alpha 合成处理透明通道
    alpha_composite = alpha_composite.convert("RGB")  # 将结果转换回 RGB 模式
    return alpha_composite


class IdeficsImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Idefics image processor.

    Args:
        image_size (`int`, *optional*, defaults to 224):
            Resize to image size
        image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        image_num_channels (`int`, *optional*, defaults to 3):
            Number of image channels.
    """

    model_input_names = ["pixel_values"]  # 模型输入的名称列表，此处只有一个像素值的输入
    # 初始化方法，用于设置图像处理的参数和调用父类的初始化方法
    def __init__(
        self,
        image_size: int = 224,                            # 图像大小，默认为224像素
        image_mean: Optional[Union[float, List[float]]] = None,  # 图像均值，可以是单个数值或列表形式的均值
        image_std: Optional[Union[float, List[float]]] = None,   # 图像标准差，可以是单个数值或列表形式的标准差
        image_num_channels: Optional[int] = 3,             # 图像通道数，默认为3通道（彩色图像）
        **kwargs,                                          # 其他关键字参数
    ) -> None:
        # 调用父类的初始化方法，处理其他传入的关键字参数
        super().__init__(**kwargs)

        # 设置对象的属性值，用于后续图像预处理使用
        self.image_size = image_size                       # 设置图像大小
        self.image_num_channels = image_num_channels       # 设置图像通道数
        self.image_mean = image_mean                       # 设置图像均值
        self.image_std = image_std                         # 设置图像标准差

    # 图像预处理方法，用于对输入图像进行预处理操作
    def preprocess(
        self,
        images: ImageInput,                                # 输入的图像数据，可以是单张图像或批量图像
        image_num_channels: Optional[int] = 3,             # 图像通道数，默认为3通道
        image_size: Optional[Dict[str, int]] = None,       # 图像大小的字典，包含宽和高
        image_mean: Optional[Union[float, List[float]]] = None,  # 图像均值，可以是单个数值或列表形式的均值
        image_std: Optional[Union[float, List[float]]] = None,   # 图像标准差，可以是单个数值或列表形式的标准差
        transform: Callable = None,                        # 图像变换函数，用于额外的图像处理
        **kwargs,                                          # 其他关键字参数

`.\models\idefics\modeling_idefics.py`

# coding=utf-8
# 定义文件编码为UTF-8

# 版权声明和许可证信息，基于Apache License, Version 2.0
# 详细许可证信息可以在http://www.apache.org/licenses/LICENSE-2.0找到

""" PyTorch Idefics model. """
# 导入必要的库和模块
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

# 导入自定义的模块和函数
from ... import PreTrainedModel
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PretrainedConfig
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# 导入IdeficsConfig配置文件
from .configuration_idefics import IdeficsConfig
# 导入IdeficsPerceiverResampler和IdeficsVisionTransformer模块
from .perceiver import IdeficsPerceiverResampler
from .vision import IdeficsVisionTransformer

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# 用于文档的配置示例
_CONFIG_FOR_DOC = "IdeficsConfig"

# 预训练模型的存档列表
IDEFICS_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "HuggingFaceM4/idefics-9b",
    "HuggingFaceM4/idefics-80b",
    # 查看所有Idefics模型 https://huggingface.co/models?filter=idefics
]

@dataclass
# 带有过去键/值的Idefics模型输出的基类，用于加速顺序解码
class IdeficsBaseModelOutputWithPast(ModelOutput):
    """
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
    """
    """
    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的输出隐藏状态序列。

            如果使用了 `past_key_values`，则只输出形状为 `(batch_size, 1, hidden_size)` 的每个序列的最后一个隐藏状态。
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *可选*, 当传入 `use_cache=True` 或 `config.use_cache=True` 时返回):
            长度为 `config.n_layers` 的元组，每个元组包含两个张量，形状为 `(batch_size, num_heads, sequence_length, embed_size_per_head)`。

            包含预计算的隐藏状态（自注意力块中的键和值，以及如果 `config.is_encoder_decoder=True` 在交叉注意力块中也包含），
            可用于加速序列解码。
        hidden_states (`tuple(torch.FloatTensor)`, *可选*, 当传入 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
            元组的 `torch.FloatTensor`（如果模型具有嵌入层，则为嵌入输出的张量 + 每层的输出张量），形状为 `(batch_size, sequence_length, hidden_size)`。

            模型每一层的隐藏状态，加上可选的初始嵌入层输出。
        attentions (`tuple(torch.FloatTensor)`, *可选*, 当传入 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
            元组的 `torch.FloatTensor`（每层一个），形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。

            自注意力头中注意力 softmax 后的注意力权重，用于计算自注意力头中的加权平均值。
        image_hidden_states (`tuple(torch.FloatTensor)`, *可选*):
            元组的 `torch.FloatTensor`（图像嵌入输出的一个，形状为 `(batch_size, num_images, sequence_length, hidden_size)`）。

            模型通过视觉编码器生成的图像隐藏状态，以及通过感知者生成的图像隐藏状态。
    """

    last_hidden_state: torch.FloatTensor = None  # 初始化最后一个隐藏状态
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # 初始化预计算的键和值
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 初始化所有层的隐藏状态
    attentions: Optional[Tuple[torch.FloatTensor]] = None  # 初始化注意力权重
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 初始化图像隐藏状态
@dataclass
class IdeficsCausalLMOutputWithPast(ModelOutput):
    """
    Base class for Idefics causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`).

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    loss: Optional[torch.FloatTensor] = None  # 初始化为可选的 torch.FloatTensor，用于存储语言模型损失
    logits: torch.FloatTensor = None  # 初始化为 torch.FloatTensor，存储语言模型头部的预测分数（softmax之前）
    past_key_values: Optional[List[torch.FloatTensor]] = None  # 初始化为可选的列表，存储预计算的自注意力块中的键值对
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 初始化为可选的元组，存储模型每层的隐藏状态输出
    attentions: Optional[Tuple[torch.FloatTensor]] = None  # 初始化为可选的元组，存储每层的注意力权重
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 初始化为可选的元组，存储视觉编码器产生的图像隐藏状态


def expand_inputs_for_generation(
    input_ids,
    expand_size=1,
    is_encoder_decoder=False,
    attention_mask=None,
    encoder_outputs=None,
    **model_kwargs,
):
    """
    扩展输入以用于生成

    Args:
        input_ids: 输入的 token IDs
        expand_size: 扩展的大小，用于生成的副本数
        is_encoder_decoder: 是否是编码器-解码器结构
        attention_mask: 注意力掩码
        encoder_outputs: 编码器的输出，用于解码器的输入
        **model_kwargs: 其他模型的关键字参数
    """

    expanded_return_idx = (
        torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
    )
    # 使用索引从输入张量中选择特定的行，更新 input_ids 变量
    input_ids = input_ids.index_select(0, expanded_return_idx)

    # 将像素值添加到模型关键字参数中，如果已存在则保持不变
    model_kwargs["pixel_values"] = model_kwargs.get("pixel_values", None)

    # 将图像编码器嵌入向量添加到模型关键字参数中，如果已存在则保持不变
    model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings", None)

    # 将感知器嵌入向量添加到模型关键字参数中，如果已存在则保持不变
    model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings", None)

    # 将图像注意力掩码添加到模型关键字参数中，如果已存在则保持不变
    model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask", None)

    # 如果模型关键字参数中存在 'token_type_ids'，则选择特定行更新其对应值
    if "token_type_ids" in model_kwargs:
        token_type_ids = model_kwargs["token_type_ids"]
        model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx)

    # 如果存在注意力掩码，选择特定行更新模型关键字参数中的 'attention_mask'
    if attention_mask is not None:
        model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)

    # 如果模型关键字参数中的 'image_attention_mask' 不为 None，则选择特定行更新它
    if model_kwargs["image_attention_mask"] is not None:
        model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
            0, expanded_return_idx
        )

    # 如果模型关键字参数中的 'pixel_values' 不为 None，则选择特定行更新它
    if model_kwargs["pixel_values"] is not None:
        model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)

    # 否则，如果 'image_encoder_embeddings' 不为 None，则选择特定行更新它
    elif model_kwargs["image_encoder_embeddings"] is not None:
        model_kwargs["image_encoder_embeddings"] = model_kwargs["image_encoder_embeddings"].index_select(
            0, expanded_return_idx
        )

    # 否则，如果 'perceiver_embeddings' 不为 None，则选择特定行更新它
    elif model_kwargs["perceiver_embeddings"] is not None:
        model_kwargs["perceiver_embeddings"] = model_kwargs["perceiver_embeddings"].index_select(
            0, expanded_return_idx
        )

    # 返回更新后的 input_ids 和 model_kwargs
    return input_ids, model_kwargs
def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
    token_type_ids = kwargs.get("token_type_ids", None)
    # 如果 past_key_values 在 kwargs 中定义，则只使用 input_ids 的最后一个 token
    if past_key_values:
        input_ids = input_ids[:, -1].unsqueeze(-1)
        if token_type_ids is not None:
            token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    if attention_mask is not None and position_ids is None:
        # 为批量生成创建动态的 position_ids
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -1].unsqueeze(-1)

    pixel_values = kwargs.get("pixel_values", None)
    image_encoder_embeddings = kwargs.get("image_encoder_embeddings", None)
    perceiver_embeddings = kwargs.get("perceiver_embeddings", None)
    image_attention_mask = kwargs.get("image_attention_mask", None)
    interpolate_pos_encoding = kwargs.get("interpolate_pos_encoding", False)

    # 返回包含所有输入准备数据的字典
    return {
        "input_ids": input_ids,
        "past_key_values": past_key_values,
        "use_cache": kwargs.get("use_cache"),
        "position_ids": position_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "pixel_values": pixel_values,
        "image_encoder_embeddings": image_encoder_embeddings,
        "perceiver_embeddings": perceiver_embeddings,
        "image_attention_mask": image_attention_mask,
        "interpolate_pos_encoding": interpolate_pos_encoding,
    }


def freeze_model(model, module_exceptions=[]):
    # 映射常见模块类型到 PyTorch 中对应的类
    mapping = {
        "LayerNorm": nn.LayerNorm,
        "Linear": nn.Linear,
        "Embedding": nn.Embedding,
    }
    module_exceptions_mapped = [mapping[m] for m in module_exceptions]
    # 遍历模型的所有模块，冻结除了例外模块之外的所有参数
    for module in model.modules():
        if module_exceptions and any(isinstance(module, t) for t in module_exceptions_mapped):
            module.requires_grad_(True)  # 明确将其设置为 True，避免任何错误
        else:
            module.requires_grad_(False)
    return model


class IdeficsDecoupledEmbedding(nn.Embedding):
    # 源自 https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html#Embedding
    """
    实现参数解耦以允许冻结（或不冻结）嵌入的子集。在实践中，regular `weight` 可以训练或冻结
    （即 `partially_freeze=True`），如果 `num_additional_embeddings > 0`，则会创建
    `num_additional_embeddings` 个额外的始终训练的参数。如果 `num_additional_embeddings=0`，
    则模块默认为 `nn.Embedding` 的常规行为。
    """
    # 初始化函数，用于创建一个新的嵌入层对象
    def __init__(
        self,
        num_embeddings,
        num_additional_embeddings,
        embedding_dim,
        partially_freeze: Optional[bool] = False,
        device=None,
        dtype=None,
        padding_idx=None,
        **kwargs,
    ) -> None:
        """
        Args:
            num_embeddings (`int`):
                Size of the dictionary of embeddings
            num_additional_embeddings (`int`):
                Number of additional embeddings. Only useful when you `partially_freeze=True`.
            embedding_dim (`int`):
                The size of each embedding vector
            partially_freeze: (`bool`, *optional*, defaults to `False`):
                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
            padding_idx (`int`, *optional*):
                The padding index (needs to be less than num_embeddings)

        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
        `max_norm` or `norm_type`. We are not supporting these.
        """
        # 检查 padding_idx 是否有效，必须小于 num_embeddings
        if padding_idx is not None and padding_idx > num_embeddings:
            raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")
        
        # 调用父类 nn.Embedding 的初始化方法，传入大部分参数
        super().__init__(
            num_embeddings=num_embeddings,
            embedding_dim=embedding_dim,
            device=device,
            dtype=dtype,
            padding_idx=padding_idx,
            **kwargs,
        )
        
        # 初始化特定于当前类的成员变量
        self.num_embeddings = num_embeddings
        self.padding_idx = padding_idx
        self.num_additional_embeddings = num_additional_embeddings
        self.partially_freeze = partially_freeze

        # 如果 partially_freeze 为 True，则冻结主要的 weight 参数
        if partially_freeze:
            self.weight.requires_grad_(False)

        # 如果有额外的嵌入向量需求，则创建额外的 nn.Embedding 对象
        if self.num_additional_embeddings > 0:
            self.additional_embedding = nn.Embedding(
                num_embeddings=self.num_additional_embeddings,
                embedding_dim=embedding_dim,
                device=device,
                dtype=dtype,
            )
    def forward(self, input_ids):
        """
        前向传播函数，用于模型的正向计算过程。

        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.
        我们有两个嵌入层，它们有不同的索引范围：
        - 一个是预训练的 self.weight
        - 另一个是正在训练的 self.additional_embedding.weight

        in order to make a lookup of the input ids, we:
        为了查找输入的 id，我们执行以下步骤：
        1. find out the indices of the entries belonging to the 2nd embedding
        1. 找出属于第二个嵌入层的条目的索引
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        2. 提取这些值，同时减去第一个嵌入层的大小（num_embeddings），因为第二个嵌入层的索引从0开始，而不是从num_embeddings开始
        3. perform the 2nd embedding lookup
        3. 执行第二个嵌入层的查找操作
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        4. 现在处理第一个嵌入层，我们用填充索引覆盖属于第二个嵌入层的索引
        5. perform the 1st embedding lookup
        5. 执行第一个嵌入层的查找操作
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup
        6. 现在我们用第二个嵌入层查找的值覆盖第一个嵌入层查找的值

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.
        注意：对于第一个嵌入层的查找，我们本可以只查找低索引而不进行填充，但那样我们就必须创建一个新的张量，并用两个分散在不同索引上的张量填充它 - 也就是不简单的连接操作 - 我还没有对复杂情况进行基准测试，如果更快的话，鉴于序列长度通常相对较短，可能并不更快，或者如果更快，提升也不会很大 - 但是测量一下可能是个好主意。

        """
        if self.num_additional_embeddings == 0:
            return F.embedding(input_ids, self.weight)

        # Clone so that we don't modify the original input_ids later on
        # 克隆 input_ids，以防后续修改原始输入
        input_ids = input_ids.clone()
        
        # Find indices where input_ids belong to the additional embedding
        # 找到 input_ids 中属于额外嵌入层的索引
        additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
        
        # Extract input_ids values that belong to the additional vocabulary
        # 提取属于额外词汇表的 input_ids 值
        input_ids_additional_vocab = input_ids[additional_vocab_indices]
        
        # Perform embedding lookup for additional embeddings
        # 执行额外嵌入层的查找
        additional_embeddings = self.additional_embedding(input_ids_additional_vocab - self.num_embeddings)

        # Set indices of additional vocabulary to 0, as these results will be discarded
        # 将额外词汇表的索引设置为0，因为这些结果将被丢弃
        input_ids[additional_vocab_indices] = 0
        
        # Perform embedding lookup for the main embedding (self.weight)
        # 执行主嵌入层（self.weight）的查找
        full_vector = F.embedding(input_ids, self.weight)

        # Overwrite the records with high indices with values from additional embeddings
        # 用额外嵌入层的值覆盖高索引位置的记录
        full_vector[additional_vocab_indices] = additional_embeddings

        return full_vector

    def extra_repr(self) -> str:
        """
        返回模型的额外信息，用于描述模型的属性。

        Returns:
        返回包含模型属性的字符串
        """
        return "num_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={}".format(
            self.num_embeddings,
            self.num_additional_embeddings,
            self.embedding_dim,
            self.partially_freeze,
        )
class IdeficsDecoupledLinear(nn.Linear):
    # 从 https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear 派生而来的类，实现参数的解耦，允许部分参数冻结或训练。
    """
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        out_additional_features: int = 0,
        bias: bool = True,
        partially_freeze: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        """
        super().__init__(in_features, out_features, bias, device, dtype)
        # 初始化自定义参数
        self.out_additional_features = out_additional_features
        self.partially_freeze = partially_freeze

        self.in_features = in_features
        self.out_features = out_features

        # 如果 partially_freeze 为 True，则冻结权重和偏置的梯度
        if partially_freeze:
            self.weight.requires_grad_(False)
            if bias:
                self.bias.requires_grad_(False)

        # 如果有额外的特征维度要训练，则创建额外的线性层 additional_fc
        if out_additional_features > 0:
            self.additional_fc = nn.Linear(
                in_features=in_features,
                out_features=out_additional_features,
                bias=bias,
                device=device,
                dtype=dtype,
            )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # 执行前向传播，计算线性层的输出
        output = F.linear(input, self.weight, self.bias)

        # 如果有额外的特征维度要处理，则将其连接到输出中
        if self.out_additional_features > 0:
            additional_features = self.additional_fc(input)
            output = torch.cat((output, additional_features), -1)

        return output

    def extra_repr(self) -> str:
        """Overwriting `nn.Linear.extra_repr` to include new parameters."""
        # 重写 `nn.Linear.extra_repr` 方法，以包含新的参数信息
        return "in_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}".format(
            self.in_features,
            self.out_features,
            self.out_additional_features,
            self.bias is not None,
            self.partially_freeze,
        )


# this was adapted from LlamaRMSNorm
class IdeficsRMSNorm(nn.Module):
    # 基于 LlamaRMSNorm 进行了适配
    def __init__(self, hidden_size, eps=1e-6):
        """
        IdeficsRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        # 初始化权重参数为可训练的张量
        self.weight = nn.Parameter(torch.ones(hidden_size))
        # 设置方差 epsilon
        self.variance_epsilon = eps
    # 定义前向传播方法，接收隐藏状态作为输入
    def forward(self, hidden_states):
        # 计算每个隐藏状态的方差，并保持维度不变
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        # 根据方差对隐藏状态进行归一化处理
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # 如果权重的数据类型是半精度浮点数（float16 或 bfloat16），则将隐藏状态转换为相应的数据类型
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        # 返回加权后的归一化隐藏状态
        return self.weight * hidden_states
# 将 IdeficsRMSNorm 类型对象添加到 ALL_LAYERNORM_LAYERS 列表中
ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm)


# 这是从 LlamaRotaryEmbedding 改编而来的类
class IdeficsEmbedding(torch.nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # 计算频率的倒数，用于位置编码
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # 为了使 `torch.jit.trace` 能够正常工作，在这里构建缓存
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        # 设置余弦和正弦缓存
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # 与论文不同，但使用了不同的排列顺序来获得相同的计算结果
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


def rotate_half(x):
    """将输入的隐藏维度的一半进行旋转。"""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


# 从 transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb 复制而来
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """将旋转位置编码应用到查询和键张量上。
    # 通过使用位置索引从余弦向量中选择对应的值，并在指定的维度上进行unsqueeze操作，以便与q和k的形状进行广播。
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    
    # 通过使用位置索引从正弦向量中选择对应的值，并在指定的维度上进行unsqueeze操作，以便与q和k的形状进行广播。
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    
    # 将查询向量q与余弦向量cos相乘并加上查询向量q与正弦向量sin经过rotate_half函数后的乘积，生成旋转后的查询向量。
    q_embed = (q * cos) + (rotate_half(q) * sin)
    
    # 将键向量k与余弦向量cos相乘并加上键向量k与正弦向量sin经过rotate_half函数后的乘积，生成旋转后的键向量。
    k_embed = (k * cos) + (rotate_half(k) * sin)
    
    # 返回旋转后的查询向量和键向量组成的元组。
    return q_embed, k_embed
# 这段代码改编自 LlamaMLP
class IdeficsMLP(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
    ):
        super().__init__()
        # 定义一个线性层，用于门控投影，输入维度为 hidden_size，输出维度为 intermediate_size，无偏置
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        # 定义一个线性层，用于下游投影，输入维度为 intermediate_size，输出维度为 hidden_size，无偏置
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        # 定义一个线性层，用于上游投影，输入维度为 hidden_size，输出维度为 intermediate_size，无偏置
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        # 激活函数为根据 hidden_act 参数选择的激活函数，从全局字典 ACT2FN 中获取
        self.act_fn = ACT2FN[hidden_act]

    def forward(self, x):
        # 执行前向传播，结合门控投影、激活函数和上游投影，然后通过下游投影得到输出
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))


# 这段代码改编自 LlamaAttention
class IdeficsAttention(nn.Module):
    """来自 'Attention Is All You Need' 论文中的多头注意力"""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        dropout: float = 0.0,
        is_cross_attention: bool = False,
        config: PretrainedConfig = None,
        qk_layer_norms: bool = False,
    ):
        super().__init__()
    ):
        super().__init__()  # 调用父类的初始化方法
        self.hidden_size = hidden_size  # 设置模型的隐藏层大小
        self.num_heads = num_heads  # 设置注意力头的数量
        self.head_dim = hidden_size // num_heads  # 计算每个注意力头的维度
        self.dropout = dropout  # 设置dropout的比例
        self.is_causal = True  # 设定是否是因果注意力机制

        if (self.head_dim * num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {num_heads})."
            )  # 检查隐藏层大小是否能够被注意力头数量整除，如果不能则抛出数值错误异常

        self.is_cross_attention = is_cross_attention  # 标记是否是交叉注意力

        if not hasattr(nn.functional, "scaled_dot_product_attention"):
            raise ValueError("this model requires pytorch 2.0 or higher")  # 检查是否支持所需的PyTorch版本

        if self.is_cross_attention:
            kv_input_dim = (
                self.hidden_size if not hasattr(config.vision_config, "embed_dim") else config.vision_config.embed_dim
            )
            self.q_proj = nn.Linear(
                self.hidden_size,
                num_heads * self.head_dim,
                bias=False,
            )  # 创建查询投影层
            self.k_proj = nn.Linear(kv_input_dim, num_heads * self.head_dim, bias=False)  # 创建键投影层
            self.v_proj = nn.Linear(
                kv_input_dim,
                num_heads * self.head_dim,
                bias=False,
            )  # 创建值投影层
        else:
            self.q_proj = nn.Linear(
                self.hidden_size,
                num_heads * self.head_dim,
                bias=False,
            )  # 创建查询投影层
            self.k_proj = nn.Linear(
                self.hidden_size,
                num_heads * self.head_dim,
                bias=False,
            )  # 创建键投影层
            self.v_proj = nn.Linear(
                self.hidden_size,
                num_heads * self.head_dim,
                bias=False,
            )  # 创建值投影层
        self.o_proj = nn.Linear(
            num_heads * self.head_dim,
            hidden_size,
            bias=False,
        )  # 创建输出投影层

        self.rotary_emb = IdeficsEmbedding(self.head_dim)  # 创建旋转嵌入层对象

        self.qk_layer_norms = qk_layer_norms  # 设置是否进行查询和键的层标准化

        if self.qk_layer_norms:
            self.q_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # 创建查询层标准化对象
            self.k_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # 创建键层标准化对象

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
        # 将张量重塑为(batch_size, sequence_length, num_heads, head_dim)，并转置维度以符合注意力机制的需求

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
# this was adapted from LlamaDecoderLayer
# 定义一个名为 IdeficsDecoderLayer 的类，继承自 nn.Module
class IdeficsDecoderLayer(nn.Module):
    def __init__(self, config: IdeficsConfig):
        super().__init__()
        # 初始化隐藏层大小
        self.hidden_size = config.hidden_size
        # 创建自注意力层对象，使用配置中的参数进行初始化
        self.self_attn = IdeficsAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.dropout,
            config=config,
        )
        # 创建MLP对象，使用配置中的参数进行初始化
        self.mlp = IdeficsMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        # 创建输入层归一化对象，使用配置中的隐藏大小和RMS归一化的epsilon参数进行初始化
        self.input_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # 创建注意力后归一化对象，使用配置中的隐藏大小和RMS归一化的epsilon参数进行初始化
        self.post_attention_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # 设置Dropout概率，使用配置中的dropout参数
        self.dropout = config.dropout

    # 定义前向传播方法
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """

        residual = hidden_states  # 保留输入 hidden_states 的原始值，用于后续残差连接

        hidden_states = self.input_layernorm(hidden_states)  # 使用层归一化对输入进行归一化处理

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 对输出进行 dropout 处理
        hidden_states = residual + hidden_states  # 残差连接：将归一化前的输入与经过 self attention 和 dropout 处理后的输出相加

        # Fully Connected
        residual = hidden_states  # 保留上一步操作后的值，用于后续残差连接
        hidden_states = self.post_attention_layernorm(hidden_states)  # 使用层归一化对输出进行归一化处理
        hidden_states = self.mlp(hidden_states)  # 使用全连接层进行线性变换
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 对输出进行 dropout 处理
        hidden_states = residual + hidden_states  # 残差连接：将归一化后的输出与经过 MLP 和 dropout 处理后的输出相加

        outputs = (hidden_states,)  # 将处理后的 hidden_states 放入输出元组中

        if output_attentions:
            outputs += (self_attn_weights,)  # 如果需要输出 attention 权重，则将 self_attn_weights 放入输出元组中

        if use_cache:
            outputs += (present_key_value,)  # 如果需要使用缓存的过去键值状态，则将 present_key_value 放入输出元组中

        return outputs  # 返回包含处理后的结果的元组
# 定义自定义的 gated cross-attention 层，继承自 nn.Module
class IdeficsGatedCrossAttentionLayer(nn.Module):
    # 前向传播方法定义，接收多个输入参数
    def forward(
        self,
        hidden_states: torch.Tensor,  # 输入的隐藏状态张量
        attention_mask: Optional[torch.Tensor] = None,  # 可选的注意力遮罩张量
        image_hidden_states: Optional[torch.Tensor] = None,  # 可选的图像隐藏状态张量
        image_attention_mask: Optional[torch.Tensor] = None,  # 可选的图像注意力遮罩张量
        cross_attention_gate: Optional[torch.Tensor] = None,  # 可选的交叉注意力门控张量
        output_attentions: Optional[bool] = False,  # 是否输出注意力权重的标志
        use_cache: Optional[bool] = False,  # 是否使用缓存的标志
        past_key_value: Optional[Tuple[torch.Tensor]] = None,  # 可选的过去的键值对元组
LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`IdeficsConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
# 定义一个预训练模型类，继承自 PreTrainedModel
class IdeficsPreTrainedModel(PreTrainedModel):
    config_class = IdeficsConfig  # 使用 IdeficsConfig 类作为配置类
    base_model_prefix = "model"  # 基础模型前缀为 "model"
    supports_gradient_checkpointing = True  # 支持梯度检查点
    _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"]  # 不拆分的模块列表
    _supports_sdpa = True  # 支持自动分配并行性加速（Self-Delegated Parallelism Acceleration, SDPA）

    def _init_weights(self, module):
        # 重要提示：这个 Idefics 的移植版本不适用于从头训练，只能用于推理和微调
        # 因此，初始化权重的正确代码已被删除。m4 代码库应该用于从头训练，并包含正确的代码。
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):  # 如果是线性层
            module.weight.data.normal_(mean=0.0, std=std)  # 权重初始化为正态分布
            if module.bias is not None:
                module.bias.data.zero_()  # 如果存在偏置，将其初始化为零
        elif isinstance(module, nn.Embedding):  # 如果是嵌入层
            module.weight.data.normal_(mean=0.0, std=std)  # 权重初始化为正态分布
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()  # 如果存在填充索引，将其初始化为零

    # 从 transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa 适配而来
    @classmethod
    # 定义一个类方法 `_check_and_enable_sdpa`，用于检查并启用 SDPA 注意力机制配置
    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig:
        # 检查是否启用了 `use_bettertransformer` 属性，用于决定是否返回原始配置
        _is_bettertransformer = getattr(cls, "use_bettertransformer", False)
        if _is_bettertransformer:
            return config

        # 如果不仅仅是进行硬性检查，设置注意力实现方式为 "sdpa"
        if not hard_check_only:
            config._attn_implementation = "sdpa"
        # 返回修改后的配置对象
        return config
# 定义一个多行字符串，用于文档化LLaMA输入的说明文档
LLAMA_INPUTS_DOCSTRING = r"""
"""

# 使用装饰器为IdeficsModel类添加文档字符串，在输出原始隐藏状态时不添加特定的顶部头信息
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
# 定义IdeficsModel类，继承自IdeficsPreTrainedModel类
class IdeficsModel(IdeficsPreTrainedModel):
    """
    Transformer解码器，由`config.num_hidden_layers`层组成。每一层是一个[`IdeficsDecoderLayer`]

    Args:
        config: IdeficsConfig
    """

    def __init__(self, config: IdeficsConfig):
        # 调用父类的构造函数进行初始化
        super().__init__(config)
        # 将config参数保存在实例变量中
        self.config = config
        # 设置填充索引为config中定义的pad_token_id
        self.padding_idx = config.pad_token_id
        # 设置词汇表大小为config中定义的vocab_size
        self.vocab_size = config.vocab_size

        # 创建IdeficsDecoupledEmbedding实例，并保存在embed_tokens实例变量中
        self.embed_tokens = IdeficsDecoupledEmbedding(
            num_embeddings=config.vocab_size,
            num_additional_embeddings=config.additional_vocab_size,
            embedding_dim=config.hidden_size,
            partially_freeze=config.freeze_text_layers,
            padding_idx=self.padding_idx,
        )

        # 设置图像尺寸和视觉配置，从config参数中获取
        self.image_size = config.vision_config.image_size
        self.vision_config = config.vision_config
        # 创建IdeficsVisionTransformer实例，并保存在vision_model实例变量中
        self.vision_model = IdeficsVisionTransformer(config.vision_config)

        # 如果config中设置了使用resampler，则创建IdeficsPerceiverResampler实例，并保存在perceiver_resampler实例变量中
        if config.use_resampler:
            perceiver_config = config.perceiver_config
            self.perceiver_resampler = IdeficsPerceiverResampler(
                config,
                config.vision_config.embed_dim,
                perceiver_config.resampler_depth,
                perceiver_config.resampler_n_heads,
                perceiver_config.resampler_head_dim,
                perceiver_config.resampler_n_latents,
            )

        # 创建包含config.num_hidden_layers个IdeficsDecoderLayer实例的模块列表，并保存在layers实例变量中
        self.layers = nn.ModuleList([IdeficsDecoderLayer(config) for _ in range(config.num_hidden_layers)])

        # 设置跨层间隔为config中定义的cross_layer_interval
        self.cross_layer_interval = config.cross_layer_interval
        # 计算跨层注意力层的数量
        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
        # 创建包含num_cross_layers个IdeficsGatedCrossAttentionLayer实例的模块列表，并保存在gated_cross_attn_layers实例变量中
        self.gated_cross_attn_layers = nn.ModuleList(
            [IdeficsGatedCrossAttentionLayer(config) for _ in range(num_cross_layers)]
        )
        # 设置梯度检查点标志为False
        self.gradient_checkpointing = False

        # 创建IdeficsRMSNorm实例，并保存在norm实例变量中
        self.norm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # 初始化权重并进行最终处理
        self.post_init()

        # 冻结相关参数
        self.freeze_relevant_params(config)

    # 方法：冻结相关参数
    def freeze_relevant_params(self, config=None):
        if config is None:
            config = self.config

        # 如果配置中指定冻结文本层，则调用freeze_text_layers方法冻结相关模块
        if config.freeze_text_layers:
            self.freeze_text_layers(config.freeze_text_module_exceptions)

        # 如果配置中指定冻结视觉层，则调用freeze_vision_layers方法冻结视觉模型
        if config.freeze_vision_layers:
            freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)

    # 方法：冻结文本层
    def freeze_text_layers(self, module_exceptions=[]):
        # 遍历self.layers和self.norm列表中的模块，调用freeze_model函数冻结指定模块
        for module in [self.layers, self.norm]:
            freeze_model(module, module_exceptions=module_exceptions)

    # 方法：冻结视觉层
    def freeze_vision_layers(self, module_exceptions=[]):
        # 调用freeze_model函数冻结self.vision_model中指定的模块
        freeze_model(self.vision_model, module_exceptions=module_exceptions)

    # 方法：获取输入嵌入层
    def get_input_embeddings(self):
        return self.embed_tokens
    # 设置模型的输入嵌入表示
    def set_input_embeddings(self, value):
        self.embed_tokens = value

    # 在模型前向传播过程中添加注释到模型文档的装饰器
    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,  # 输入的token IDs，类型为LongTensor
        attention_mask: Optional[torch.Tensor] = None,  # 注意力遮罩，可选的Tensor类型
        position_ids: Optional[torch.LongTensor] = None,  # 位置IDs，可选的LongTensor类型
        past_key_values: Optional[List[torch.FloatTensor]] = None,  # 过去的键值对，可选的浮点数张量列表
        inputs_embeds: Optional[torch.FloatTensor] = None,  # 输入的嵌入表示，可选的浮点数张量
        pixel_values: Optional[torch.FloatTensor] = None,  # 像素值，可选的浮点数张量
        image_encoder_embeddings: Optional[torch.FloatTensor] = None,  # 图像编码器嵌入，可选的浮点数张量
        perceiver_embeddings: Optional[torch.FloatTensor] = None,  # 感知器嵌入，可选的浮点数张量
        image_attention_mask: Optional[torch.Tensor] = None,  # 图像注意力遮罩，可选的Tensor类型
        use_cache: Optional[bool] = None,  # 是否使用缓存，可选的布尔类型
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重，可选的布尔类型
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态，可选的布尔类型
        interpolate_pos_encoding: Optional[bool] = False,  # 是否插值位置编码，布尔类型，默认为False
        return_dict: Optional[bool] = None,  # 是否返回字典格式的输出，可选的布尔类型
    class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
        # 在加载时需要忽略的键列表，用于处理缺失情况
        _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
        # 要绑定权重的键列表，指定需要共享权重的模型参数
        _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"]

        def __init__(self, config, vision_model=None):
            # 调用父类的初始化方法，传入配置参数
            super().__init__(config)
            # 使用给定的配置参数初始化 IdeficsModel 模型
            self.model = IdeficsModel(config)

            # 使用 IdeficsDecoupledLinear 初始化 lm_head 层
            self.lm_head = IdeficsDecoupledLinear(
                in_features=config.hidden_size,
                out_features=config.vocab_size,
                out_additional_features=config.additional_vocab_size,
                bias=False,
                partially_freeze=config.freeze_lm_head,
            )

            # 执行初始化权重并进行最终处理
            self.post_init()

        def get_input_embeddings(self):
            # 返回模型的 embed_tokens 层，用于输入嵌入
            return self.model.embed_tokens

        def set_input_embeddings(self, value):
            # 设置模型的 embed_tokens 层，用于输入嵌入
            self.model.embed_tokens = value

        def get_output_embeddings(self):
            # 返回 lm_head 层，用于输出嵌入
            return self.lm_head

        def set_output_embeddings(self, new_embeddings):
            # 设置 lm_head 层，用于输出嵌入
            self.lm_head = new_embeddings

        def set_decoder(self, decoder):
            # 设置模型的 decoder 层
            self.model = decoder

        def get_decoder(self):
            # 返回模型的 decoder 层
            return self.model

        def tie_weights(self):
            """
            重写 `transformers.modeling_utils.PreTrainedModel.tie_weights` 方法，
            处理 IdeficsDecoupledLinear 和 IdeficsDecoupledEmbedding 的情况。
            """
            output_embeddings = self.get_output_embeddings()
            input_embeddings = self.get_input_embeddings()

            # 如果配置允许绑定词嵌入，则将输出嵌入的权重设置为输入嵌入的权重
            if getattr(self.config, "tie_word_embeddings", True):
                output_embeddings.weight = input_embeddings.weight
                # 如果存在额外的嵌入，则也绑定额外的嵌入权重
                if input_embeddings.num_additional_embeddings > 0:
                    assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
                    output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight

            # 更新输出嵌入的特征数和额外特征数，以匹配输入嵌入的数目
            if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
                output_embeddings.out_features = input_embeddings.num_embeddings
                if hasattr(output_embeddings, "out_additional_features") and hasattr(
                    input_embeddings, "num_additional_embeddings"
                ):
                    output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings

        @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
        @replace_return_docstrings(output_type=IdeficsCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    # 定义一个方法用于处理前向推断过程中的输入数据
    def forward(
        self,
        input_ids: torch.LongTensor = None,  # 输入的token ID序列，默认为None
        attention_mask: Optional[torch.Tensor] = None,  # 可选的注意力掩码张量，默认为None
        position_ids: Optional[torch.LongTensor] = None,  # 可选的位置ID张量，默认为None
        past_key_values: Optional[List[torch.FloatTensor]] = None,  # 可选的过去键值对列表，默认为None
        inputs_embeds: Optional[torch.FloatTensor] = None,  # 可选的嵌入输入张量，默认为None
        pixel_values: Optional[torch.FloatTensor] = None,  # 可选的像素值张量，默认为None
        image_encoder_embeddings: Optional[torch.FloatTensor] = None,  # 可选的图像编码器嵌入张量，默认为None
        perceiver_embeddings: Optional[torch.FloatTensor] = None,  # 可选的感知器嵌入张量，默认为None
        image_attention_mask: Optional[torch.Tensor] = None,  # 可选的图像注意力掩码张量，默认为None
        labels: Optional[torch.LongTensor] = None,  # 可选的标签张量，默认为None
        use_cache: Optional[bool] = None,  # 可选的缓存使用标志，默认为None
        output_attentions: Optional[bool] = None,  # 可选的输出注意力张量，默认为None
        output_hidden_states: Optional[bool] = None,  # 可选的输出隐藏状态标志，默认为None
        interpolate_pos_encoding: Optional[bool] = False,  # 可选的位置编码插值标志，默认为False
        return_dict: Optional[bool] = None,  # 可选的返回字典标志，默认为None
    ):
        # 定义一个方法用于准备生成过程中的输入数据
        def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
            # 从kwargs中获取image_hidden_states参数，如果存在的话
            image_hidden_states = kwargs.pop("image_hidden_states", None)
            if image_hidden_states is not None:
                # 如果配置中使用resampler，则将perceiver_embeddings设置为image_hidden_states，否则设置为None
                if self.config.use_resampler:
                    kwargs["perceiver_embeddings"] = image_hidden_states
                else:
                    kwargs["image_encoder_embeddings"] = image_hidden_states
                kwargs["pixel_values"] = None  # 将像素值设置为None
            # 调用准备生成输入数据的函数，传递input_ids、past以及其他未处理的kwargs参数
            inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
            unwanted_kwargs = ["token_type_ids"]  # 定义一个不需要的kwargs参数列表
            for kwarg in unwanted_kwargs:
                inputs.pop(kwarg, None)  # 从inputs中移除不需要的kwargs参数
            return inputs  # 返回处理后的inputs字典

        @staticmethod
        def _expand_inputs_for_generation(
            *args,
            **model_kwargs,
        ):
            # 调用扩展生成输入数据的函数，传递args和model_kwargs参数
            return expand_inputs_for_generation(*args, **model_kwargs)

        # 定义一个方法，用于生成过程中更新模型关键字参数
        def _update_model_kwargs_for_generation(
            self,
            outputs: ModelOutput,  # 输出模型的结果
            model_kwargs: Dict[str, Any],  # 模型关键字参数的字典
            is_encoder_decoder: bool = False,  # 是否是编码器-解码器结构，默认为False
            standardize_cache_format: bool = False,  # 是否标准化缓存格式，默认为False
        ) -> Dict[str, Any]:  # 返回更新后的模型关键字参数的字典
            # 调用父类的更新模型关键字参数函数，传递outputs、model_kwargs、is_encoder_decoder和standardize_cache_format参数
            model_kwargs = super()._update_model_kwargs_for_generation(
                outputs,
                model_kwargs,
                is_encoder_decoder,
                standardize_cache_format,
            )

            # 如果model_kwargs中包含'image_attention_mask'键
            if "image_attention_mask" in model_kwargs:
                image_attention_mask = model_kwargs["image_attention_mask"]
                # 取图像注意力掩码的最后一个mask并添加一个维度
                last_mask = image_attention_mask[:, -1, :].unsqueeze(1)
                model_kwargs["image_attention_mask"] = last_mask  # 更新模型关键字参数中的'image_attention_mask'为最后一个mask

            # 获取预计算的image_hidden_states并添加到模型关键字参数中
            model_kwargs["image_hidden_states"] = outputs.image_hidden_states
            return model_kwargs  # 返回更新后的模型关键字参数的字典

        @staticmethod
        def _reorder_cache(past, beam_idx):
            reordered_past = ()
            # 遍历每一层的过去状态，并按beam_idx重新排序
            for layer_past in past:
                reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
            return reordered_past  # 返回重新排序后的过去状态

Transformers-源码解析-五十八-

Transformers 源码解析（五十八）

.\models\hubert\modeling_tf_hubert.py

.\models\hubert\__init__.py

.\models\ibert\configuration_ibert.py

.\models\ibert\modeling_ibert.py

.\models\ibert\quant_modules.py

.\models\ibert\__init__.py

.\models\idefics\configuration_idefics.py

.\models\idefics\image_processing_idefics.py

.\models\idefics\modeling_idefics.py

`.\models\hubert\modeling_tf_hubert.py`

`.\models\hubert\init.py`

`.\models\ibert\configuration_ibert.py`

`.\models\ibert\modeling_ibert.py`

`.\models\ibert\quant_modules.py`

`.\models\ibert\init.py`

`.\models\idefics\configuration_idefics.py`

`.\models\idefics\image_processing_idefics.py`

`.\models\idefics\modeling_idefics.py`