Transformers 源码解析（三十三）

`.\models\deberta\modeling_tf_deberta.py`

# 定义 TFDebertaContextPooler 类，用于处理 DeBERTa 模型的上下文池化操作
class TFDebertaContextPooler(keras.layers.Layer):
    """Pool a batch of hidden states into one vector per example.

    The hidden state at position 0 is passed through dropout, a dense layer of
    width `config.pooler_hidden_size`, and the activation named by
    `config.pooler_hidden_act`.
    """

    def __init__(self, config: DebertaConfig, **kwargs):
        super().__init__(**kwargs)
        # Projection applied to the first-token representation.
        self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
        # Dropout applied before the projection (only active when training).
        self.dropout = TFDebertaStableDropout(config.pooler_dropout, name="dropout")
        # Kept for `call`, `build` and `output_dim`.
        self.config = config

    def call(self, hidden_states, training: bool = False):
        """Return the activated projection of `hidden_states[:, 0]`."""
        first_token = hidden_states[:, 0]
        first_token = self.dropout(first_token, training=training)
        projected = self.dense(first_token)
        activation = get_tf_activation(self.config.pooler_hidden_act)
        return activation(projected)

    @property
    def output_dim(self) -> int:
        """Value of `config.hidden_size`."""
        return self.config.hidden_size

    def build(self, input_shape=None):
        """Build the sub-layers once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        # (attribute name, shape handed to that sub-layer's `build`), in build order.
        build_plan = (
            ("dense", [None, None, self.config.pooler_hidden_size]),
            ("dropout", None),
        )
        for attr_name, build_shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)
class TFDebertaXSoftmax(keras.layers.Layer):
    """
    Masked softmax.

    Args:
        axis (int, defaults to -1): The axis handed to `stable_softmax`.
    """

    def __init__(self, axis=-1, **kwargs):
        super().__init__(**kwargs)
        self.axis = axis

    def call(self, inputs: tf.Tensor, mask: tf.Tensor):
        """
        Apply softmax to `inputs`, ignoring positions where `mask` casts to False.

        Args:
            inputs (`tf.Tensor`): The scores to normalise.
            mask (`tf.Tensor`): Cast to bool; positions that cast to False are excluded from the softmax and
                come back as 0.0.
        """
        # True exactly where an element has to be ignored.
        ignored = tf.logical_not(tf.cast(mask, tf.bool))
        # Ignored scores become -inf before the softmax.
        masked_scores = tf.where(ignored, float("-inf"), inputs)
        probabilities = stable_softmax(masked_scores, self.axis)
        # Force the ignored positions to an exact zero in the result.
        return tf.where(ignored, 0.0, probabilities)


class TFDebertaStableDropout(keras.layers.Layer):
    """
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    """

    def __init__(self, drop_prob, **kwargs):
        super().__init__(**kwargs)
        self.drop_prob = drop_prob

    @tf.custom_gradient
    def xdropout(self, inputs):
        """
        Zeroes randomly selected elements of `inputs` and scales the result by `1 / (1 - drop_prob)`.

        Returns the resulting tensor together with the gradient function required by `tf.custom_gradient`. When
        `drop_prob` is not greater than 0, both the forward value and the gradient pass through unchanged.
        """
        # Draw one Bernoulli sample per element with probability 1 - drop_prob. `1 - sample` cast to bool marks the
        # elements to drop (presumably samples are 0/1 integers, so the mask is True where the draw was 0).
        mask = tf.cast(
            1
            - tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)),
            tf.bool,
        )
        # Scaling factor shared by the forward and backward passes.
        scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32)
        if self.drop_prob > 0:
            # Zero the masked elements, then scale the whole tensor.
            inputs = tf.where(mask, 0.0, inputs) * scale

        def grad(upstream):
            if self.drop_prob > 0:
                # Apply the same mask and scale to the incoming gradient.
                return tf.where(mask, 0.0, upstream) * scale
            else:
                return upstream

        return inputs, grad

    def call(self, inputs: tf.Tensor, training: tf.Tensor = False):
        if training:
            # Training mode: apply the custom dropout.
            return self.xdropout(inputs)
        # Otherwise the input is returned untouched.
        return inputs


class TFDebertaLayerNorm(keras.layers.Layer):
    """Layer normalisation over the last axis, with epsilon added to the variance inside the square root."""

    def __init__(self, size, eps=1e-12, **kwargs):
        super().__init__(**kwargs)
        self.size = size
        self.eps = eps

    def build(self, input_shape):
        """Create the scale (`weight`, ones) and shift (`bias`, zeros) vectors of length `size`."""
        self.gamma = self.add_weight(shape=[self.size], initializer=tf.ones_initializer(), name="weight")
        self.beta = self.add_weight(shape=[self.size], initializer=tf.zeros_initializer(), name="bias")
        return super().build(input_shape)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Normalise `x` along its last axis, then scale by gamma and shift by beta."""
        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        centered = x - mean
        variance = tf.reduce_mean(tf.square(centered), axis=[-1], keepdims=True)
        std = tf.math.sqrt(variance + self.eps)
        # Multiply by gamma first, then divide, so the arithmetic order is gamma * centered / std + beta.
        scaled = self.gamma * centered / std
        return scaled + self.beta


class TFDebertaSelfOutput(keras.layers.Layer):
    """Dense projection, dropout, residual addition and layer normalisation."""

    def __init__(self, config: DebertaConfig, **kwargs):
        super().__init__(**kwargs)
        # Projection to `config.hidden_size` units.
        self.dense = keras.layers.Dense(config.hidden_size, name="dense")
        # Normalisation applied to the residual sum.
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # Dropout with probability `config.hidden_dropout_prob`.
        self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
        # Kept so `build` can read the hidden size.
        self.config = config

    def call(self, hidden_states, input_tensor, training: bool = False):
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dense(hidden_states)
        projected = self.dropout(projected, training=training)
        return self.LayerNorm(projected + input_tensor)

    def build(self, input_shape=None):
        """Build the sub-layers once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        # (attribute name, shape handed to that sub-layer's `build`), in build order.
        build_plan = (
            ("dense", [None, None, self.config.hidden_size]),
            ("LayerNorm", [None, None, self.config.hidden_size]),
            ("dropout", None),
        )
        for attr_name, build_shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)
# 定义 TFDebertaAttention 类，继承自 keras 的 Layer 类，实现自定义的注意力层
class TFDebertaAttention(keras.layers.Layer):
    """Disentangled self-attention followed by its output projection block."""

    def __init__(self, config: DebertaConfig, **kwargs):
        super().__init__(**kwargs)
        # Self-attention sub-layer.
        self.self = TFDebertaDisentangledSelfAttention(config, name="self")
        # Projection / residual / normalisation applied to the attention result.
        self.dense_output = TFDebertaSelfOutput(config, name="output")
        self.config = config

    def call(
        self,
        input_tensor: tf.Tensor,
        attention_mask: tf.Tensor,
        query_states: tf.Tensor = None,
        relative_pos: tf.Tensor = None,
        rel_embeddings: tf.Tensor = None,
        output_attentions: bool = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Run self-attention and the output block.

        Returns a tuple whose first element is the output block's result, followed by every element of the
        self-attention output after the first.
        """
        self_outputs = self.self(
            hidden_states=input_tensor,
            attention_mask=attention_mask,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
            output_attentions=output_attentions,
            training=training,
        )
        # The residual branch uses the query states when given, otherwise the layer input.
        residual = input_tensor if query_states is None else query_states
        attention_output = self.dense_output(
            hidden_states=self_outputs[0], input_tensor=residual, training=training
        )
        return (attention_output,) + self_outputs[1:]

    def build(self, input_shape=None):
        """Build both sub-layers once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        for attr_name in ("self", "dense_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)


# 定义 TFDebertaIntermediate 类，继承自 keras 的 Layer 类，实现中间层
class TFDebertaIntermediate(keras.layers.Layer):
    """Dense expansion to `config.intermediate_size` followed by the configured activation."""

    def __init__(self, config: DebertaConfig, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        # `hidden_act` is either an activation name to look up or the callable itself.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return `intermediate_act_fn(dense(hidden_states))`."""
        projected = self.dense(inputs=hidden_states)
        return self.intermediate_act_fn(projected)

    def build(self, input_shape=None):
        """Build the dense sub-layer once, inside its own name scope."""
        if self.built:
            return
        self.built = True
        dense_layer = getattr(self, "dense", None)
        if dense_layer is None:
            return
        with tf.name_scope(dense_layer.name):
            dense_layer.build([None, None, self.config.hidden_size])


class TFDebertaOutput(keras.layers.Layer):
    """Dense projection to `config.hidden_size`, dropout, residual addition and layer normalisation."""

    def __init__(self, config: DebertaConfig, **kwargs):
        super().__init__(**kwargs)
        # Projection to the hidden size, initialised from `config.initializer_range`.
        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        # Normalisation applied to the residual sum.
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # Dropout with probability `config.hidden_dropout_prob`.
        self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")
        # Kept so `build` can read the layer sizes.
        self.config = config

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dense(inputs=hidden_states)
        projected = self.dropout(projected, training=training)
        return self.LayerNorm(projected + input_tensor)

    def build(self, input_shape=None):
        """Build the sub-layers once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        # The dense layer is built for inputs of width `intermediate_size`; LayerNorm for width `hidden_size`.
        build_plan = (
            ("dense", [None, None, self.config.intermediate_size]),
            ("LayerNorm", [None, None, self.config.hidden_size]),
            ("dropout", None),
        )
        for attr_name, build_shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)
class TFDebertaLayer(keras.layers.Layer):
    """One encoder block: attention, intermediate layer and output layer."""

    def __init__(self, config: DebertaConfig, **kwargs):
        super().__init__(**kwargs)
        self.attention = TFDebertaAttention(config, name="attention")
        self.intermediate = TFDebertaIntermediate(config, name="intermediate")
        self.bert_output = TFDebertaOutput(config, name="output")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        query_states: tf.Tensor = None,
        relative_pos: tf.Tensor = None,
        rel_embeddings: tf.Tensor = None,
        output_attentions: bool = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Run the block.

        Returns a tuple whose first element is the block output, followed by every element of the attention
        sub-layer's output after the first.
        """
        attention_outputs = self.attention(
            input_tensor=hidden_states,
            attention_mask=attention_mask,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
            output_attentions=output_attentions,
            training=training,
        )
        attended = attention_outputs[0]
        expanded = self.intermediate(hidden_states=attended)
        # The attention result is also the residual input of the output layer.
        block_output = self.bert_output(hidden_states=expanded, input_tensor=attended, training=training)
        return (block_output,) + attention_outputs[1:]

    def build(self, input_shape=None):
        """Build the three sub-layers once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        for attr_name in ("attention", "intermediate", "bert_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)


class TFDebertaEncoder(keras.layers.Layer):
    """Stack of `config.num_hidden_layers` `TFDebertaLayer` blocks, optionally with relative position embeddings."""

    def __init__(self, config: DebertaConfig, **kwargs):
        super().__init__(**kwargs)

        self.layer = [TFDebertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
        self.relative_attention = getattr(config, "relative_attention", False)
        self.config = config

        if self.relative_attention:
            # A configured value below 1 (or a missing one) falls back to `max_position_embeddings`.
            configured_max = getattr(config, "max_relative_positions", -1)
            self.max_relative_positions = (
                configured_max if configured_max >= 1 else config.max_position_embeddings
            )

    def build(self, input_shape=None):
        """Create the relative embedding table (when enabled) and build every block once."""
        if self.built:
            return
        self.built = True
        if self.relative_attention:
            self.rel_embeddings = self.add_weight(
                name="rel_embeddings.weight",
                shape=[self.max_relative_positions * 2, self.config.hidden_size],
                initializer=get_initializer(self.config.initializer_range),
            )
        blocks = getattr(self, "layer", None)
        if blocks is not None:
            for block in blocks:
                with tf.name_scope(block.name):
                    block.build(None)

    def get_rel_embedding(self):
        """Return the relative embedding weight, or None when relative attention is disabled."""
        if self.relative_attention:
            return self.rel_embeddings
        return None

    def get_attention_mask(self, attention_mask):
        """
        Expand `attention_mask` according to its rank.

        Rank <= 2: insert axes 1 and 2, multiply by its own re-arranged copy to form a pairwise mask and cast to
        uint8. Rank 3: insert axis 1. Any other rank: returned unchanged.
        """
        mask_rank = len(shape_list(attention_mask))
        if mask_rank <= 2:
            extended = tf.expand_dims(tf.expand_dims(attention_mask, 1), 2)
            pairwise = extended * tf.expand_dims(tf.squeeze(extended, -2), -1)
            return tf.cast(pairwise, tf.uint8)
        if mask_rank == 3:
            return tf.expand_dims(attention_mask, 1)
        return attention_mask

    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
        """Return `relative_pos`, building it from the sequence lengths when relative attention needs it."""
        if not self.relative_attention or relative_pos is not None:
            return relative_pos
        # Query length comes from `query_states` when given, otherwise from `hidden_states`.
        length_source = hidden_states if query_states is None else query_states
        query_len = shape_list(length_source)[-2]
        return build_relative_position(query_len, shape_list(hidden_states)[-2])

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        query_states: tf.Tensor = None,
        relative_pos: tf.Tensor = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """
        Run every block in order.

        Returns a `TFBaseModelOutput` when `return_dict` is true; otherwise a tuple of the non-None values among
        the last hidden state, the collected hidden states and the collected attentions.
        """
        # Collectors stay None when the corresponding output was not requested.
        collected_states = () if output_hidden_states else None
        collected_attentions = () if output_attentions else None

        attention_mask = self.get_attention_mask(attention_mask)
        relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)

        # A sequence of hidden states feeds its first element to the first block.
        next_kv = hidden_states[0] if isinstance(hidden_states, Sequence) else hidden_states

        rel_embeddings = self.get_rel_embedding()
        num_blocks = len(self.layer)

        for idx, block in enumerate(self.layer):
            if output_hidden_states:
                collected_states = collected_states + (hidden_states,)

            block_outputs = block(
                hidden_states=next_kv,
                attention_mask=attention_mask,
                query_states=query_states,
                relative_pos=relative_pos,
                rel_embeddings=rel_embeddings,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = block_outputs[0]

            if query_states is None:
                # No separate query stream: the block output feeds the next block.
                next_kv = hidden_states
            else:
                query_states = hidden_states
                if isinstance(hidden_states, Sequence):
                    next_kv = hidden_states[idx + 1] if idx + 1 < num_blocks else None

            if output_attentions:
                collected_attentions = collected_attentions + (block_outputs[1],)

        # The output of the final block is appended as well.
        if output_hidden_states:
            collected_states = collected_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, collected_states, collected_attentions] if v is not None)

        return TFBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=collected_states, attentions=collected_attentions
        )
def build_relative_position(query_size, key_size):
    """
    Build relative positions between query and key positions.

    With query positions \\(P_q\\) in `range(query_size)` and key positions \\(P_k\\) in `range(key_size)`, the
    relative position from query to key is \\(R_{q \\rightarrow k} = P_q - P_k\\).

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `tf.Tensor`: An int64 tensor with shape [1, query_size, key_size]

    """
    query_positions = tf.range(query_size, dtype=tf.int32)
    key_positions = tf.range(key_size, dtype=tf.int32)
    # One row of key positions, repeated for every query position.
    key_grid = tf.tile(tf.reshape(key_positions, [1, -1]), [query_size, 1])
    offsets = query_positions[:, None] - key_grid
    # There are already `query_size` rows, so this slice leaves the shape unchanged.
    offsets = offsets[:query_size, :]
    # Add the leading axis of size 1 and widen to int64.
    return tf.cast(tf.expand_dims(offsets, axis=0), tf.int64)


def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
    """Broadcast `c2p_pos` to the first three dims of `query_layer` plus the last dim of `relative_pos`."""
    query_shape = shape_list(query_layer)
    # presumably query_layer is (batch, heads, query_len, head_size) - TODO confirm against caller
    target_shape = [
        query_shape[0],
        query_shape[1],
        query_shape[2],
        shape_list(relative_pos)[-1],
    ]
    return tf.broadcast_to(c2p_pos, target_shape)


def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
    """Broadcast `c2p_pos` to the first two dims of `query_layer` followed by `key_layer`'s dim -2, twice."""
    query_shape = shape_list(query_layer)
    key_len = shape_list(key_layer)[-2]
    target_shape = [query_shape[0], query_shape[1], key_len, key_len]
    return tf.broadcast_to(c2p_pos, target_shape)


def pos_dynamic_expand(pos_index, p2c_att, key_layer):
    """Broadcast `pos_index` to `p2c_att`'s first two dims, its own dim -2 and `key_layer`'s dim -2."""
    leading_dims = shape_list(p2c_att)[:2]
    trailing_dims = [shape_list(pos_index)[-2], shape_list(key_layer)[-2]]
    return tf.broadcast_to(pos_index, leading_dims + trailing_dims)


def torch_gather(x, indices, gather_axis):
    """
    Gather elements of `x` along `gather_axis` at the positions given by `indices`.

    The axes of `x` and `indices` are cyclically rolled so that `gather_axis` becomes the last axis, both tensors
    are flattened to 2-D, a per-row `tf.gather` is performed, and the result is reshaped and rolled back. The
    returned tensor has the shape of `indices`.

    NOTE(review): judging by the name this is meant to mirror `torch.gather` - confirm.
    """
    if gather_axis < 0:
        # Convert a negative axis into the equivalent non-negative one.
        gather_axis = tf.rank(x) + gather_axis

    if gather_axis != tf.rank(x) - 1:
        # Number of positions to roll the axes so that `gather_axis` ends up last.
        pre_roll = tf.rank(x) - 1 - gather_axis
        permutation = tf.roll(tf.range(tf.rank(x)), pre_roll, axis=0)
        # Apply the same axis permutation to both tensors.
        x = tf.transpose(x, perm=permutation)
        indices = tf.transpose(indices, perm=permutation)
    else:
        pre_roll = 0

    # Collapse every leading axis so each row is gathered independently.
    flat_x = tf.reshape(x, (-1, tf.shape(x)[-1]))
    flat_indices = tf.reshape(indices, (-1, tf.shape(indices)[-1]))
    # batch_dims=1: row i of flat_indices selects from row i of flat_x.
    gathered = tf.gather(flat_x, flat_indices, batch_dims=1)
    # Restore the (possibly permuted) shape of `indices`.
    gathered = tf.reshape(gathered, tf.shape(indices))

    if pre_roll != 0:
        # Undo the earlier roll so the axes return to their original order.
        permutation = tf.roll(tf.range(tf.rank(x)), -pre_roll, axis=0)
        gathered = tf.transpose(gathered, perm=permutation)

    return gathered


class TFDebertaDisentangledSelfAttention(keras.layers.Layer):
    """
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    """
    # 初始化函数，接受一个DebertaConfig对象和其他关键字参数
    def __init__(self, config: DebertaConfig, **kwargs):
        """
        Create the projection, softmax and dropout sub-layers described by `config`.

        Raises:
            ValueError: if `config.hidden_size` is not a multiple of `config.num_attention_heads`.
        """
        super().__init__(**kwargs)
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        def _bias_free_dense(units, layer_name):
            # Dense layer without bias; every call creates its own initializer instance.
            return keras.layers.Dense(
                units,
                kernel_initializer=get_initializer(config.initializer_range),
                name=layer_name,
                use_bias=False,
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # A single projection whose output is three times `all_head_size` wide.
        self.in_proj = _bias_free_dense(self.all_head_size * 3, "in_proj")
        # An unset position-attention type is treated as "none enabled".
        configured_types = config.pos_att_type
        self.pos_att_type = [] if configured_types is None else configured_types

        self.relative_attention = getattr(config, "relative_attention", False)
        self.talking_head = getattr(config, "talking_head", False)

        if self.talking_head:
            self.head_logits_proj = _bias_free_dense(self.num_attention_heads, "head_logits_proj")
            self.head_weights_proj = _bias_free_dense(self.num_attention_heads, "head_weights_proj")

        self.softmax = TFDebertaXSoftmax(axis=-1)

        if self.relative_attention:
            # A configured value below 1 (or a missing one) falls back to `max_position_embeddings`.
            configured_max = getattr(config, "max_relative_positions", -1)
            self.max_relative_positions = (
                configured_max if configured_max >= 1 else config.max_position_embeddings
            )
            self.pos_dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="pos_dropout")
            if "c2p" in self.pos_att_type:
                self.pos_proj = _bias_free_dense(self.all_head_size, "pos_proj")
            if "p2c" in self.pos_att_type:
                # Unlike the other projections, this one keeps the default bias.
                self.pos_q_proj = keras.layers.Dense(
                    self.all_head_size,
                    kernel_initializer=get_initializer(config.initializer_range),
                    name="pos_q_proj",
                )

        # Dropout with probability `config.attention_probs_dropout_prob`.
        self.dropout = TFDebertaStableDropout(config.attention_probs_dropout_prob, name="dropout")
        self.config = config
    # 定义神经网络层的构建方法，初始化权重等操作
    def build(self, input_shape=None):
        """Create the `q_bias` / `v_bias` weights and build every sub-layer that exists, once."""
        if self.built:
            return
        self.built = True

        # Zero-initialised bias vectors named for the query and value projections.
        self.q_bias = self.add_weight(
            name="q_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
        )
        self.v_bias = self.add_weight(
            name="v_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros()
        )

        hidden_size = self.config.hidden_size
        # (attribute name, shape handed to that sub-layer's `build`), in build order.
        # Optional sub-layers that were never created in `__init__` are skipped.
        build_plan = (
            ("in_proj", [None, None, hidden_size]),
            ("dropout", None),
            ("head_logits_proj", None),
            ("head_weights_proj", None),
            ("pos_dropout", None),
            ("pos_proj", [hidden_size]),
            ("pos_q_proj", [hidden_size]),
        )
        for attr_name, build_shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)

    # 将输入张量重塑为注意力得分所需的形状
    def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor:
        """Split the last axis into `num_attention_heads` heads and swap axes 1 and 2."""
        # Replace the last axis with (num_attention_heads, inferred per-head size).
        leading_dims = shape_list(tensor)[:-1]
        per_head = tf.reshape(tensor=tensor, shape=leading_dims + [self.num_attention_heads, -1])
        # For a rank-4 result this moves the head axis in front of the former axis 1.
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    # 神经网络层的调用方法，执行注意力计算等操作
    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        query_states: tf.Tensor = None,
        relative_pos: tf.Tensor = None,
        rel_embeddings: tf.Tensor = None,
        output_attentions: bool = False,
        training: bool = False,
    def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
        """
        Compute the position-dependent attention score terms enabled in `self.pos_att_type`.

        Adds a content-to-position term when "c2p" is enabled and a position-to-content term when "p2c" is enabled.
        Returns their sum, or the integer 0 when neither is enabled.

        NOTE(review): query_layer / key_layer are presumably (batch, heads, seq_len, head_size) - confirm with caller.

        Raises:
            ValueError: if `relative_pos` has a rank other than 2, 3 or 4.
        """
        # Without relative positions, derive them from the query and key lengths.
        if relative_pos is None:
            q = shape_list(query_layer)[-2]
            relative_pos = build_relative_position(q, shape_list(key_layer)[-2])

        shape_list_pos = shape_list(relative_pos)

        # Bring relative_pos to rank 4: rank 2 gets two leading axes, rank 3 gets a new axis 1.
        if len(shape_list_pos) == 2:
            relative_pos = tf.expand_dims(tf.expand_dims(relative_pos, 0), 0)
        elif len(shape_list_pos) == 3:
            relative_pos = tf.expand_dims(relative_pos, 1)
        # Any rank other than 2, 3 or 4 is rejected.
        elif len(shape_list_pos) != 4:
            raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {len(shape_list_pos)}")

        # Attention span: the longer of the query/key lengths, capped at max_relative_positions, as int64.
        att_span = tf.cast(
            tf.minimum(
                tf.maximum(shape_list(query_layer)[-2], shape_list(key_layer)[-2]), self.max_relative_positions
            ),
            tf.int64,
        )

        # Keep the 2 * att_span embedding rows centred on max_relative_positions and add a leading axis.
        rel_embeddings = tf.expand_dims(
            rel_embeddings[self.max_relative_positions - att_span : self.max_relative_positions + att_span, :], 0
        )

        score = 0

        # Content-to-position term.
        if "c2p" in self.pos_att_type:
            # Project the relative embeddings and split them into heads.
            pos_key_layer = self.pos_proj(rel_embeddings)
            pos_key_layer = self.transpose_for_scores(pos_key_layer)
            # Scores of every query against every kept relative embedding.
            c2p_att = tf.matmul(query_layer, tf.transpose(pos_key_layer, [0, 1, 3, 2]))
            # Shift relative positions by att_span and clip them into [0, 2 * att_span - 1] to use as indices.
            c2p_pos = tf.clip_by_value(relative_pos + att_span, 0, att_span * 2 - 1)
            # Pick, along the last axis, the score that belongs to each (query, key) relative position.
            c2p_att = torch_gather(c2p_att, c2p_dynamic_expand(c2p_pos, query_layer, relative_pos), -1)
            score += c2p_att

        # Position-to-content term.
        if "p2c" in self.pos_att_type:
            # Project the relative embeddings and split them into heads.
            pos_query_layer = self.pos_q_proj(rel_embeddings)
            pos_query_layer = self.transpose_for_scores(pos_query_layer)
            # Divide by sqrt(last dim * scale_factor).
            pos_query_layer /= tf.math.sqrt(tf.cast(shape_list(pos_query_layer)[-1] * scale_factor, dtype=tf.float32))
            # When query and key lengths differ, use key-to-key relative positions instead.
            if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]:
                r_pos = build_relative_position(shape_list(key_layer)[-2], shape_list(key_layer)[-2])
            else:
                r_pos = relative_pos
            # Negate, shift by att_span and clip into [0, 2 * att_span - 1] to use as indices.
            p2c_pos = tf.clip_by_value(-r_pos + att_span, 0, att_span * 2 - 1)
            p2c_att = tf.matmul(key_layer, tf.transpose(pos_query_layer, [0, 1, 3, 2]))
            # Gather along the last axis, then swap the last two axes.
            p2c_att = tf.transpose(
                torch_gather(p2c_att, p2c_dynamic_expand(p2c_pos, query_layer, key_layer), -1), [0, 1, 3, 2]
            )
            # When query and key lengths differ, gather once more along axis -2 using the first column of
            # relative_pos as the index.
            if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]:
                pos_index = tf.expand_dims(relative_pos[:, :, :, 0], -1)
                p2c_att = torch_gather(p2c_att, pos_dynamic_expand(pos_index, p2c_att, key_layer), -2)
            score += p2c_att

        # Sum of the enabled terms.
        return score
class TFDebertaEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings.

    Word embeddings are always used. Position embeddings are added only when
    `position_biased_input` is true (the default when the config does not define it), and
    token-type embeddings only when `config.type_vocab_size > 0`. When the embedding size differs
    from `config.hidden_size`, the summed embeddings go through a bias-free dense projection
    before layer normalization, masking and dropout.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        # `embedding_size` is optional on the config and falls back to `hidden_size`.
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
        self.hidden_size = config.hidden_size
        self.max_position_embeddings = config.max_position_embeddings
        self.position_biased_input = getattr(config, "position_biased_input", True)
        self.initializer_range = config.initializer_range
        if self.embedding_size != config.hidden_size:
            # The projection only exists when the embedding width differs from the model width.
            self.embed_proj = keras.layers.Dense(
                config.hidden_size,
                kernel_initializer=get_initializer(config.initializer_range),
                name="embed_proj",
                use_bias=False,
            )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout")

    def build(self, input_shape=None):
        """Create the embedding tables and build the sub-layers.

        Args:
            input_shape: Accepted for Keras compatibility; not used.
        """
        # Check the guard before creating any variable: otherwise a second call to `build` on an
        # already-built layer would register a fresh set of embedding tables.
        if self.built:
            return

        with tf.name_scope("word_embeddings"):
            # Word embedding table of shape [vocab_size, embedding_size].
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("token_type_embeddings"):
            if self.config.type_vocab_size > 0:
                # Token-type table of shape [type_vocab_size, embedding_size].
                self.token_type_embeddings = self.add_weight(
                    name="embeddings",
                    shape=[self.config.type_vocab_size, self.embedding_size],
                    initializer=get_initializer(self.initializer_range),
                )
            else:
                self.token_type_embeddings = None

        with tf.name_scope("position_embeddings"):
            if self.position_biased_input:
                # NOTE(review): this table is `hidden_size` wide but is added in `call` to
                # embeddings that are `embedding_size` wide; confirm the intended width for
                # configs where the two sizes differ.
                self.position_embeddings = self.add_weight(
                    name="embeddings",
                    shape=[self.max_position_embeddings, self.hidden_size],
                    initializer=get_initializer(self.initializer_range),
                )
            else:
                self.position_embeddings = None

        self.built = True
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)
        if getattr(self, "embed_proj", None) is not None:
            with tf.name_scope(self.embed_proj.name):
                self.embed_proj.build([None, None, self.embedding_size])

    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        token_type_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
        mask: tf.Tensor = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Args:
            input_ids: Token indices looked up in the word embedding table. When given, they
                take precedence over `inputs_embeds`.
            position_ids: Position indices; defaults to `0 .. seq_len - 1` with a leading axis of 1.
            token_type_ids: Token-type indices; defaults to all zeros.
            inputs_embeds: Pre-computed embeddings, used when `input_ids` is None.
            mask: Optional mask multiplied into the normalized embeddings. A mask whose rank
                differs from the embeddings is expanded on axis 2 (a rank-4 mask first has its
                two axis-1 dimensions squeezed out).
            training: Forwarded to the dropout layer.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.

        Raises:
            ValueError: If neither `input_ids` nor `inputs_embeds` is provided.
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("Need to provide either `input_ids` or `inputs_embeds`.")

        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        # Everything except the trailing embedding axis.
        input_shape = shape_list(inputs_embeds)[:-1]

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        final_embeddings = inputs_embeds

        if self.position_biased_input:
            position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
            final_embeddings += position_embeds

        if self.config.type_vocab_size > 0:
            token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
            final_embeddings += token_type_embeds

        if self.embedding_size != self.hidden_size:
            final_embeddings = self.embed_proj(final_embeddings)

        final_embeddings = self.LayerNorm(final_embeddings)

        if mask is not None:
            if len(shape_list(mask)) != len(shape_list(final_embeddings)):
                if len(shape_list(mask)) == 4:
                    mask = tf.squeeze(tf.squeeze(mask, axis=1), axis=1)
                mask = tf.expand_dims(mask, axis=2)
            # Cast to the embeddings' own dtype rather than a hard-coded float32 so the product
            # is valid under mixed precision and for integer masks that already have the right
            # rank. For float32 embeddings this is the same cast as before.
            mask = tf.cast(mask, final_embeddings.dtype)

            final_embeddings = final_embeddings * mask

        final_embeddings = self.dropout(final_embeddings, training=training)

        return final_embeddings
# Keras layer applying the dense -> activation -> LayerNorm transform that precedes the LM decoder.
class TFDebertaPredictionHeadTransform(keras.layers.Layer):
    """Project hidden states to the embedding size, apply the activation, then layer-normalize."""

    def __init__(self, config: DebertaConfig, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        # Falls back to `hidden_size` when the config has no `embedding_size`.
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)

        self.dense = keras.layers.Dense(
            units=self.embedding_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # `hidden_act` may be an activation name or an already-resolved callable.
        activation = config.hidden_act
        self.transform_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return `LayerNorm(act(dense(hidden_states)))`."""
        projected = self.dense(inputs=hidden_states)
        activated = self.transform_act_fn(projected)
        return self.LayerNorm(activated)

    def build(self, input_shape=None):
        """Build the dense and LayerNorm sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # The dense layer consumes `hidden_size` features; LayerNorm sees its `embedding_size` output.
        build_plan = (
            ("dense", [None, None, self.config.hidden_size]),
            ("LayerNorm", [None, None, self.embedding_size]),
        )
        for attr_name, shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(shape)

# Keras layer that turns hidden states into vocabulary logits for masked language modeling.
class TFDebertaLMPredictionHead(keras.layers.Layer):
    """LM prediction head: transform, multiply by the transposed input embedding matrix, add a bias."""

    def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        # Falls back to `hidden_size` when the config has no `embedding_size`.
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)

        self.transform = TFDebertaPredictionHeadTransform(config, name="transform")

        # The layer whose `weight` matrix is reused (transposed) as the output projection in `call`.
        self.input_embeddings = input_embeddings

    def build(self, input_shape=None):
        """Create the output bias and build the transform sub-layer.

        Args:
            input_shape: Accepted for Keras compatibility; not used.
        """
        # Check the guard before creating the bias: otherwise calling `build` again would replace
        # `self.bias` (including one installed through `set_bias`) with a fresh zero vector.
        if self.built:
            return

        # Zero-initialized, trainable bias of shape (vocab_size,).
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

        self.built = True

        if getattr(self, "transform", None) is not None:
            with tf.name_scope(self.transform.name):
                self.transform.build(None)

    def get_output_embeddings(self) -> keras.layers.Layer:
        """Return the embedding layer whose weights serve as the output projection."""
        return self.input_embeddings

    def set_output_embeddings(self, value: tf.Variable):
        """Replace the shared embedding matrix and record its first dimension as `vocab_size`."""
        self.input_embeddings.weight = value
        self.input_embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self) -> Dict[str, tf.Variable]:
        """Return the output bias in a dict under the key `"bias"`."""
        return {"bias": self.bias}

    def set_bias(self, value: tf.Variable):
        """Install `value["bias"]` as the output bias and update `config.vocab_size` to its length."""
        self.bias = value["bias"]
        self.config.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Compute vocabulary logits from hidden states.

        Args:
            hidden_states: Tensor whose axis 1 is read as the sequence length.

        Returns:
            Tensor reshaped to `[-1, seq_length, config.vocab_size]` with the bias added.
        """
        hidden_states = self.transform(hidden_states=hidden_states)

        seq_length = shape_list(hidden_states)[1]

        # Flatten to 2-D so one matmul against the transposed embedding matrix covers all positions.
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)

        # Restore the sequence axis, then add the per-token output bias.
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)

        return hidden_states
class TFDebertaOnlyMLMHead(keras.layers.Layer):
    """Wrapper layer that holds the LM prediction head under the sub-layer name `predictions`."""

    def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)
        self.predictions = TFDebertaLMPredictionHead(config, input_embeddings, name="predictions")

    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
        """Return the prediction scores produced by the wrapped head for `sequence_output`."""
        return self.predictions(hidden_states=sequence_output)

    def build(self, input_shape=None):
        """Build the wrapped prediction head once, inside its own name scope."""
        if self.built:
            return
        self.built = True

        head = getattr(self, "predictions", None)
        if head is None:
            return
        with tf.name_scope(head.name):
            head.build(None)


# @keras_serializable
class TFDebertaMainLayer(keras.layers.Layer):
    """Core DeBERTa stack: the embeddings layer followed by the encoder."""

    config_class = DebertaConfig

    def __init__(self, config: DebertaConfig, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.embeddings = TFDebertaEmbeddings(config, name="embeddings")
        self.encoder = TFDebertaEncoder(config, name="encoder")

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embeddings layer."""
        return self.embeddings

    def set_input_embeddings(self, value: tf.Variable):
        """Replace the word embedding matrix and record its first dimension as `vocab_size`."""
        embeddings = self.embeddings
        embeddings.weight = value
        embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        """
        Head pruning is not implemented for this layer; calling it always raises.

        `heads_to_prune` is a dict of {layer_num: list of heads to prune in this layer}.
        """
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
        has_ids = input_ids is not None
        has_embeds = inputs_embeds is not None
        if has_ids and has_embeds:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if not (has_ids or has_embeds):
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # For embeddings the trailing feature axis is dropped from the shape.
        input_shape = shape_list(input_ids) if has_ids else shape_list(inputs_embeds)[:-1]

        # Missing mask means every position is kept; missing token types default to zero.
        if attention_mask is None:
            attention_mask = tf.fill(dims=input_shape, value=1)
        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        embedded = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            mask=attention_mask,
            training=training,
        )

        encoded = self.encoder(
            hidden_states=embedded,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # The first encoder output is the sequence output.
        last_hidden_state = encoded[0]

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=last_hidden_state,
                hidden_states=encoded.hidden_states,
                attentions=encoded.attentions,
            )

        # Tuple form: sequence output followed by whatever else the encoder returned.
        return (last_hidden_state,) + encoded[1:]

    def build(self, input_shape=None):
        """Build the embeddings and encoder sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        for attr_name in ("embeddings", "encoder"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
# Shared class-level documentation passed to `add_start_docstrings` by the DeBERTa model classes.
DEBERTA_START_DOCSTRING = r"""
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared argument documentation for the models' `call` methods. `{0}` is filled in with the input
# shape through `str.format`, so the text must not contain any other curly braces.
DEBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` or `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in
            `[0, 1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
            `[0, config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    DEBERTA_START_DOCSTRING,
)
class TFDebertaModel(TFDebertaPreTrainedModel):
    # Bare DeBERTa model: a thin wrapper that forwards every input to the main layer and returns
    # its outputs unchanged. Documentation lives in `#` comments here because the decorators
    # above generate the class and `call` docstrings.
    def __init__(self, config: DebertaConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.deberta = TFDebertaMainLayer(config, name="deberta")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        # Pure delegation: the main layer validates the inputs and shapes the output.
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs

    def build(self, input_shape=None):
        # Build the main layer once, inside its own name scope; `input_shape` is not used.
        if self.built:
            return
        self.built = True
        if getattr(self, "deberta", None) is not None:
            with tf.name_scope(self.deberta.name):
                self.deberta.build(None)


@add_start_docstrings("DeBERTa Model with a `language modeling` head on top.", DEBERTA_START_DOCSTRING)
class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLoss):
    # DeBERTa main layer followed by a masked-language-modeling head. Documentation lives in `#`
    # comments here because the decorator above generates the class docstring.
    def __init__(self, config: DebertaConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        if config.is_decoder:
            logger.warning(
                "If you want to use `TFDebertaForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.deberta = TFDebertaMainLayer(config, name="deberta")
        # The head receives the model's own embeddings layer and is registered under the name "cls".
        self.mlm = TFDebertaOnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")

    def get_lm_head(self) -> keras.layers.Layer:
        # The prediction layer wrapped by the MLM head.
        return self.mlm.predictions

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]`. Tokens with indices set to `-100` are ignored (masked); the loss is only computed
            for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]
        prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)

        if not return_dict:
            # The main layer's tuple is (sequence_output, *encoder_extras) with no pooled output in
            # slot 1, so the extras start at index 1. Slicing from 2 silently dropped the hidden
            # states whenever `output_hidden_states` was requested in tuple mode.
            output = (prediction_scores,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # Build the main layer and the MLM head once, each inside its own name scope.
        if self.built:
            return
        self.built = True
        if getattr(self, "deberta", None) is not None:
            with tf.name_scope(self.deberta.name):
                self.deberta.build(None)
        if getattr(self, "mlm", None) is not None:
            with tf.name_scope(self.mlm.name):
                self.mlm.build(None)
# The decorator prepends the shared DeBERTa documentation to the class docstring.
@add_start_docstrings(
    """
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: DebertaConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels

        self.deberta = TFDebertaMainLayer(config, name="deberta")
        self.pooler = TFDebertaContextPooler(config, name="pooler")

        # Prefer the classifier-specific dropout rate; fall back to the hidden dropout rate
        # when the config does not define `cls_dropout` (or defines it as None).
        cls_dropout_rate = getattr(config, "cls_dropout", None)
        if cls_dropout_rate is None:
            cls_dropout_rate = self.config.hidden_dropout_prob
        self.dropout = TFDebertaStableDropout(cls_dropout_rate, name="cls_dropout")

        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        # Feature size used when building the classifier, as reported by the pooler.
        self.output_dim = self.pooler.output_dim

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Sequence output -> pooled representation -> dropout -> classifier logits.
        pooled = self.pooler(outputs[0], training=training)
        pooled = self.dropout(pooled, training=training)
        logits = self.classifier(pooled)

        # The loss is only computed when labels are supplied.
        loss = self.hf_compute_loss(labels=labels, logits=logits) if labels is not None else None

        if return_dict:
            return TFSequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: logits followed by the main layer's extra outputs, with the loss in front
        # when there is one.
        tuple_output = (logits,) + outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def build(self, input_shape=None):
        # Build every sub-layer once, each inside its own name scope; `input_shape` is not used.
        if self.built:
            return
        self.built = True

        build_plan = (
            ("deberta", None),
            ("pooler", None),
            ("dropout", None),
            ("classifier", [None, None, self.output_dim]),
        )
        for attr_name, shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(shape)
@add_start_docstrings(
    """
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config: DebertaConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # Number of labels, taken from the configuration.
        self.num_labels = config.num_labels

        # DeBERTa backbone.
        self.deberta = TFDebertaMainLayer(config, name="deberta")
        # Dropout applied to the backbone output before the classifier.
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # Dense head with `config.num_labels` output units.
        self.classifier = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # Run the backbone; every input is forwarded unchanged.
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first backbone output is the sequence of hidden states fed to the head.
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output, training=training)
        logits = self.classifier(inputs=sequence_output)
        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        # Tuple output: optional loss, then logits, then the remaining backbone outputs.
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the backbone and the classifier, each inside a name scope named after the sub-layer."""
        # Nothing to do when the model has already been built.
        if self.built:
            return
        self.built = True
        if getattr(self, "deberta", None) is not None:
            with tf.name_scope(self.deberta.name):
                self.deberta.build(None)
        # The classifier is built with `config.hidden_size` as the size of its last input axis.
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
"""
DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
# 基于DeBERTa模型，在其隐藏状态输出顶部添加一个用于提取性问答任务（如SQuAD）的跨度分类头部（通过线性层计算`span start logits`和`span end logits`）。

@add_start_docstrings(
    """
    添加文档字符串注释到模型的前向传播函数，描述输入的详细信息。
    """,
    DEBERTA_START_DOCSTRING,
)
# 装饰器：添加起始文档字符串到模型的前向传播函数，使用了预定义的DeBERTa起始文档字符串格式。

class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnsweringLoss):
    # TFDebertaForQuestionAnswering类，继承自TFDebertaPreTrainedModel和TFQuestionAnsweringLoss。

    def __init__(self, config: DebertaConfig, *inputs, **kwargs):
        # 初始化函数，接收DebertaConfig类型的配置参数config和其他输入。

        super().__init__(config, *inputs, **kwargs)
        # 调用父类（TFDebertaPreTrainedModel和TFQuestionAnsweringLoss）的初始化函数。

        self.num_labels = config.num_labels
        # 设置类属性num_labels为配置参数config中的标签数目。

        self.deberta = TFDebertaMainLayer(config, name="deberta")
        # 创建TFDebertaMainLayer实例self.deberta，使用配置参数config并命名为"deberta"。

        self.qa_outputs = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        # 创建Dense层self.qa_outputs，用于输出QA任务结果，设置单元数为config.num_labels，初始化器使用config中的范围初始化器。

        self.config = config
        # 设置类属性config为配置参数config。

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # 装饰器：添加起始文档字符串到模型的前向传播函数，描述输入的详细信息，并添加代码示例文档字符串。

    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        # 定义模型的前向传播函数call，接收多个输入参数和可选的控制参数。
    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
        r"""
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # 调用模型的前向传播函数，并获取输出
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从模型输出中提取序列输出
        sequence_output = outputs[0]
        # 使用序列输出计算问题回答的 logits
        logits = self.qa_outputs(inputs=sequence_output)
        # 将 logits 分割为起始位置和结束位置的预测
        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
        # 移除起始位置和结束位置 logits 的最后一个维度，使得维度降为 (batch_size,)
        start_logits = tf.squeeze(input=start_logits, axis=-1)
        end_logits = tf.squeeze(input=end_logits, axis=-1)
        # 初始化损失为 None
        loss = None

        # 如果提供了起始位置和结束位置的标签，则计算损失
        if start_positions is not None and end_positions is not None:
            # 构建标签字典
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            # 使用损失计算函数计算损失
            loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))

        # 如果不需要返回字典形式的输出，则按照元组形式返回结果
        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 如果需要返回字典形式的输出，则创建 TFQuestionAnsweringModelOutput 对象并返回
        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # 如果已经构建过网络结构，则直接返回
        if self.built:
            return
        # 标记模型已经构建
        self.built = True
        # 如果存在 self.deberta 属性，则构建 self.deberta 模型
        if getattr(self, "deberta", None) is not None:
            with tf.name_scope(self.deberta.name):
                self.deberta.build(None)
        # 如果存在 self.qa_outputs 属性，则构建 self.qa_outputs 层
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])

`.\models\deberta\tokenization_deberta.py`

# coding=utf-8
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for model DeBERTa."""

import json
import os
from typing import List, Optional, Tuple

import regex as re  # 导入 regex 库，用于支持正则表达式的操作

from ...tokenization_utils import AddedToken, PreTrainedTokenizer  # 导入基础的标记化工具和预训练的标记器
from ...utils import logging  # 导入日志记录工具

# Module-level logger, obtained through the `logging` helper imported from the package utilities.
logger = logging.get_logger(__name__)

# File names of the vocabulary and merges resources.
VOCAB_FILES_NAMES = dict(vocab_file="vocab.json", merges_file="merges.txt")

# Vocabulary file URL for each checkpoint.
_VOCAB_FILE_URLS = {
    "microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/vocab.json",
    "microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/vocab.json",
    "microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/vocab.json",
    "microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/vocab.json",
    "microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/vocab.json",
    "microsoft/deberta-xlarge-mnli": (
        "https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/vocab.json"
    ),
}

# Merges file URL for each checkpoint.
_MERGES_FILE_URLS = {
    "microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/merges.txt",
    "microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/merges.txt",
    "microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/merges.txt",
    "microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/merges.txt",
    "microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/merges.txt",
    "microsoft/deberta-xlarge-mnli": (
        "https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/merges.txt"
    ),
}

# Resource kind -> checkpoint name -> URL.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": _VOCAB_FILE_URLS,
    "merges_file": _MERGES_FILE_URLS,
}

# Every listed checkpoint uses the same positional-embedding size, so the table is derived
# from the checkpoint names of the vocabulary map (same keys, same order).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(_VOCAB_FILE_URLS, 512)

# Init-time overrides; only the two checkpoints named here have an entry, each with its own dict.
PRETRAINED_INIT_CONFIGURATION = {
    checkpoint: {"do_lower_case": False} for checkpoint in ("microsoft/deberta-base", "microsoft/deberta-large")
}

# Adapted from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode (restyled, same mapping)
def bytes_to_unicode():
    """
    Build the byte -> unicode character table used by the byte-level BPE.

    The byte values in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the same code point.
    Every other byte value maps, in increasing byte order, to the characters with code points 256, 257, ... so that
    no byte is represented by one of the characters outside those three ranges.

    Returns:
        `dict`: 256 entries mapping each byte value (`int`) to a one-character `str`. The identity-mapped bytes come
        first in insertion order, followed by the shifted ones.
    """
    identity_bytes = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    # Bytes outside the three ranges, in increasing order.
    shifted_bytes = [byte for byte in range(2**8) if byte not in identity_bytes]

    byte_values = identity_bytes + shifted_bytes
    # The k-th shifted byte is assigned code point 256 + k.
    code_points = identity_bytes + [2**8 + offset for offset in range(len(shifted_bytes))]
    return dict(zip(byte_values, map(chr, code_points)))
# Adapted from transformers.models.gpt2.tokenization_gpt2.get_pairs
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).

    Args:
        word: A sliceable sequence of symbols (for example a tuple of strings, or a string).

    Returns:
        `set`: Every `(symbol, next_symbol)` pair of adjacent symbols. A word with fewer than two symbols,
        including an empty word, has no adjacent pairs and yields an empty set instead of raising.
    """
    # Pair each symbol with its successor; `zip` stops at the shorter operand, so an empty word is handled.
    return set(zip(word, word[1:]))


class DebertaTokenizer(PreTrainedTokenizer):
    """
    Construct a DeBERTa tokenizer. Based on byte-level Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```
    >>> from transformers import DebertaTokenizer

    >>> tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
    >>> tokenizer("Hello world")["input_ids"]
    [1, 31414, 232, 2]

    >>> tokenizer(" Hello world")["input_ids"]
    [1, 20920, 232, 2]
    ```

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file (JSON mapping tokens to ids).
        merges_file (`str`):
            Path to the merges file (one BPE merge per line after a header line).
        errors (`str`, *optional*, defaults to `"replace"`):
            Error handling scheme used when decoding bytes to UTF-8 in `convert_tokens_to_string`.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, placed after each sequence by `build_inputs_with_special_tokens`.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token, placed first by `build_inputs_with_special_tokens`.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary is converted to the id of this token.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input before tokenization.
        add_bos_token (`bool`, *optional*, defaults to `False`):
            Stored on the instance and forwarded to the base class; this class does not otherwise read it.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        bos_token="[CLS]",
        eos_token="[SEP]",
        sep_token="[SEP]",
        cls_token="[CLS]",
        unk_token="[UNK]",
        pad_token="[PAD]",
        mask_token="[MASK]",
        add_prefix_space=False,
        add_bos_token=False,
        **kwargs,
    ):
        # Plain-string special tokens are wrapped into `AddedToken` objects flagged as special.
        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

        # Mask token behave like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
        self.add_bos_token = add_bos_token

        # token -> id table, and its inverse.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        # byte -> unicode character table, and its inverse.
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # `[1:-1]` skips the first line and the last element of the split (`save_vocabulary` writes a version
        # header first and ends every line with a newline, which leaves a trailing empty element).
        with open(merges_file, encoding="utf-8") as merges_handle:
            bpe_merges = merges_handle.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        # merge pair -> rank; a lower rank is merged first in `bpe`.
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        self.add_prefix_space = add_prefix_space

        # NOTE: the pattern is compiled WITHOUT re.IGNORECASE, so the contraction alternatives ('s, 't, ...)
        # only match their lowercase spelling.
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        # The tables above are populated before the base class initializer runs.
        super().__init__(
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            add_bos_token=add_bos_token,
            **kwargs,
        )

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size
    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (`self.encoder`)."""
        return len(self.encoder)

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
    def get_vocab(self):
        """Return the base vocabulary merged with the added tokens; added tokens win on key clashes."""
        return dict(self.encoder, **self.added_tokens_encoder)

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
    def bpe(self, token):
        """
        Apply the BPE merges to `token` and return the resulting symbols joined by single spaces.

        Results are memoised in `self.cache`, except for tokens without any symbol pair, which are returned as-is.
        """
        if token in self.cache:
            return self.cache[token]

        # Start from one symbol per character.
        word = tuple(token)
        pairs = get_pairs(word)

        # No adjacent pair means there is nothing to merge.
        if not pairs:
            return token

        while True:
            # Pick the pair with the lowest rank; unknown pairs rank as infinity.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            # When even the best pair is unknown, no further merge applies.
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            # Rebuild the word, fusing every occurrence of `first` immediately followed by `second`.
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur any more: copy the tail unchanged.
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            # A single remaining symbol cannot be merged further.
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)

        word = " ".join(word)
        self.cache[token] = word
        return word

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A DeBERTa sequence has the following format:

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            # The input already contains special tokens: delegate to the base class.
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Mirror the layout produced by `build_inputs_with_special_tokens`.
        if token_ids_1 is None:
            # [CLS] X [SEP]
            return [1] + ([0] * len(token_ids_0)) + [1]
        # [CLS] A [SEP] B [SEP]
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            # [CLS] X [SEP] -> all zeros.
            return len(cls + token_ids_0 + sep) * [0]
        # [CLS] A [SEP] -> zeros, B [SEP] -> ones.
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab; unknown tokens get the id of `unk_token`."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab; returns `None` for an unknown index."""
        return self.decoder.get(index)

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        # Map every character back to its byte, then decode the bytes with the configured error handling.
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary and the merges file into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory to write into.
            filename_prefix (`str`, *optional*):
                When given, `"<prefix>-"` is prepended to both file names.

        Returns:
            `Tuple[str]`: Paths of the vocabulary file and of the merges file. When `save_directory` is not a
            directory, an error is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            # Header line; `__init__` skips the first line when reading the file back.
            writer.write("#version: 0.2\n")
            # Merges are written in rank order; a gap in the ranks is reported but does not stop the export.
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """
        Prepend a space to `text` when requested, and return it with the remaining keyword arguments.

        A space is added when `is_split_into_words` is true or when `add_prefix_space` (popped from `kwargs`,
        defaulting to `self.add_prefix_space`) is true, provided `text` is non-empty and does not already start
        with whitespace.

        Returns:
            `tuple`: `(text, kwargs)`, where `kwargs` no longer contains `add_prefix_space`.
        """
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
            text = " " + text
        return (text, kwargs)

`.\models\deberta\tokenization_deberta_fast.py`

# coding=utf-8
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fast Tokenization class for model DeBERTa."""

import json
from typing import List, Optional, Tuple

from tokenizers import pre_tokenizers  # 导入 pre_tokenizers 模块

from ...tokenization_utils_base import AddedToken, BatchEncoding  # 导入 tokenization_utils_base 模块中的 AddedToken 和 BatchEncoding 类
from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 导入 tokenization_utils_fast 模块中的 PreTrainedTokenizerFast 类
from ...utils import logging  # 导入 utils 模块中的 logging 函数
from .tokenization_deberta import DebertaTokenizer  # 导入当前目录下的 tokenization_deberta 模块中的 DebertaTokenizer 类

logger = logging.get_logger(__name__)  # Logger for this module, obtained from the project's `logging` helper.

# File name used for each tokenizer resource.
VOCAB_FILES_NAMES = dict(vocab_file="vocab.json", merges_file="merges.txt", tokenizer_file="tokenizer.json")

# Download URL of each vocabulary resource, keyed first by resource kind and then by checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    resource: {
        f"microsoft/deberta-{variant}": (
            f"https://huggingface.co/microsoft/deberta-{variant}/resolve/main/{file_name}"
        )
        for variant in ("base", "large", "xlarge", "base-mnli", "large-mnli", "xlarge-mnli")
    }
    for resource, file_name in (("vocab_file", "vocab.json"), ("merges_file", "merges.txt"))
}

# Every listed checkpoint is registered with the same size value (512).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    f"microsoft/deberta-{variant}": 512
    for variant in ("base", "large", "xlarge", "base-mnli", "large-mnli", "xlarge-mnli")
}

# Per-checkpoint initialization defaults: both listed checkpoints set `do_lower_case` to False.
PRETRAINED_INIT_CONFIGURATION = {
    "microsoft/deberta-base": {"do_lower_case": False},
    "microsoft/deberta-large": {"do_lower_case": False},
}



class DebertaTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" DeBERTa tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```python
    >>> from transformers import DebertaTokenizerFast

    >>> tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
    >>> tokenizer("Hello world")["input_ids"]
    [1, 31414, 232, 2]

    >>> tokenizer(" Hello world")["input_ids"]
    [1, 20920, 232, 2]
    ```

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
    the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """

    # Class-level settings, bound to the module-level constants defined above.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Names of the inputs this tokenizer declares for the model.
    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
    # The slow tokenizer class paired with this fast one.
    slow_tokenizer_class = DebertaTokenizer

    def __init__(
        self,
        vocab_file=None,  # path to the vocabulary file (optional)
        merges_file=None,  # path to the merges file (optional)
        tokenizer_file=None,  # path to the serialized tokenizer file (optional)
        errors="replace",  # forwarded unchanged to the base class
        bos_token="[CLS]",
        eos_token="[SEP]",
        sep_token="[SEP]",
        cls_token="[CLS]",
        unk_token="[UNK]",
        pad_token="[PAD]",
        mask_token="[MASK]",
        add_prefix_space=False,  # whether a leading space should be added to the input
        **kwargs,
    ):
        """
        Initialize the tokenizer through the base class, then make sure the backend pre-tokenizer agrees with the
        requested `add_prefix_space` setting.

        All named arguments are forwarded to `PreTrainedTokenizerFast.__init__`; `**kwargs` is forwarded as well.
        """
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
        # NOTE: `kwargs` was already forwarded to the base class above; this pop only reads the value locally.
        self.add_bos_token = kwargs.pop("add_bos_token", False)

        # Read the backend pre-tokenizer's serialized state as a dict.
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        # Rebuild the pre-tokenizer only when its stored `add_prefix_space` differs from the requested one.
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            # The "type" entry names the pre-tokenizer class; the remaining entries are its constructor arguments.
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        self.add_prefix_space = add_prefix_space

    @property
    def mask_token(self) -> str:
        """
        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
        having been set.

        Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
        comprise the space before the *[MASK]*.
        """
        # Unset token: log an error (only when verbose) and return None.
        if self._mask_token is None:
            if self.verbose:
                logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @mask_token.setter
    def mask_token(self, value):
        """
        Overriding the default behavior of the mask token to have it eat the space before it.
        """
        # Plain strings are wrapped in an AddedToken with lstrip=True; any other value is stored as given.
        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
        self._mask_token = value

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A DeBERTa sequence has the following format:

        - single sequence: [CLS] X [SEP]
        - pair of sequences: [CLS] A [SEP] B [SEP]

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # If only one sequence is provided, return it with [CLS] and [SEP] tokens
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        # For sequence pairs, concatenate them with [CLS], [SEP] tokens in between
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # If only one sequence is provided, return token type IDs for the first sequence only
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # For sequence pairs, return token type IDs with 0s for the first sequence and 1s for the second sequence
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._batch_encode_plus
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """
        Check the pretokenized-input precondition, then delegate batch encoding to the base class.

        Args:
            *args:
                Positional arguments, forwarded unchanged.
            **kwargs:
                Keyword arguments, forwarded unchanged; `is_split_into_words` is inspected here.

        Returns:
            `BatchEncoding`: Whatever the base class `_batch_encode_plus` returns.

        Raises:
            AssertionError: If `is_split_into_words` is true while `add_prefix_space` is false.
        """
        is_split_into_words = kwargs.get("is_split_into_words", False)

        # Pretokenized inputs are only accepted when the tokenizer was created with add_prefix_space=True.
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        return super()._batch_encode_plus(*args, **kwargs)

    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """
        Check the pretokenized-input precondition, then delegate single-input encoding to the base class.

        Returns:
            `BatchEncoding`: Whatever the base class `_encode_plus` returns.

        Raises:
            AssertionError: If `is_split_into_words` is true while `add_prefix_space` is false.
        """
        is_split_into_words = kwargs.get("is_split_into_words", False)

        # Same precondition as in `_batch_encode_plus`.
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        return super()._encode_plus(*args, **kwargs)

    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the backend tokenizer model into `save_directory`.

        Args:
            save_directory (`str`): Target directory, passed to the backend model's `save`.
            filename_prefix (`str`, *optional*): Passed to the backend model's `save` as `name`.

        Returns:
            `Tuple[str]`: The values returned by the backend `save` call, converted to a tuple.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

`.\models\deberta\__init__.py`

# 版权声明和许可信息
#
# 版权所有 (c) 2020 HuggingFace 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本（“许可证”）授权；
# 除非符合许可证的规定，否则您不得使用此文件。
# 您可以获取许可证的副本，请参阅
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按“原样”分发，
# 没有任何明示或暗示的保证或条件。
# 有关许可证的详细信息，请参阅许可证。
#

# 导入类型检查工具
from typing import TYPE_CHECKING

# 从 utils 模块中导入所需的工具和异常
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Lazy-import table: submodule name -> list of public names it provides.
# The configuration and slow-tokenizer entries are registered unconditionally.
_import_structure = {
    "configuration_deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaOnnxConfig"],
    "tokenization_deberta": ["DebertaTokenizer"],
}

# Register the fast tokenizer only when is_tokenizers_available() is true. The raise/except pair turns
# "dependency missing" into a no-op, so the `else` branch runs only when the dependency is present.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_deberta_fast"] = ["DebertaTokenizerFast"]

# Register the PyTorch model names only when is_torch_available() is true.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_deberta"] = [
        "DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
        "DebertaForMaskedLM",
        "DebertaForQuestionAnswering",
        "DebertaForSequenceClassification",
        "DebertaForTokenClassification",
        "DebertaModel",
        "DebertaPreTrainedModel",
    ]

# Register the TensorFlow model names only when is_tf_available() is true.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_deberta"] = [
        "TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFDebertaForMaskedLM",
        "TFDebertaForQuestionAnswering",
        "TFDebertaForSequenceClassification",
        "TFDebertaForTokenClassification",
        "TFDebertaModel",
        "TFDebertaPreTrainedModel",
    ]

# Under static type checking, import the real names so checkers can resolve them. At runtime, replace this
# module in `sys.modules` with a `_LazyModule` built from `_import_structure`.
if TYPE_CHECKING:
    from .configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaOnnxConfig
    from .tokenization_deberta import DebertaTokenizer

    # Fast tokenizer: imported only when is_tokenizers_available() is true.
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_deberta_fast import DebertaTokenizerFast

    # PyTorch models: imported only when is_torch_available() is true.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_deberta import (
            DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
            DebertaForMaskedLM,
            DebertaForQuestionAnswering,
            DebertaForSequenceClassification,
            DebertaForTokenClassification,
            DebertaModel,
            DebertaPreTrainedModel,
        )

    # TensorFlow models: imported only when is_tf_available() is true.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_deberta import (
            TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFDebertaForMaskedLM,
            TFDebertaForQuestionAnswering,
            TFDebertaForSequenceClassification,
            TFDebertaForTokenClassification,
            TFDebertaModel,
            TFDebertaPreTrainedModel,
        )

else:
    import sys

    # Swap this module for a lazy proxy built from the import table above.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\deberta_v2\configuration_deberta_v2.py`

# coding=utf-8
# Copyright 2020, Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
DeBERTa-v2 model configuration
"""
from collections import OrderedDict  # 导入有序字典类
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union  # 导入类型检查、类型声明相关模块

from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...onnx import OnnxConfig  # 导入ONNX配置类
from ...utils import logging  # 导入日志工具


if TYPE_CHECKING:
    from ... import FeatureExtractionMixin, PreTrainedTokenizerBase, TensorType  # 如果是类型检查模式，则导入特征提取、预训练分词器基类和张量类型

logger = logging.get_logger(__name__)  # Logger for this module, obtained from the project's `logging` helper.

# Maps each pretrained DeBERTa-v2 checkpoint name to the URL of its config.json.
DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/config.json"
    for checkpoint in (
        "microsoft/deberta-v2-xlarge",
        "microsoft/deberta-v2-xxlarge",
        "microsoft/deberta-v2-xlarge-mnli",
        "microsoft/deberta-v2-xxlarge-mnli",
    )
}


class DebertaV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
    DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the DeBERTa
    [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import DebertaV2Config, DebertaV2Model

    >>> # Initializing a DeBERTa-v2 microsoft/deberta-v2-xlarge style configuration
    >>> configuration = DebertaV2Config()

    >>> # Initializing a model (with random weights) from the microsoft/deberta-v2-xlarge style configuration
    >>> model = DebertaV2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "deberta-v2"

    def __init__(
        self,
        vocab_size=128100,
        hidden_size=1536,
        num_hidden_layers=24,
        num_attention_heads=24,
        intermediate_size=6144,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=0,
        initializer_range=0.02,
        layer_norm_eps=1e-7,
        relative_attention=False,
        max_relative_positions=-1,
        pad_token_id=0,
        position_biased_input=True,
        pos_att_type=None,
        pooler_dropout=0,
        pooler_hidden_act="gelu",
        **kwargs,
    ):
        """
        Store every named argument on the instance under the same attribute name.

        Two values get extra handling:

        - `pos_att_type` may be given as a `"|"`-separated string; it is lower-cased, split on `"|"` and each piece
          is stripped, producing a list. Any non-string value is stored unchanged.
        - `pooler_hidden_size` is read from `kwargs` and falls back to `hidden_size` when absent.

        `**kwargs` is forwarded to `PretrainedConfig.__init__` before any attribute is set.
        """
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.relative_attention = relative_attention
        self.max_relative_positions = max_relative_positions
        self.pad_token_id = pad_token_id
        self.position_biased_input = position_biased_input

        # Backwards compatibility: accept the "a|b" string form and normalize it to a list.
        if isinstance(pos_att_type, str):
            pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")]

        self.pos_att_type = pos_att_type
        self.vocab_size = vocab_size
        self.layer_norm_eps = layer_norm_eps

        # Pooler settings; the pooler width defaults to the hidden size unless given via kwargs.
        self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size)
        self.pooler_dropout = pooler_dropout
        self.pooler_hidden_act = pooler_hidden_act
class DebertaV2OnnxConfig(OnnxConfig):
    """ONNX export settings for DeBERTa-v2: input axis names, default opset, and dummy inputs."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Map each exported input name to its dynamic-axis description.

        The "multiple-choice" task uses the axes batch/choice/sequence; every other task uses batch/sequence.
        `token_type_ids` is included only when the config's `type_vocab_size` is greater than 0.
        """
        if self.task == "multiple-choice":
            axes = {0: "batch", 1: "choice", 2: "sequence"}
        else:
            axes = {0: "batch", 1: "sequence"}

        input_names = ["input_ids", "attention_mask"]
        if self._config.type_vocab_size > 0:
            input_names.append("token_type_ids")
        # Every input shares the same axis mapping object.
        return OrderedDict([(input_name, axes) for input_name in input_names])

    @property
    def default_onnx_opset(self) -> int:
        """Return the default opset number, 12."""
        return 12

    def generate_dummy_inputs(
        self,
        preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
        batch_size: int = -1,
        seq_length: int = -1,
        num_choices: int = -1,
        is_pair: bool = False,
        framework: Optional["TensorType"] = None,
        num_channels: int = 3,
        image_width: int = 40,
        image_height: int = 40,
        tokenizer: "PreTrainedTokenizerBase" = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy inputs through the base class and drop `token_type_ids` when the model has no token types.

        Only `preprocessor` and `framework` are forwarded to the base implementation; the remaining parameters
        are accepted but not passed on.
        """
        generated = super().generate_dummy_inputs(preprocessor=preprocessor, framework=framework)

        has_no_token_types = self._config.type_vocab_size == 0
        if has_no_token_types and "token_type_ids" in generated:
            del generated["token_type_ids"]

        return generated

`.\models\deberta_v2\modeling_deberta_v2.py`

# coding=utf-8
# Copyright 2020 Microsoft and the Hugging Face Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DeBERTa-v2 model."""

from collections.abc import Sequence
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import softmax_backward_data
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_deberta_v2 import DebertaV2Config


logger = logging.get_logger(__name__)  # Logger for this module, obtained from the project's `logging` helper.

# Values used by the documentation helpers imported at the top of this file.
_CONFIG_FOR_DOC, _CHECKPOINT_FOR_DOC = "DebertaV2Config", "microsoft/deberta-v2-xlarge"
_QA_TARGET_START_INDEX, _QA_TARGET_END_INDEX = 2, 9

# Names of the pretrained DeBERTa-v2 checkpoints: the plain variants first, then the "-mnli" ones.
DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    f"microsoft/deberta-v2-{size}{suffix}" for suffix in ("", "-mnli") for size in ("xlarge", "xxlarge")
]


# Copied from transformers.models.deberta.modeling_deberta.ContextPooler
class ContextPooler(nn.Module):
    """Pool a sequence by taking its first position, then applying dropout, a linear layer and an activation."""

    def __init__(self, config):
        super().__init__()
        pooler_width = config.pooler_hidden_size
        # Square projection: input and output widths are both `pooler_hidden_size`.
        self.dense = nn.Linear(pooler_width, pooler_width)
        self.dropout = StableDropout(config.pooler_dropout)
        self.config = config

    def forward(self, hidden_states):
        """Return the pooled representation computed from index 0 along the second axis of `hidden_states`."""
        first_token = self.dropout(hidden_states[:, 0])
        projected = self.dense(first_token)
        # The activation is looked up by name from the config.
        return ACT2FN[self.config.pooler_hidden_act](projected)

    @property
    def output_dim(self):
        """Return the config's `hidden_size`."""
        return self.config.hidden_size


# Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2
class XSoftmax(torch.autograd.Function):
    """
    Masked Softmax which is optimized for saving memory

    Args:
        input (`torch.tensor`): The input tensor that will apply softmax.
        mask (`torch.IntTensor`):
            The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
        dim (int): The dimension along which softmax will be applied.

    Example:

    ```python
    >>> import torch
    >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax

    >>> # Make a tensor
    >>> x = torch.randn([4, 20, 100])

    >>> # Create a mask
    >>> mask = (x > 0).int()

    >>> # Specify the dimension to apply softmax
    >>> dim = -1

    >>> y = XSoftmax.apply(x, mask, dim)
    ```"""

    @staticmethod
    def forward(self, input, mask, dim):
        """
        Compute the masked softmax. Here `self` is the autograd context object, not an `XSoftmax` instance.

        Positions where `mask` is 0 are filled with the dtype's minimum before the softmax and set to 0 after it.
        """
        # Remember the dimension for the backward pass
        self.dim = dim
        # Invert the mask: True marks the positions to ignore
        rmask = ~(mask.to(torch.bool))

        # Replace ignored elements with the minimum value of the input tensor's dtype
        output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min))
        # Apply softmax along the specified dimension
        output = torch.softmax(output, self.dim)
        # Zero out the softmax values corresponding to ignored elements
        output.masked_fill_(rmask, 0)
        # Save the output tensor for backward computation
        self.save_for_backward(output)
        return output

    @staticmethod
    def backward(self, grad_output):
        """Return the gradient for `input`; `mask` and `dim` receive no gradient (None)."""
        # Retrieve the saved output tensor
        (output,) = self.saved_tensors
        # Delegate the softmax gradient to the shared helper
        inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output)
        return inputGrad, None, None

    @staticmethod
    def symbolic(g, self, mask, dim):
        """Build the ONNX graph for the masked softmax. Here `self` is the input value of the op."""
        import torch.onnx.symbolic_helper as sym_help
        from torch.onnx.symbolic_opset9 import masked_fill, softmax

        # Cast mask to long and create the reverse mask (1 - mask, as bool)
        mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"])
        r_mask = g.op(
            "Cast",
            g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value),
            to_i=sym_help.cast_pytorch_to_onnx["Bool"],
        )
        # Fill ignored elements with the minimum value of the tensor's dtype
        output = masked_fill(
            g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min))
        )
        # Apply softmax along the specified dimension
        output = softmax(g, output, dim)
        # Fill ignored elements of the softmax output with zero
        return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool)))
# Copied from transformers.models.deberta.modeling_deberta.DropoutContext
# 定义了一个名为 DropoutContext 的类，用于管理 Dropout 相关的上下文信息
class DropoutContext:
    """Mutable holder for dropout settings: probability, cached mask, scale factor and mask-reuse flag."""

    def __init__(self):
        # Start with dropout disabled (0) and no cached mask.
        self.dropout, self.mask = 0, None
        # Scale factor of 1, with mask reuse turned on.
        self.scale, self.reuse_mask = 1, True


# Copied from transformers.models.deberta.modeling_deberta.get_mask
# 定义了一个名为 get_mask 的函数，用于根据不同的上下文获取 dropout mask
def get_mask(input, local_context):
    """
    Resolve the dropout mask and dropout value for `input`.

    Args:
        input: Tensor whose shape and dtype a freshly sampled mask follows.
        local_context: Either a `DropoutContext` or a bare dropout value.

    Returns:
        A `(mask, dropout)` tuple. `mask` stays None when dropout is not greater than 0 and no cached mask is reused.
    """
    has_context = isinstance(local_context, DropoutContext)
    if has_context:
        # Context form: the effective dropout is the stored dropout multiplied by the stored scale.
        dropout = local_context.dropout
        dropout *= local_context.scale
        mask = local_context.mask if local_context.reuse_mask else None
    else:
        # Bare form: the argument is itself the dropout value and there is no cached mask.
        dropout, mask = local_context, None

    if dropout > 0 and mask is None:
        # Sample with probability (1 - dropout), then invert so True marks the dropped positions.
        kept = torch.empty_like(input).bernoulli_(1 - dropout)
        mask = (1 - kept).to(torch.bool)

    # Store the mask on the context only when the context does not hold one yet.
    if has_context and local_context.mask is None:
        local_context.mask = mask

    return mask, dropout


# Copied from transformers.models.deberta.modeling_deberta.XDropout
# 定义了一个名为 XDropout 的自定义 PyTorch 函数，优化了 dropout 操作以节省计算和内存
class XDropout(torch.autograd.Function):
    """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""

    @staticmethod
    def forward(ctx, input, local_ctx):
        """Apply dropout to `input`, using the mask and dropout value that `get_mask` yields for `local_ctx`."""
        mask, dropout = get_mask(input, local_ctx)
        # The rescaling factor is stored on the context so backward can reuse it.
        ctx.scale = 1.0 / (1 - dropout)
        if not dropout > 0:
            # No dropout requested: hand the input back unchanged.
            return input
        ctx.save_for_backward(mask)
        return input.masked_fill(mask, 0) * ctx.scale

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient for `input`; `local_ctx` receives no gradient (None)."""
        if not ctx.scale > 1:
            # A scale that is not above 1 means forward applied no dropout.
            return grad_output, None
        (mask,) = ctx.saved_tensors
        # Zero the gradient at masked positions and apply the same scale as in forward.
        return grad_output.masked_fill(mask, 0) * ctx.scale, None

    @staticmethod
    def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
        """Emit the ONNX dropout op, always through the opset-12 symbolic."""
        from torch.onnx import symbolic_opset12

        # A DropoutContext carries the probability in `.dropout`; otherwise the argument is the probability.
        dropout_p = local_ctx.dropout if isinstance(local_ctx, DropoutContext) else local_ctx
        # StableDropout only calls this function when training.
        train = True
        return symbolic_opset12.dropout(g, input, dropout_p, train)


# Copied from transformers.models.deberta.modeling_deberta.StableDropout
# 定义了一个名为 StableDropout 的 PyTorch 模块，用于稳定化训练时的 dropout 操作
class StableDropout(nn.Module):
    """
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    """

    def __init__(self, drop_prob):
        super().__init__()
        self.drop_prob = drop_prob
        # Index of the next context to hand out from `context_stack`.
        self.count = 0
        # None means "no context reuse": get_context() then returns the bare probability.
        self.context_stack = None

    def forward(self, x):
        """
        Call the module

        Args:
            x (`torch.tensor`): The input tensor to apply dropout
        """
        dropout_active = self.training and self.drop_prob > 0
        if not dropout_active:
            # Evaluation mode or zero probability: identity.
            return x
        return XDropout.apply(x, self.get_context())

    def clear_context(self):
        """
        Reset the counter to zero and discard the context stack.
        """
        self.count = 0
        self.context_stack = None

    def init_context(self, reuse_mask=True, scale=1):
        """
        Ensure a context stack exists, rewind the counter and update every stored context.

        Args:
            reuse_mask (bool, optional): Value written to each context's `reuse_mask`. Defaults to True.
            scale (int, optional): Value written to each context's `scale`. Defaults to 1.
        """
        if self.context_stack is None:
            self.context_stack = []
        self.count = 0
        for stored_ctx in self.context_stack:
            stored_ctx.reuse_mask = reuse_mask
            stored_ctx.scale = scale

    def get_context(self):
        """
        Return the dropout context for the current call.

        Returns:
            The next `DropoutContext` from the stack (created on demand) with its `dropout` set to
            `drop_prob`, or the bare `drop_prob` value when no context stack has been initialised.
        """
        stack = self.context_stack
        if stack is None:
            return self.drop_prob

        # Grow the stack lazily when more contexts are requested than exist so far.
        if self.count >= len(stack):
            stack.append(DropoutContext())
        current = stack[self.count]
        current.dropout = self.drop_prob
        self.count += 1
        return current
# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm
class DebertaV2SelfOutput(nn.Module):
    """Output block of the attention sub-layer: dense projection, dropout, residual add and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        # Square projection: hidden_size -> hidden_size.
        self.dense = nn.Linear(width, width)
        self.LayerNorm = LayerNorm(width, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """
        Args:
            hidden_states: tensor fed through the dense layer and dropout.
            input_tensor: residual tensor added before normalization.

        Returns:
            `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`.
        """
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2
class DebertaV2Attention(nn.Module):
    """Attention sub-layer: disentangled self-attention followed by the `DebertaV2SelfOutput` block."""

    def __init__(self, config):
        super().__init__()
        self.self = DisentangledSelfAttention(config)
        self.output = DebertaV2SelfOutput(config)
        self.config = config

    def forward(
        self,
        hidden_states,
        attention_mask,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
    ):
        """
        Run self-attention and the output block.

        Returns:
            `attention_output`, or `(attention_output, att_matrix)` when `output_attentions` is truthy.
        """
        attn_result = self.self(
            hidden_states,
            attention_mask,
            output_attentions,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
        )
        # With output_attentions the attention module returns a (context, attention matrix) pair.
        if output_attentions:
            attn_result, att_matrix = attn_result

        # The residual branch uses the query states when given, otherwise the hidden states.
        residual = hidden_states if query_states is None else query_states
        attention_output = self.output(attn_result, residual)

        if not output_attentions:
            return attention_output
        return (attention_output, att_matrix)


# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2
class DebertaV2Intermediate(nn.Module):
    """Feed-forward expansion: dense projection from hidden_size to intermediate_size plus activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # `hidden_act` is either a key into ACT2FN or something used directly as the activation.
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `intermediate_act_fn(dense(hidden_states))`."""
        return self.intermediate_act_fn(self.dense(hidden_states))


# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm
class DebertaV2Output(nn.Module):
    """Feed-forward output block: dense projection back to hidden_size, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        # Projects intermediate_size back down to hidden_size.
        self.dense = nn.Linear(config.intermediate_size, width)
        self.LayerNorm = LayerNorm(width, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

    def forward(self, hidden_states, input_tensor):
        """
        Args:
            hidden_states: tensor fed through the dense layer and dropout.
            input_tensor: residual tensor added before normalization.

        Returns:
            `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`.
        """
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
# 从transformers.models.deberta.modeling_deberta.DebertaLayer复制而来，Deberta->DebertaV2
class DebertaV2Layer(nn.Module):
    """One encoder layer: attention sub-layer, intermediate projection and output block."""

    def __init__(self, config):
        super().__init__()
        self.attention = DebertaV2Attention(config)
        self.intermediate = DebertaV2Intermediate(config)
        self.output = DebertaV2Output(config)

    def forward(
        self,
        hidden_states,
        attention_mask,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
        output_attentions=False,
    ):
        """
        Run the layer.

        Returns:
            `layer_output`, or `(layer_output, att_matrix)` when `output_attentions` is truthy.
        """
        attn_result = self.attention(
            hidden_states,
            attention_mask,
            output_attentions=output_attentions,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
        )
        # With output_attentions the attention sub-layer returns a (output, attention matrix) pair.
        if output_attentions:
            attention_output, att_matrix = attn_result
        else:
            attention_output = attn_result

        # The attention output doubles as the residual input of the output block.
        layer_output = self.output(self.intermediate(attention_output), attention_output)

        if not output_attentions:
            return layer_output
        return (layer_output, att_matrix)


class ConvLayer(nn.Module):
    """1-D convolution over the sequence axis, combined with a residual connection and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        # Kernel size defaults to 3, groups to 1 and the activation to "tanh".
        kernel_size = getattr(config, "conv_kernel_size", 3)
        groups = getattr(config, "conv_groups", 1)
        self.conv_act = getattr(config, "conv_act", "tanh")
        self.conv = nn.Conv1d(
            config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups
        )
        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

    def forward(self, hidden_states, residual_states, input_mask):
        """
        Convolve `hidden_states`, add `residual_states` and normalize.

        Args:
            hidden_states: convolution input laid out as [batch_size, seq_length, hidden_size]; it is permuted
                to channels-first for `nn.Conv1d` and back afterwards.
            residual_states: tensor added to the activated convolution output before LayerNorm.
            input_mask: mask whose zero/False entries mark positions to blank out, or None to skip masking.
                Both numeric and boolean masks are accepted.

        Returns:
            The normalized output, multiplied by the mask when one is given.
        """
        out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous()

        if input_mask is not None:
            # `1 - mask` is not defined for bool tensors, so convert those to integers first. Numeric masks
            # take exactly the same path as before.
            numeric_mask = input_mask.to(torch.long) if input_mask.dtype == torch.bool else input_mask
            rmask = (1 - numeric_mask).bool()
            out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
        out = ACT2FN[self.conv_act](self.dropout(out))

        layer_norm_input = residual_states + out
        # `.to(layer_norm_input)` brings the LayerNorm result back to the dtype/device of its input.
        output = self.LayerNorm(layer_norm_input).to(layer_norm_input)

        if input_mask is None:
            return output

        if input_mask.dim() != layer_norm_input.dim():
            # Bring the mask to a trailing singleton dim so it broadcasts over the hidden dimension.
            if input_mask.dim() == 4:
                input_mask = input_mask.squeeze(1).squeeze(1)
            input_mask = input_mask.unsqueeze(2)

        input_mask = input_mask.to(output.dtype)
        return output * input_mask


class DebertaV2Encoder(nn.Module):
    """Modified BertEncoder with relative position bias support"""

    def __init__(self, config):
        super().__init__()

        # One DebertaV2Layer per configured hidden layer.
        self.layer = nn.ModuleList([DebertaV2Layer(config) for _ in range(config.num_hidden_layers)])
        self.relative_attention = getattr(config, "relative_attention", False)

        if self.relative_attention:
            # A missing or < 1 `max_relative_positions` falls back to `max_position_embeddings`.
            configured_max = getattr(config, "max_relative_positions", -1)
            self.max_relative_positions = (
                config.max_position_embeddings if configured_max < 1 else configured_max
            )

            # When position buckets are enabled they determine the table size instead.
            self.position_buckets = getattr(config, "position_buckets", -1)
            half_span = self.position_buckets if self.position_buckets > 0 else self.max_relative_positions
            self.rel_embeddings = nn.Embedding(half_span * 2, config.hidden_size)

        # "|"-separated, case-insensitive list of normalizations for the relative embeddings.
        norm_spec = getattr(config, "norm_rel_ebd", "none").lower()
        self.norm_rel_ebd = [token.strip() for token in norm_spec.split("|")]

        if "layer_norm" in self.norm_rel_ebd:
            self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)

        # The optional convolution branch is only built for a positive kernel size.
        use_conv = getattr(config, "conv_kernel_size", 0) > 0
        self.conv = ConvLayer(config) if use_conv else None

        self.gradient_checkpointing = False

    def get_rel_embedding(self):
        """Return the relative-position embedding table (layer-normalized if configured), or None."""
        if not self.relative_attention:
            return None
        table = self.rel_embeddings.weight
        if "layer_norm" in self.norm_rel_ebd:
            table = self.LayerNorm(table)
        return table

    def get_attention_mask(self, attention_mask):
        """
        Bring `attention_mask` to four dimensions.

        A mask with at most two dims is turned into the outer product of the mask with itself (with a
        singleton dim inserted at position 1); a three-dim mask only gets that singleton dim; anything else
        is returned unchanged.
        """
        rank = attention_mask.dim()
        if rank <= 2:
            extended = attention_mask.unsqueeze(1).unsqueeze(2)
            return extended * extended.squeeze(-2).unsqueeze(-1)
        if rank == 3:
            return attention_mask.unsqueeze(1)
        return attention_mask

    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
        """Return `relative_pos`, building it via `build_relative_position` when needed and enabled."""
        if not (self.relative_attention and relative_pos is None):
            return relative_pos

        key_len = hidden_states.size(-2)
        query_len = key_len if query_states is None else query_states.size(-2)
        return build_relative_position(
            query_len,
            key_len,
            bucket_size=self.position_buckets,
            max_position=self.max_relative_positions,
            device=hidden_states.device,
        )

    def forward(
        self,
        hidden_states,
        attention_mask,
        output_hidden_states=True,
        output_attentions=False,
        query_states=None,
        relative_pos=None,
        return_dict=True,
    ):
        """
        Run all encoder layers.

        Args:
            hidden_states: a tensor, or a `Sequence` whose first element feeds the first layer.
            attention_mask: mask passed through `get_attention_mask` before reaching the layers.
            output_hidden_states: collect the input of every layer plus the final output.
            output_attentions: collect the attention matrix of every layer.
            query_states: optional query states; when given they are replaced by each layer's output.
            relative_pos: optional precomputed relative positions.
            return_dict: return a `BaseModelOutput` instead of a tuple.

        Returns:
            `BaseModelOutput`, or a tuple of the non-None values among
            `(last_hidden_state, all_hidden_states, all_attentions)`.
        """
        # The per-token mask handed to the conv branch: the mask itself when it has at most two dims,
        # otherwise whether anything is set along the second-to-last axis.
        if attention_mask.dim() > 2:
            input_mask = attention_mask.sum(-2) > 0
        else:
            input_mask = attention_mask
        attention_mask = self.get_attention_mask(attention_mask)
        relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)

        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        next_kv = hidden_states[0] if isinstance(hidden_states, Sequence) else hidden_states
        rel_embeddings = self.get_rel_embedding()
        output_states = next_kv

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states += (output_states,)

            if self.gradient_checkpointing and self.training:
                # The checkpointing helper takes positional arguments, in the layer's parameter order.
                output_states = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    next_kv,
                    attention_mask,
                    query_states,
                    relative_pos,
                    rel_embeddings,
                    output_attentions,
                )
            else:
                output_states = layer_module(
                    next_kv,
                    attention_mask,
                    query_states=query_states,
                    relative_pos=relative_pos,
                    rel_embeddings=rel_embeddings,
                    output_attentions=output_attentions,
                )

            if output_attentions:
                output_states, att_m = output_states
                all_attentions += (att_m,)

            # The convolution branch is applied once, right after the first layer.
            if i == 0 and self.conv is not None:
                output_states = self.conv(hidden_states, output_states, input_mask)

            if query_states is None:
                next_kv = output_states
            else:
                query_states = output_states
                if isinstance(hidden_states, Sequence):
                    # Each layer reads its own entry of the sequence; there is none after the last layer.
                    next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None

        if output_hidden_states:
            all_hidden_states += (output_states,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
            )
        return tuple(v for v in (output_states, all_hidden_states, all_attentions) if v is not None)
@torch.jit.script
# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
    """Expand `c2p_pos` to the first three sizes of `query_layer` plus the last size of `relative_pos`."""
    target_shape = [query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)]
    return c2p_pos.expand(target_shape)


@torch.jit.script
# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
    """Expand `c2p_pos` to the first two sizes of `query_layer` followed by `key_layer.size(-2)` twice."""
    target_shape = [query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)]
    return c2p_pos.expand(target_shape)


@torch.jit.script
# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
    # Expand pos_index to: the first two sizes of p2c_att, then pos_index's own second-to-last size,
    # then key_layer's second-to-last size.
    return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))


class DisentangledSelfAttention(nn.Module):
    """
    Disentangled self-attention module

    Parameters:
        config (`DebertaV2Config`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaV2Config`]

    """
    # 初始化函数，接收一个配置对象作为参数
    def __init__(self, config):
        """
        Build the projection layers and dropout modules of the attention block.

        Raises:
            ValueError: if `config.hidden_size` is not a multiple of `config.num_attention_heads`.
        """
        super().__init__()

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        # The per-head size may be overridden through `config.attention_head_size`.
        default_head_size = config.hidden_size // config.num_attention_heads
        self.attention_head_size = getattr(config, "attention_head_size", default_head_size)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Query, key and value projections, registered in this order.
        for proj_name in ("query_proj", "key_proj", "value_proj"):
            setattr(self, proj_name, nn.Linear(config.hidden_size, self.all_head_size, bias=True))

        self.share_att_key = getattr(config, "share_att_key", False)
        configured_pos_att = config.pos_att_type
        self.pos_att_type = [] if configured_pos_att is None else configured_pos_att
        self.relative_attention = getattr(config, "relative_attention", False)

        if self.relative_attention:
            self.position_buckets = getattr(config, "position_buckets", -1)
            # A missing or < 1 `max_relative_positions` falls back to `max_position_embeddings`.
            configured_max = getattr(config, "max_relative_positions", -1)
            self.max_relative_positions = (
                config.max_position_embeddings if configured_max < 1 else configured_max
            )
            # Position buckets, when enabled, take precedence for the embedding size.
            self.pos_ebd_size = (
                self.position_buckets if self.position_buckets > 0 else self.max_relative_positions
            )

            self.pos_dropout = StableDropout(config.hidden_dropout_prob)

            # Dedicated position projections are only created when the attention keys are not shared.
            if not self.share_att_key:
                if "c2p" in self.pos_att_type:
                    self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
                if "p2c" in self.pos_att_type:
                    self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = StableDropout(config.attention_probs_dropout_prob)

    # 将输入张量 x 转置以适应多头注意力的形状
    def transpose_for_scores(self, x, attention_heads):
        """
        Split the last dimension of `x` into `attention_heads` heads and fold the heads into the first dim.

        The last dim is viewed as (attention_heads, -1), dims 1 and 2 are swapped, and the result is
        flattened to three dimensions with the head axis merged into the leading one.
        """
        split = x.view(x.size()[:-1] + (attention_heads, -1))
        heads_first = split.permute(0, 2, 1, 3).contiguous()
        return heads_first.view(-1, split.size(1), split.size(-1))

    # 前向传播函数
    def forward(
        self,
        hidden_states,           # 输入的隐藏状态张量
        attention_mask,          # 注意力掩码张量
        output_attentions=False, # 是否输出注意力
        query_states=None,       # 查询状态张量（可选）
        relative_pos=None,       # 相对位置（可选）
        rel_embeddings=None,     # 相对位置嵌入（可选）
# 从 transformers.models.deberta.modeling_deberta.DebertaEmbeddings 复制而来，修改了 DebertaLayerNorm->LayerNorm
class DebertaV2Embeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        # The padding token id defaults to 0 and the embedding size to the hidden size.
        pad_token_id = getattr(config, "pad_token_id", 0)
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
        self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id)

        # Absolute position embeddings only exist when position-biased input is enabled (the default).
        self.position_biased_input = getattr(config, "position_biased_input", True)
        if not self.position_biased_input:
            self.position_embeddings = None
        else:
            self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)

        if config.type_vocab_size > 0:
            self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)

        # A bias-free projection bridges the embedding size and the hidden size when they differ.
        if self.embedding_size != config.hidden_size:
            self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)

        self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = StableDropout(config.hidden_dropout_prob)
        self.config = config

        # position_ids has shape (1, max_position_embeddings); it is registered as a non-persistent buffer.
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
        """
        Compute the input embeddings.

        Args:
            input_ids: token ids; used for the word-embedding lookup when `inputs_embeds` is None.
            token_type_ids: segment ids; default to zeros of the input shape.
            position_ids: position ids; default to the first `seq_length` entries of the buffer.
            mask: optional mask multiplied into the normalized embeddings.
            inputs_embeds: precomputed embeddings used instead of `input_ids`. The tensor is never modified
                in place.

        Returns:
            The embeddings after optional projection, LayerNorm, optional masking and dropout.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            # Drop the trailing embedding dimension.
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        if self.position_embeddings is not None:
            position_embeddings = self.position_embeddings(position_ids.long())
        else:
            position_embeddings = torch.zeros_like(inputs_embeds)

        embeddings = inputs_embeds
        # Out-of-place additions: `embeddings` may alias a caller-provided `inputs_embeds`. An in-place `+=`
        # would overwrite the caller's tensor and fails outright for a leaf tensor that requires grad.
        if self.position_biased_input:
            embeddings = embeddings + position_embeddings
        if self.config.type_vocab_size > 0:
            token_type_embeddings = self.token_type_embeddings(token_type_ids)
            embeddings = embeddings + token_type_embeddings

        if self.embedding_size != self.config.hidden_size:
            embeddings = self.embed_proj(embeddings)

        embeddings = self.LayerNorm(embeddings)

        if mask is not None:
            if mask.dim() != embeddings.dim():
                # Bring the mask to a trailing singleton dim so it broadcasts over the hidden dimension.
                if mask.dim() == 4:
                    mask = mask.squeeze(1).squeeze(1)
                mask = mask.unsqueeze(2)
            mask = mask.to(embeddings.dtype)

            embeddings = embeddings * mask

        embeddings = self.dropout(embeddings)
        return embeddings
# 从transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel复制而来，将Deberta改为DebertaV2
class DebertaV2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading
    pretrained models.
    """

    config_class = DebertaV2Config
    base_model_prefix = "deberta"
    # Checkpoint keys that are ignored when they show up unexpectedly at load time.
    _keys_to_ignore_on_load_unexpected = ["position_embeddings"]
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        # Slightly different from the TF version: a normal distribution is used for initialization,
        # cf https://github.com/pytorch/pytorch/pull/5617
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            # The embedding row of the padding index is zeroed.
            module.weight.data[module.padding_idx].zero_()
                

# Shared model-level documentation text; it is passed to `add_start_docstrings` on the model class below.
# The string content is runtime data and is kept verbatim.
DEBERTA_START_DOCSTRING = r"""
    DeBERTa模型由何鹏程、刘晓东、高建峰、陈伟柱在论文《DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention》中提出。它在BERT/RoBERTa的基础上进行了两项改进，即解耦注意力和增强的掩码解码器。通过这两项改进，
    在使用80GB预训练数据的大多数任务上超越了BERT/RoBERTa。

    这个模型也是PyTorch的torch.nn.Module子类。
    使用时可以像普通的PyTorch Module一样使用，并参考PyTorch文档处理一切一般使用和行为相关的事项。
    

    参数:
        config ([`DebertaV2Config`]): 包含模型所有参数的配置类。
            使用配置文件初始化模型时不会加载模型的权重，只会加载配置信息。
            查看 [`~PreTrainedModel.from_pretrained`] 方法来加载模型权重。
"""

# Documentation template for the `forward` inputs. The `{0}` placeholders are filled via `.format(...)` with
# the input shape (e.g. "batch_size, sequence_length") where the template is attached to a forward method.
# NOTE(review): the `# ...` lines inside this raw string are string content, not Python comments, so they
# end up in the rendered docstring. The string is runtime data and is kept verbatim here.
DEBERTA_INPUTS_DOCSTRING = r"""
    # 输入
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            # 输入序列的token索引，在词汇表中
            Indices of input sequence tokens in the vocabulary.
            
            # 可以使用`AutoTokenizer`获取这些索引。详见`PreTrainedTokenizer.encode`和`PreTrainedTokenizer.__call__`
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            # 注意力掩码，避免在填充的token索引上进行注意力计算
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            # 分段token索引，指示输入的第一部分和第二部分。索引在`[0, 1]`中选择：

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            # 每个输入序列token在位置嵌入中的位置索引。在范围`[0, config.max_position_embeddings - 1]`中选择。

            [What are position IDs?](../glossary#position-ids)
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            # 可选项，直接传递嵌入表示，而不是传递`input_ids`。在想要更多控制如何将`input_ids`索引转换为关联向量时有用。

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。有关更多详细信息，请参见返回张量下的`attentions`。
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。有关更多详细信息，请参见返回张量下的`hidden_states`。
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            # 是否返回`~utils.ModelOutput`而不是普通元组。
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    DEBERTA_START_DOCSTRING,
)
# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2
class DebertaV2Model(DebertaV2PreTrainedModel):
    # NOTE: the class and `forward` docstrings are assembled by the decorators, so the
    # explanations below are deliberately written as comments rather than docstrings.

    def __init__(self, config):
        super().__init__(config)

        # Embedding stack followed by the transformer encoder.
        self.embeddings = DebertaV2Embeddings(config)
        self.encoder = DebertaV2Encoder(config)
        # Number of passes through the last encoder layer; values <= 1 disable the
        # extra refinement loop in `forward`.
        self.z_steps = 0
        self.config = config
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the word-embedding module of the embedding stack."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, new_embeddings):
        """Replace the word-embedding module of the embedding stack."""
        self.embeddings.word_embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel

        Raises:
            NotImplementedError: always; pruning is not implemented for this model.
        """
        raise NotImplementedError("The prune function is not implemented in DeBERTa model.")

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        # Fall back to the configuration for every flag the caller left unset.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be provided.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            # Drop the trailing embedding dimension to get the token grid shape.
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # Defaults: attend to every position, and put every token in segment 0.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            mask=attention_mask,
            inputs_embeds=inputs_embeds,
        )

        # Hidden states are always requested from the encoder because the code below
        # indexes into them; whether they are *returned* is decided at the end.
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )
        encoded_layers = encoder_outputs[1]

        if self.z_steps > 1:
            # Re-run the last encoder layer `z_steps - 1` more times, feeding the
            # previous result back in as the query states.
            hidden_states = encoded_layers[-2]
            layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
            query_states = encoded_layers[-1]
            rel_embeddings = self.encoder.get_rel_embedding()
            attention_mask = self.encoder.get_attention_mask(attention_mask)
            rel_pos = self.encoder.get_rel_pos(embedding_output)
            for layer in layers[1:]:
                query_states = layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=False,
                    query_states=query_states,
                    relative_pos=rel_pos,
                    rel_embeddings=rel_embeddings,
                )
                # NOTE(review): assumes the encoder hands back its hidden states as a
                # mutable list - confirm against DebertaV2Encoder.
                encoded_layers.append(query_states)

        sequence_output = encoded_layers[-1]

        if not return_dict:
            # Skip the hidden-states entry unless the caller asked for it.
            return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
            attentions=encoder_outputs.attentions,
        )
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
    # Decoder parameter names declared as tied weights (consumed by the base-class machinery).
    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        # Backbone encoder plus the masked-language-modeling head.
        self.deberta = DebertaV2Model(config)
        self.cls = DebertaV2OnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """Return the decoder layer of the MLM head."""
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        """Install `new_embeddings` as the decoder layer of the MLM head."""
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="[MASK]",
    )
    # Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM.forward with Deberta->DebertaV2
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        # An unset `return_dict` defers to the model configuration.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Per-token vocabulary scores computed from the backbone's sequence output.
        prediction_scores = self.cls(backbone_outputs[0])

        masked_lm_loss = None
        if labels is not None:
            # CrossEntropyLoss skips targets equal to -100 by default.
            criterion = CrossEntropyLoss()
            flat_scores = prediction_scores.view(-1, self.config.vocab_size)
            masked_lm_loss = criterion(flat_scores, labels.view(-1))

        if return_dict:
            return MaskedLMOutput(
                loss=masked_lm_loss,
                logits=prediction_scores,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple form: optional loss first, then the scores and any extra backbone outputs.
        tuple_output = (prediction_scores,) + backbone_outputs[1:]
        if masked_lm_loss is None:
            return tuple_output
        return (masked_lm_loss,) + tuple_output
# 从transformers.models.deberta.modeling_deberta.DebertaPredictionHeadTransform复制而来，将Deberta改为DebertaV2
class DebertaV2PredictionHeadTransform(nn.Module):
    """Dense projection, activation and LayerNorm applied ahead of the LM decoder."""

    def __init__(self, config):
        super().__init__()
        # Use the dedicated embedding size when the config defines one, else the hidden size.
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)

        self.dense = nn.Linear(config.hidden_size, self.embedding_size)

        # The activation is given either as a name to look up in ACT2FN or as a callable.
        activation = config.hidden_act
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

        self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        """Apply projection, activation and layer normalization, in that order."""
        projected = self.dense(hidden_states)
        activated = self.transform_act_fn(projected)
        return self.LayerNorm(activated)


# 从transformers.models.deberta.modeling_deberta.DebertaLMPredictionHead复制而来，将Deberta改为DebertaV2
class DebertaV2LMPredictionHead(nn.Module):
    """Language-modeling head: head transform followed by a projection to the vocabulary."""

    def __init__(self, config):
        super().__init__()
        self.transform = DebertaV2PredictionHeadTransform(config)
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)

        vocab_size = config.vocab_size
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(self.embedding_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Transform the hidden states, then project them to vocabulary scores."""
        transformed = self.transform(hidden_states)
        return self.decoder(transformed)


# 从transformers.models.bert.BertOnlyMLMHead复制而来，将bert改为deberta
class DebertaV2OnlyMLMHead(nn.Module):
    """Thin wrapper exposing the LM prediction head under the `predictions` attribute."""

    def __init__(self, config):
        super().__init__()
        self.predictions = DebertaV2LMPredictionHead(config)

    def forward(self, sequence_output):
        """Return the prediction scores produced by the wrapped head."""
        return self.predictions(sequence_output)


@add_start_docstrings(
    """
    在顶部有一个序列分类/回归头部的DeBERTa模型变换器（池化输出之上的线性层），例如用于GLUE任务。
    """,
    DEBERTA_START_DOCSTRING,
)
class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
    # 用于序列分类的DeBERTa模型变换器，继承自DebertaV2PreTrainedModel
    # 初始化函数，接受一个配置参数config作为输入
    def __init__(self, config):
        """Build the backbone, the context pooler, the classifier and its dropout from `config`."""
        super().__init__(config)

        # Default to binary classification when the config does not define `num_labels`.
        self.num_labels = getattr(config, "num_labels", 2)

        self.deberta = DebertaV2Model(config)
        self.pooler = ContextPooler(config)

        # The classifier consumes whatever width the pooler reports.
        self.classifier = nn.Linear(self.pooler.output_dim, self.num_labels)

        # Prefer the head-specific dropout rate; otherwise reuse the hidden dropout rate.
        cls_dropout = getattr(config, "cls_dropout", None)
        if cls_dropout is None:
            cls_dropout = self.config.hidden_dropout_prob
        self.dropout = StableDropout(cls_dropout)

        # Initialize weights and apply final processing
        self.post_init()

    # 返回DebertaV2Model对象的输入嵌入层
    def get_input_embeddings(self):
        """Delegate to the backbone and return its input embeddings."""
        backbone = self.deberta
        return backbone.get_input_embeddings()

    # 设置DebertaV2Model对象的输入嵌入层为新的嵌入层new_embeddings
    def set_input_embeddings(self, new_embeddings):
        """Delegate to the backbone to install `new_embeddings` as its input embeddings."""
        backbone = self.deberta
        backbone.set_input_embeddings(new_embeddings)

    # 使用DebertaForSequenceClassification.forward的文档字符串作为注释
    # 包括Deberta输入的描述和代码示例的描述
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # 从transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification.forward复制并修改为DebertaV2
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    """
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2
class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Backbone encoder, dropout on its output, and a per-token linear classifier.
        self.deberta = DebertaV2Model(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # An unset `return_dict` defers to the model configuration.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First backbone output is the per-token sequence representation.
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # Flatten tokens so every position is scored as an independent classification.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            # Tuple form: optional loss first, then logits and any extra backbone outputs.
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )

@add_start_docstrings(
    """
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    DEBERTA_START_DOCSTRING,
)
class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Backbone encoder and a linear layer producing `config.num_labels` scores per token;
        # `forward` splits those scores into start and end logits.
        self.deberta = DebertaV2Model(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        qa_target_start_index=_QA_TARGET_START_INDEX,
        qa_target_end_index=_QA_TARGET_END_INDEX,
    )
    # Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering.forward with Deberta->DebertaV2
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # An unset `return_dict` defers to the model configuration.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Split the last dimension into one-channel chunks and drop that dimension:
        # the first chunk holds the start logits, the second the end logits.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # Labels may arrive with an extra trailing dimension (e.g. after multi-GPU
            # gathering); squeeze it away so they match the logits.
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Positions beyond the sequence are clamped to `sequence_length`, which is
            # then used as the index the loss ignores.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            # Tuple form: optional loss first, then both logits and any extra backbone outputs.
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # Default to 2 when the config does not define `num_labels`.
        num_labels = getattr(config, "num_labels", 2)
        self.num_labels = num_labels

        self.deberta = DebertaV2Model(config)
        self.pooler = ContextPooler(config)
        output_dim = self.pooler.output_dim

        # One score per (example, choice) pair; `forward` regroups the scores per example.
        self.classifier = nn.Linear(output_dim, 1)
        # Prefer the head-specific dropout rate; otherwise reuse the hidden dropout rate.
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = StableDropout(drop_out)

        self.init_weights()

    def get_input_embeddings(self):
        """Return the input embeddings of the backbone model."""
        return self.deberta.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        """Install `new_embeddings` as the input embeddings of the backbone model."""
        self.deberta.set_input_embeddings(new_embeddings)

    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # An unset `return_dict` defers to the model configuration.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The number of choices is the second dimension of whichever input was given.
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Fold the choice dimension into the batch dimension so the backbone sees
        # one ordinary sequence per (example, choice) pair.
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.deberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Pool the sequence output, regularize it, and score each flattened sequence.
        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # Regroup the per-sequence scores so each row holds the scores of one example's choices.
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            # Tuple form: optional loss first, then logits and any extra backbone outputs.
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`.\models\deberta_v2\modeling_tf_deberta_v2.py`

"""
TF 2.0 DeBERTa-v2 model.

"""

# 导入所需的模块和库
from __future__ import annotations
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFMaskedLMOutput,
    TFMultipleChoiceModelOutput,
    TFQuestionAnsweringModelOutput,
    TFSequenceClassifierOutput,
    TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
    TFMaskedLanguageModelingLoss,
    TFModelInputType,
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
    TFQuestionAnsweringLoss,
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
    keras,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_deberta_v2 import DebertaV2Config

# 获取日志记录器
logger = logging.get_logger(__name__)

# 模型配置文档信息
_CONFIG_FOR_DOC = "DebertaV2Config"
_CHECKPOINT_FOR_DOC = "kamalkraj/deberta-v2-xlarge"

# 预训练模型存档列表
TF_DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "kamalkraj/deberta-v2-xlarge",
    # See all DeBERTa models at https://huggingface.co/models?filter=deberta-v2
]

# 自定义的上下文池化层，继承自Keras层
# 从transformers.models.deberta.modeling_tf_deberta.TFDebertaContextPooler中复制并修改为TFDebertaV2ContextPooler
class TFDebertaV2ContextPooler(keras.layers.Layer):
    """Pools a sequence by projecting the hidden state of its first token."""

    def __init__(self, config: DebertaV2Config, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        # Projection applied to the first-token state, and the dropout applied before it.
        self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense")
        self.dropout = TFDebertaV2StableDropout(config.pooler_dropout, name="dropout")

    def call(self, hidden_states, training: bool = False):
        """Return the activated projection of the first token's hidden state."""
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token = self.dropout(hidden_states[:, 0], training=training)
        projected = self.dense(first_token)
        activation = get_tf_activation(self.config.pooler_hidden_act)
        return activation(projected)

    @property
    def output_dim(self) -> int:
        """Output width reported to downstream heads (`config.hidden_size`)."""
        return self.config.hidden_size

    def build(self, input_shape=None):
        """Build the sub-layers once, each under a name scope matching its own name."""
        if self.built:
            return
        self.built = True

        dense_layer = getattr(self, "dense", None)
        if dense_layer is not None:
            with tf.name_scope(dense_layer.name):
                dense_layer.build([None, None, self.config.pooler_hidden_size])

        dropout_layer = getattr(self, "dropout", None)
        if dropout_layer is not None:
            with tf.name_scope(dropout_layer.name):
                dropout_layer.build(None)
# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaXSoftmax 复制的 TFDebertaV2XSoftmax 类，用于 Deberta 到 DebertaV2 的转换
class TFDebertaV2XSoftmax(keras.layers.Layer):
    """
    Masked Softmax which is optimized for saving memory

    Args:
        axis (int): The dimension that will apply softmax (defaults to the last one).

    Call args:
        inputs (`tf.Tensor`): The input tensor that will apply softmax.
        mask (`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
    """

    def __init__(self, axis=-1, **kwargs):
        super().__init__(**kwargs)
        self.axis = axis

    def call(self, inputs: tf.Tensor, mask: tf.Tensor):
        # True wherever the mask is 0, i.e. at the positions to ignore.
        ignored = tf.logical_not(tf.cast(mask, tf.bool))
        # Ignored positions get -inf before the softmax ...
        masked_scores = tf.where(ignored, float("-inf"), inputs)
        probabilities = stable_softmax(masked_scores, self.axis)
        # ... and are forced to exactly 0 afterwards.
        return tf.where(ignored, 0.0, probabilities)


# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaStableDropout 复制的 TFDebertaV2StableDropout 类，用于 Deberta 到 DebertaV2 的转换
class TFDebertaV2StableDropout(keras.layers.Layer):
    """
    Dropout layer implemented with a hand-written gradient.

    Args:
        drop_prob (float): the dropout probability
    """

    def __init__(self, drop_prob, **kwargs):
        super().__init__(**kwargs)
        self.drop_prob = drop_prob

    @tf.custom_gradient
    def xdropout(self, inputs):
        """
        Zero out randomly selected elements of `inputs` and multiply the result by `1 / (1 - drop_prob)`.

        Returns the output together with its gradient function, as `tf.custom_gradient` expects. The
        gradient reuses the same mask and scale as the forward pass.
        """
        keep_distribution = tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob)
        # Bernoulli samples mark kept elements; invert them to get the "dropped" mask.
        drop_mask = tf.cast(
            1 - keep_distribution.sample(sample_shape=shape_list(inputs)),
            tf.bool,
        )
        rescale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=tf.float32)
        if self.drop_prob > 0:
            inputs = tf.where(drop_mask, 0.0, inputs) * rescale

        def grad(upstream):
            if self.drop_prob > 0:
                return tf.where(drop_mask, 0.0, upstream) * rescale
            return upstream

        return inputs, grad

    def call(self, inputs: tf.Tensor, training: tf.Tensor = False):
        # Dropout is only active while training; otherwise the input passes through untouched.
        if not training:
            return inputs
        return self.xdropout(inputs)


# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaSelfOutput 复制的 TFDebertaV2SelfOutput 类，用于 Deberta 到 DebertaV2 的转换
class TFDebertaV2SelfOutput(keras.layers.Layer):
    """Dense projection and dropout, followed by a residual add and layer normalization."""

    def __init__(self, config: DebertaV2Config, **kwargs):
        super().__init__(**kwargs)
        # Projection whose output width is config.hidden_size.
        self.dense = keras.layers.Dense(config.hidden_size, name="dense")
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
        self.config = config

    def call(self, hidden_states, input_tensor, training: bool = False):
        """
        Project `hidden_states`, apply dropout, add `input_tensor` and layer-normalize the sum.

        Args:
            hidden_states: Tensor fed through the dense layer.
            input_tensor: Tensor added to the projected states before normalization (residual input).
            training (bool): Forwarded to the dropout layer.

        Returns:
            The normalized tensor.
        """
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

    def build(self, input_shape=None):
        """Build the sub-layers once, each under its own name scope; `input_shape` is not read."""
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaAttention with Deberta->DebertaV2
class TFDebertaV2Attention(keras.layers.Layer):
    """Self-attention sub-layer followed by its output projection (`TFDebertaV2SelfOutput`)."""

    def __init__(self, config: DebertaV2Config, **kwargs):
        super().__init__(**kwargs)
        # Attention module, registered under the name "self".
        self.self = TFDebertaV2DisentangledSelfAttention(config, name="self")
        # Output module, registered under the name "output".
        self.dense_output = TFDebertaV2SelfOutput(config, name="output")
        self.config = config

    def call(
        self,
        input_tensor: tf.Tensor,
        attention_mask: tf.Tensor,
        query_states: tf.Tensor = None,
        relative_pos: tf.Tensor = None,
        rel_embeddings: tf.Tensor = None,
        output_attentions: bool = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """Run attention, then the output module; returns `(attention_output, *extra attention outputs)`."""
        self_outputs = self.self(
            hidden_states=input_tensor,
            attention_mask=attention_mask,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
            output_attentions=output_attentions,
            training=training,
        )
        # The residual input is the query states when given, otherwise the layer input itself.
        residual = input_tensor if query_states is None else query_states
        attention_output = self.dense_output(
            hidden_states=self_outputs[0], input_tensor=residual, training=training
        )
        # Anything the attention module returned after its first element is passed along unchanged.
        return (attention_output,) + self_outputs[1:]

    def build(self, input_shape=None):
        """Build both sub-layers once, each under its own name scope; `input_shape` is not read."""
        if self.built:
            return
        self.built = True
        for attr_name in ("self", "dense_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)


# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaIntermediate with Deberta->DebertaV2
class TFDebertaV2Intermediate(keras.layers.Layer):
    """Dense layer of width `config.intermediate_size` followed by the configured activation."""

    def __init__(self, config: DebertaV2Config, **kwargs):
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )

        # `hidden_act` may be an activation name (resolved here) or an already-callable activation.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        projected = self.dense(inputs=hidden_states)
        return self.intermediate_act_fn(projected)

    def build(self, input_shape=None):
        """Build the dense layer once under its own name scope; `input_shape` is not read."""
        if self.built:
            return
        self.built = True
        dense_layer = getattr(self, "dense", None)
        if dense_layer is not None:
            with tf.name_scope(dense_layer.name):
                dense_layer.build([None, None, self.config.hidden_size])
# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaOutput 复制而来，将 Deberta 修改为 DebertaV2
class TFDebertaV2Output(keras.layers.Layer):
    """Dense projection to `config.hidden_size`, dropout, then residual add and layer normalization."""

    def __init__(self, config: DebertaV2Config, **kwargs):
        super().__init__(**kwargs)

        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
        self.config = config

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        projected = self.dense(inputs=hidden_states)
        projected = self.dropout(projected, training=training)
        # The residual (`input_tensor`) is added before normalization.
        return self.LayerNorm(projected + input_tensor)

    def build(self, input_shape=None):
        """Build the sub-layers once, each under its own name scope; `input_shape` is not read."""
        if self.built:
            return
        self.built = True

        dense_layer = getattr(self, "dense", None)
        if dense_layer is not None:
            with tf.name_scope(dense_layer.name):
                # The dense layer is built for inputs whose last dimension is intermediate_size.
                dense_layer.build([None, None, self.config.intermediate_size])

        norm_layer = getattr(self, "LayerNorm", None)
        if norm_layer is not None:
            with tf.name_scope(norm_layer.name):
                norm_layer.build([None, None, self.config.hidden_size])

        dropout_layer = getattr(self, "dropout", None)
        if dropout_layer is not None:
            with tf.name_scope(dropout_layer.name):
                dropout_layer.build(None)


# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaLayer 复制而来，将 Deberta 修改为 DebertaV2
class TFDebertaV2Layer(keras.layers.Layer):
    """One layer made of an attention block, an intermediate block and an output block."""

    def __init__(self, config: DebertaV2Config, **kwargs):
        super().__init__(**kwargs)

        self.attention = TFDebertaV2Attention(config, name="attention")
        self.intermediate = TFDebertaV2Intermediate(config, name="intermediate")
        # Stored as `bert_output`, but registered under the layer name "output".
        self.bert_output = TFDebertaV2Output(config, name="output")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        query_states: tf.Tensor = None,
        relative_pos: tf.Tensor = None,
        rel_embeddings: tf.Tensor = None,
        output_attentions: bool = False,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """Run the three blocks in sequence; returns `(layer_output, *extra attention outputs)`."""
        attention_outputs = self.attention(
            input_tensor=hidden_states,
            attention_mask=attention_mask,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
            output_attentions=output_attentions,
            training=training,
        )
        attention_output = attention_outputs[0]
        intermediate_output = self.intermediate(hidden_states=attention_output)
        # The attention output doubles as the residual input of the output block.
        layer_output = self.bert_output(
            hidden_states=intermediate_output,
            input_tensor=attention_output,
            training=training,
        )
        return (layer_output,) + attention_outputs[1:]

    def build(self, input_shape=None):
        """Build the three sub-layers once, each under its own name scope; `input_shape` is not read."""
        if self.built:
            return
        self.built = True
        for attr_name in ("attention", "intermediate", "bert_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
# 定义 TFDebertaV2ConvLayer 类，继承自 keras.layers.Layer
class TFDebertaV2ConvLayer(keras.layers.Layer):
    """Convolution over `hidden_states`, combined with `residual_states` and layer-normalized."""

    def __init__(self, config: DebertaV2Config, **kwargs):
        super().__init__(**kwargs)

        # Kernel size and activation fall back to 3 and "tanh" when the config does not define them.
        self.kernel_size = getattr(config, "conv_kernel_size", 3)
        self.conv_act = get_tf_activation(getattr(config, "conv_act", "tanh"))
        # Padding applied on each side of the convolved axis.
        self.padding = (self.kernel_size - 1) // 2
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")
        self.config = config

    def build(self, input_shape=None):
        """Create the convolution kernel and bias and build the sub-layers; `input_shape` is not read."""
        if self.built:
            return
        self.built = True
        # Kernel and bias are created under the "conv" name scope.
        with tf.name_scope("conv"):
            self.conv_kernel = self.add_weight(
                name="kernel",
                shape=[self.kernel_size, self.config.hidden_size, self.config.hidden_size],
                initializer=get_initializer(self.config.initializer_range),
            )
            self.conv_bias = self.add_weight(
                name="bias", shape=[self.config.hidden_size], initializer=tf.zeros_initializer()
            )
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)

    def call(
        self, hidden_states: tf.Tensor, residual_states: tf.Tensor, input_mask: tf.Tensor, training: bool = False
    ) -> tf.Tensor:
        """
        Convolve `hidden_states`, add `residual_states`, layer-normalize and re-apply the mask.

        Args:
            hidden_states: Tensor that is convolved.
            residual_states: Tensor added to the activated convolution output before normalization.
            input_mask: Mask whose zero entries are blanked out, or `None` to skip all masking.
            training (bool): Forwarded to the dropout layer.

        Returns:
            The normalized tensor, multiplied by the mask when one is given.
        """
        # conv2d is run with a dummy axis inserted at position 1 of the input and 0 of the kernel;
        # explicit padding is applied only along the third axis.
        out = tf.nn.conv2d(
            tf.expand_dims(hidden_states, 1),
            tf.expand_dims(self.conv_kernel, 0),
            strides=1,
            padding=[[0, 0], [0, 0], [self.padding, self.padding], [0, 0]],
        )
        out = tf.squeeze(tf.nn.bias_add(out, self.conv_bias), 1)

        # Zero the convolution output wherever the mask is 0. Guarded so that a missing mask reaches the
        # `input_mask is None` branch below instead of failing on `1 - None`.
        if input_mask is not None:
            rmask = tf.cast(1 - input_mask, tf.bool)
            out = tf.where(tf.broadcast_to(tf.expand_dims(rmask, -1), shape_list(out)), 0.0, out)

        out = self.dropout(out, training=training)
        out = self.conv_act(out)

        layer_norm_input = residual_states + out
        output = self.LayerNorm(layer_norm_input)

        if input_mask is None:
            return output

        # Bring the mask to the rank of the hidden states before multiplying.
        if len(shape_list(input_mask)) != len(shape_list(layer_norm_input)):
            if len(shape_list(input_mask)) == 4:
                input_mask = tf.squeeze(tf.squeeze(input_mask, axis=1), axis=1)
            input_mask = tf.cast(tf.expand_dims(input_mask, axis=2), tf.float32)

        return output * input_mask
    # 初始化函数，接受一个 DebertaV2Config 类型的配置对象和其他关键字参数
    def __init__(self, config: DebertaV2Config, **kwargs):
        """Create the stack of `TFDebertaV2Layer`s plus the optional relative-position and conv parts."""
        # NOTE(review): no `class` statement precedes this method in the listing, although its
        # attributes (`layer`, `rel_embeddings`, `conv`) differ from the conv layer defined just
        # above. Presumably the enclosing class header was lost; confirm against the original file.
        super().__init__(**kwargs)

        self.layer = [TFDebertaV2Layer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]

        self.relative_attention = getattr(config, "relative_attention", False)
        self.config = config

        if self.relative_attention:
            # A value below 1 means "use config.max_position_embeddings".
            configured_max = getattr(config, "max_relative_positions", -1)
            self.max_relative_positions = (
                config.max_position_embeddings if configured_max < 1 else configured_max
            )

            # The embedding table size is twice the bucket count when buckets are enabled,
            # otherwise twice the maximum relative position.
            self.position_buckets = getattr(config, "position_buckets", -1)
            if self.position_buckets > 0:
                self.pos_ebd_size = self.position_buckets * 2
            else:
                self.pos_ebd_size = self.max_relative_positions * 2

        # `norm_rel_ebd` is a "|"-separated option string; it is lower-cased and split into a list.
        raw_norm_option = getattr(config, "norm_rel_ebd", "none")
        self.norm_rel_ebd = [part.strip() for part in raw_norm_option.lower().split("|")]

        if "layer_norm" in self.norm_rel_ebd:
            self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")

        # The conv layer exists only when the config has a positive conv_kernel_size.
        if getattr(config, "conv_kernel_size", 0) > 0:
            self.conv = TFDebertaV2ConvLayer(config, name="conv")
        else:
            self.conv = None

    # 构建函数，用于构建模型的层次结构
    def build(self, input_shape=None):
        """Create the relative-position table (if enabled) and build every sub-layer once.

        `input_shape` is accepted but never read by this method.
        """
        if self.built:
            return
        self.built = True

        if self.relative_attention:
            self.rel_embeddings = self.add_weight(
                name="rel_embeddings.weight",
                shape=[self.pos_ebd_size, self.config.hidden_size],
                initializer=get_initializer(self.config.initializer_range),
            )

        conv_layer = getattr(self, "conv", None)
        if conv_layer is not None:
            with tf.name_scope(conv_layer.name):
                conv_layer.build(None)

        norm_layer = getattr(self, "LayerNorm", None)
        if norm_layer is not None:
            with tf.name_scope(norm_layer.name):
                norm_layer.build([None, self.config.hidden_size])

        stacked_layers = getattr(self, "layer", None)
        if stacked_layers is not None:
            for sublayer in stacked_layers:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)

    # 获取相对位置嵌入向量
    def get_rel_embedding(self):
        """Return the relative-position embeddings (layer-normalized if configured), or None when
        relative attention is disabled."""
        if not self.relative_attention:
            return None

        rel_embeddings = self.rel_embeddings
        if rel_embeddings is not None and "layer_norm" in self.norm_rel_ebd:
            rel_embeddings = self.LayerNorm(rel_embeddings)
        return rel_embeddings

    # 获取注意力掩码
    def get_attention_mask(self, attention_mask):
        """Expand `attention_mask` according to its rank; masks of any other rank are returned as-is."""
        mask_rank = len(shape_list(attention_mask))

        if mask_rank <= 2:
            # Insert two axes, then multiply by the transposed-style copy to form a pairwise mask.
            extended = tf.expand_dims(tf.expand_dims(attention_mask, 1), 2)
            pairwise = extended * tf.expand_dims(tf.squeeze(extended, -2), -1)
            return tf.cast(pairwise, tf.uint8)

        if mask_rank == 3:
            return tf.expand_dims(attention_mask, 1)

        return attention_mask
    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
        """
        Return `relative_pos`, building it first when relative attention is on and none was given.

        Args:
            hidden_states: Tensor whose second-to-last dimension is used as the key length.
            query_states: Optional tensor whose second-to-last dimension is used as the query length;
                `hidden_states` is used instead when this is None.
            relative_pos: Precomputed relative positions; returned unchanged when not None.

        Returns:
            The given or newly built relative positions, or None when relative attention is disabled
            and nothing was passed in.
        """
        if self.relative_attention and relative_pos is None:
            q = shape_list(query_states)[-2] if query_states is not None else shape_list(hidden_states)[-2]
            relative_pos = build_relative_position(
                q,
                shape_list(hidden_states)[-2],
                bucket_size=self.position_buckets,
                max_position=self.max_relative_positions,
            )

        return relative_pos


def call(
    self,
    hidden_states: tf.Tensor,
    attention_mask: tf.Tensor,
    query_states: tf.Tensor = None,
    relative_pos: tf.Tensor = None,
    output_attentions: bool = False,
    output_hidden_states: bool = False,
    return_dict: bool = True,
    training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
    """
    Run `hidden_states` through every layer in `self.layer`.

    Returns a `TFBaseModelOutput`, or, when `return_dict` is False, a tuple holding the last hidden
    state followed by whichever of the hidden-state and attention tuples were requested.
    """
    # NOTE(review): this function takes `self` and uses `self.layer`, `self.conv` and the
    # `get_*` helpers, so it is presumably a method whose class-level indentation was lost
    # in this listing; confirm and re-indent under its class.

    # Masks of rank > 2 are collapsed over their second-to-last axis to get a per-position mask.
    if len(shape_list(attention_mask)) > 2:
        input_mask = tf.cast(tf.math.reduce_sum(attention_mask, axis=-2) > 0, dtype=tf.uint8)
    else:
        input_mask = attention_mask

    all_hidden_states = () if output_hidden_states else None
    all_attentions = () if output_attentions else None

    attention_mask = self.get_attention_mask(attention_mask)
    relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)

    next_kv = hidden_states
    rel_embeddings = self.get_rel_embedding()
    output_states = next_kv

    for layer_idx, layer_module in enumerate(self.layer):
        # Record the input of each layer; the final output is appended after the loop.
        if output_hidden_states:
            all_hidden_states += (output_states,)

        layer_outputs = layer_module(
            hidden_states=next_kv,
            attention_mask=attention_mask,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
            output_attentions=output_attentions,
            training=training,
        )
        output_states = layer_outputs[0]

        # The conv layer, when present, is applied once, right after the first layer. It receives
        # the original `hidden_states` and the first layer's output as its residual.
        if layer_idx == 0 and self.conv is not None:
            output_states = self.conv(hidden_states, output_states, input_mask)

        next_kv = output_states

        if output_attentions:
            all_attentions += (layer_outputs[1],)

    if output_hidden_states:
        all_hidden_states += (output_states,)

    if not return_dict:
        candidates = [output_states, all_hidden_states, all_attentions]
        return tuple(item for item in candidates if item is not None)

    return TFBaseModelOutput(
        last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions
    )
# 根据相对位置、桶大小和最大位置生成日志桶位置
def make_log_bucket_position(relative_pos, bucket_size, max_position):
    """
    Map relative positions to bucket positions.

    Positions whose magnitude is at most `bucket_size // 2` keep their value; larger ones are
    replaced by a signed, logarithmically scaled value. The result is cast to `tf.int32`.
    """
    half_bucket = bucket_size // 2
    direction = tf.math.sign(relative_pos)

    # Inside the open interval (-half, half) the magnitude is pinned to half - 1.
    within_half = (relative_pos < half_bucket) & (relative_pos > -half_bucket)
    abs_pos = tf.where(within_half, half_bucket - 1, tf.math.abs(relative_pos))

    log_ratio = tf.cast(tf.math.log(abs_pos / half_bucket), tf.float32) / tf.math.log(
        (max_position - 1) / half_bucket
    )
    log_pos = tf.math.ceil(log_ratio * (half_bucket - 1)) + half_bucket

    linear_pos = tf.cast(relative_pos, tf.float32)
    signed_log_pos = log_pos * tf.cast(direction, tf.float32)
    return tf.cast(tf.where(abs_pos <= half_bucket, linear_pos, signed_log_pos), tf.int32)


# 构建相对位置张量
def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1):
    """
    Build relative position according to the query and key

    We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key
    \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q -
    P_k\\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key
        bucket_size (int): the size of position bucket
        max_position (int): the maximum allowed absolute position

    Return:
        `tf.Tensor`: A tensor with shape [1, query_size, key_size]

    """
    query_ids = tf.range(query_size, dtype=tf.int32)
    key_ids = tf.range(key_size, dtype=tf.int32)

    # One row of key indices per query index, subtracted from the query index column.
    tiled_keys = tf.tile(tf.expand_dims(key_ids, axis=0), [shape_list(query_ids)[0], 1])
    rel_pos_ids = query_ids[:, None] - tiled_keys

    # Bucketing is applied only when both the bucket size and the maximum position are positive.
    if bucket_size > 0 and max_position > 0:
        rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)

    rel_pos_ids = tf.expand_dims(rel_pos_ids[:query_size, :], axis=0)
    return tf.cast(rel_pos_ids, tf.int64)


# 扩展相对位置张量以匹配查询层
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
    """Broadcast `c2p_pos` to the first three dims of `query_layer` plus the last dim of `relative_pos`."""
    query_dims = shape_list(query_layer)
    target_shape = [query_dims[0], query_dims[1], query_dims[2], shape_list(relative_pos)[-1]]
    return tf.broadcast_to(c2p_pos, target_shape)


# 扩展相对位置张量以匹配键层
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
    """Broadcast `c2p_pos` to the first two dims of `query_layer` followed by the key length twice."""
    query_dims = shape_list(query_layer)
    key_length = shape_list(key_layer)[-2]
    target_shape = [query_dims[0], query_dims[1], key_length, key_length]
    return tf.broadcast_to(c2p_pos, target_shape)


# 扩展位置索引以匹配关键层
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
    """Broadcast `pos_index` to the first two dims of `p2c_att`, its own second-to-last dim and the key length."""
    leading_dims = shape_list(p2c_att)[:2]
    trailing_dims = [shape_list(pos_index)[-2], shape_list(key_layer)[-2]]
    return tf.broadcast_to(pos_index, leading_dims + trailing_dims)


# 沿着轴取出张量的元素
def take_along_axis(x, indices):
    """
    Gather values of `x` along its last axis at `indices`.

    Only a valid port of `np.take_along_axis` when the gather axis is -1.
    """
    # TPU and gather ops may not work well together -- see
    # https://github.com/huggingface/transformers/issues/18239
    on_tpu = isinstance(tf.distribute.get_strategy(), tf.distribute.TPUStrategy)
    if not on_tpu:
        # Off TPU, a batched gather replaces the one-hot tensor and matrix multiply.
        return tf.gather(x, indices, batch_dims=2)

    # On TPU the gather is expressed as a one-hot encoding contracted with `x`.
    one_hot_indices = tf.one_hot(indices, depth=x.shape[-1], dtype=x.dtype)
    # Loose notation: [B, S, P, D] . [B, S, D] = [B, S, P]
    return tf.einsum("ijkl,ijl->ijk", one_hot_indices, x)
class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer):
    """
    Disentangled self-attention module

    Parameters:
        config (`DebertaV2Config`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaV2Config`]

    """

    # NOTE(review): the constructor and the forward-pass body of this class are not part of this
    # listing; `build` below only touches sub-layers that actually exist on the instance.

    def transpose_for_scores(self, tensor: tf.Tensor, attention_heads: int) -> tf.Tensor:
        """Split the last dimension into `attention_heads` heads and fold the heads into the first dimension."""
        tensor_shape = shape_list(tensor)
        # In graph mode we can't reshape with -1 as the final dimension if the first dimension
        # (batch size) is None, so the target shape is spelled out explicitly.
        shape = tensor_shape[:-1] + [attention_heads, tensor_shape[-1] // attention_heads]
        # [batch_size, seq_length, all_head_size] -> [batch_size, seq_length, heads, head_size]
        tensor = tf.reshape(tensor=tensor, shape=shape)
        # Swap the sequence and head axes.
        tensor = tf.transpose(tensor, perm=[0, 2, 1, 3])
        x_shape = shape_list(tensor)
        # Merge the two leading axes into one.
        tensor = tf.reshape(tensor, shape=[-1, x_shape[-2], x_shape[-1]])
        return tensor

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        query_states: tf.Tensor = None,
        relative_pos: tf.Tensor = None,
        rel_embeddings: tf.Tensor = None,
        output_attentions: bool = False,
        training: bool = False,
    ):
        """Forward pass of the layer.

        The implementation was elided from this listing. An explicit placeholder keeps the class
        syntactically valid and fails loudly instead of silently returning None.
        """
        raise NotImplementedError(
            "TFDebertaV2DisentangledSelfAttention.call: implementation is not included in this listing"
        )

    def build(self, input_shape=None):
        """Build whichever projection and dropout sub-layers exist, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # (attribute name, whether the sub-layer is built with a [None, None, hidden_size] shape)
        sublayer_specs = (
            ("query_proj", True),
            ("key_proj", True),
            ("value_proj", True),
            ("dropout", False),
            ("pos_dropout", False),
            ("pos_key_proj", True),
            ("pos_query_proj", True),
        )
        for attr_name, takes_hidden_shape in sublayer_specs:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                # The config is only read when a projection layer is actually present.
                sublayer.build([None, None, self.config.hidden_size] if takes_hidden_shape else None)

# 从transformers.models.deberta.modeling_tf_deberta.TFDebertaEmbeddings复制而来 Deberta->DebertaV2
class TFDebertaV2Embeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        # The embedding width falls back to hidden_size when the config has no embedding_size.
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
        self.hidden_size = config.hidden_size
        self.max_position_embeddings = config.max_position_embeddings
        # Position embeddings are added to the input unless the config turns this off.
        self.position_biased_input = getattr(config, "position_biased_input", True)
        self.initializer_range = config.initializer_range

        # A bias-free projection is only needed when the embedding width differs from hidden_size.
        if self.embedding_size != config.hidden_size:
            self.embed_proj = keras.layers.Dense(
                config.hidden_size,
                kernel_initializer=get_initializer(config.initializer_range),
                name="embed_proj",
                use_bias=False,
            )

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout")

    def build(self, input_shape=None):
        """Create the embedding tables and build the sub-layers; `input_shape` is not read."""
        # Check the flag before creating any weight so that a repeated call cannot add the
        # embedding tables a second time.
        if self.built:
            return
        self.built = True

        with tf.name_scope("word_embeddings"):
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("token_type_embeddings"):
            if self.config.type_vocab_size > 0:
                self.token_type_embeddings = self.add_weight(
                    name="embeddings",
                    shape=[self.config.type_vocab_size, self.embedding_size],
                    initializer=get_initializer(self.initializer_range),
                )
            else:
                self.token_type_embeddings = None

        with tf.name_scope("position_embeddings"):
            if self.position_biased_input:
                self.position_embeddings = self.add_weight(
                    name="embeddings",
                    shape=[self.max_position_embeddings, self.hidden_size],
                    initializer=get_initializer(self.initializer_range),
                )
            else:
                self.position_embeddings = None

        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])

        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)

        if getattr(self, "embed_proj", None) is not None:
            with tf.name_scope(self.embed_proj.name):
                self.embed_proj.build([None, None, self.embedding_size])

    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        token_type_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
        mask: tf.Tensor = None,
        training: bool = False,
    ) -> tf.Tensor:
        """Layer entry point; forwards every argument by keyword to `apply_embeddings`."""
        return self.apply_embeddings(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            mask=mask,
            training=training,
        )

    def apply_embeddings(
        self,
        input_ids: Optional[tf.Tensor] = None,
        inputs_embeds: Optional[tf.Tensor] = None,
        token_type_ids: Optional[tf.Tensor] = None,
        position_ids: Optional[tf.Tensor] = None,
        mask: Optional[tf.Tensor] = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.

        Raises:
            ValueError: if neither `input_ids` nor `inputs_embeds` is provided.
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("Need to provide either `input_ids` or `inputs_embeds`.")

        # When ids are given they are bounds-checked and looked up; this replaces any `inputs_embeds`.
        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        # Every dimension except the embedding dimension.
        input_shape = shape_list(inputs_embeds)[:-1]

        # Missing token types default to all zeros.
        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        # Missing position ids default to 0..length-1 with a leading axis of size 1.
        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        final_embeddings = inputs_embeds

        if self.position_biased_input:
            position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
            final_embeddings += position_embeds

        if self.config.type_vocab_size > 0:
            token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
            final_embeddings += token_type_embeds

        if self.embedding_size != self.hidden_size:
            final_embeddings = self.embed_proj(final_embeddings)

        final_embeddings = self.LayerNorm(final_embeddings)

        if mask is not None:
            # Bring the mask to the rank of the embeddings before multiplying.
            if len(shape_list(mask)) != len(shape_list(final_embeddings)):
                if len(shape_list(mask)) == 4:
                    mask = tf.squeeze(tf.squeeze(mask, axis=1), axis=1)
                mask = tf.cast(tf.expand_dims(mask, axis=2), tf.float32)

            final_embeddings = final_embeddings * mask

        final_embeddings = self.dropout(final_embeddings, training=training)

        return final_embeddings
# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaPredictionHeadTransform 复制而来，将 Deberta->DebertaV2
class TFDebertaV2PredictionHeadTransform(keras.layers.Layer):
    """Dense projection, activation and layer normalization applied before the LM output projection."""

    def __init__(self, config: DebertaV2Config, **kwargs):
        super().__init__(**kwargs)

        # Output width of the projection; `hidden_size` is used when the config has no `embedding_size`.
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)

        # Projection from the hidden states to `embedding_size` features.
        self.dense = keras.layers.Dense(
            units=self.embedding_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # `hidden_act` is either an activation name (resolved through `get_tf_activation`) or
        # an object that is used as the activation directly.
        activation = config.hidden_act
        self.transform_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

        # Normalization applied to the activated projection.
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return `LayerNorm(act(dense(hidden_states)))`."""
        projected = self.dense(inputs=hidden_states)
        activated = self.transform_act_fn(projected)
        return self.LayerNorm(activated)

    def build(self, input_shape=None):
        """Build the sub-layers once, each inside a name scope carrying its own layer name."""
        if self.built:
            return
        self.built = True

        # (sub-layer, size of its last input dimension): the dense layer is built for
        # `hidden_size` features, the layer norm for `embedding_size` features.
        pending = (
            (getattr(self, "dense", None), self.config.hidden_size),
            (getattr(self, "LayerNorm", None), self.embedding_size),
        )
        for sublayer, feature_dim in pending:
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, feature_dim])


# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaLMPredictionHead 复制而来，将 Deberta->DebertaV2
class TFDebertaV2LMPredictionHead(keras.layers.Layer):
    """Language-modeling head that scores transformed hidden states against the input-embedding matrix."""

    def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        # Width of the embedding space; `hidden_size` is used when the config has no `embedding_size`.
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)

        # Transform applied to the hidden states before the vocabulary projection.
        self.transform = TFDebertaV2PredictionHeadTransform(config, name="transform")

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.input_embeddings = input_embeddings

    def build(self, input_shape=None):
        """Create the output bias and build the transform sub-layer; safe to call more than once."""
        if self.built:
            return
        self.built = True

        # The bias is created after the `built` guard so that a repeated `build()` call cannot
        # register a second "bias" variable and rebind `self.bias` to a freshly zeroed one.
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

        if getattr(self, "transform", None) is not None:
            with tf.name_scope(self.transform.name):
                self.transform.build(None)

    def get_output_embeddings(self) -> keras.layers.Layer:
        """Return the layer whose `weight` is used for the output projection (the input embeddings)."""
        return self.input_embeddings

    def set_output_embeddings(self, value: tf.Variable):
        """Replace the shared embedding weight and record its first dimension as the vocabulary size."""
        self.input_embeddings.weight = value
        self.input_embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self) -> Dict[str, tf.Variable]:
        """Return the output bias as a dict with the single key `"bias"`."""
        return {"bias": self.bias}

    def set_bias(self, value: Dict[str, tf.Variable]):
        """Replace the output bias from `value["bias"]` and update `config.vocab_size` to its length."""
        self.bias = value["bias"]
        self.config.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Compute vocabulary logits reshaped to `[-1, seq_length, vocab_size]`."""
        hidden_states = self.transform(hidden_states=hidden_states)
        seq_length = shape_list(hidden_states)[1]
        # Flatten to 2-D so a single matmul against the transposed embedding weight yields the logits.
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)

        return hidden_states
# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaOnlyMLMHead 复制而来，将 Deberta 替换为 DebertaV2
class TFDebertaV2OnlyMLMHead(keras.layers.Layer):
    """Thin wrapper that exposes the LM prediction head under the sub-layer name "predictions"."""

    def __init__(self, config: DebertaV2Config, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)
        # Prediction head constructed from the config and the given input-embedding layer.
        self.predictions = TFDebertaV2LMPredictionHead(config, input_embeddings, name="predictions")

    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
        """Return the prediction scores produced by the wrapped head for `sequence_output`."""
        return self.predictions(hidden_states=sequence_output)

    def build(self, input_shape=None):
        """Build the wrapped head once, inside a name scope carrying its layer name."""
        if self.built:
            return
        self.built = True

        head = getattr(self, "predictions", None)
        if head is None:
            return
        with tf.name_scope(head.name):
            head.build(None)


# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaMainLayer 复制而来，将 Deberta 替换为 DebertaV2
class TFDebertaV2MainLayer(keras.layers.Layer):
    """Core DeBERTa-v2 stack: an embeddings layer followed by the encoder."""

    config_class = DebertaV2Config

    def __init__(self, config: DebertaV2Config, **kwargs):
        super().__init__(**kwargs)

        self.config = config

        # Sub-layers are created from the same config; their names fix the variable scopes.
        self.embeddings = TFDebertaV2Embeddings(config, name="embeddings")
        self.encoder = TFDebertaV2Encoder(config, name="encoder")

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embeddings sub-layer."""
        return self.embeddings

    def set_input_embeddings(self, value: tf.Variable):
        """Replace the embedding weight and record its first dimension as the vocabulary size."""
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # Head pruning is not implemented for this layer.
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """
        Run the embeddings and the encoder on the given inputs.

        Exactly one of `input_ids` and `inputs_embeds` must be provided. A missing `attention_mask` is replaced by
        all ones and a missing `token_type_ids` by all zeros, both shaped like the input.

        Returns:
            A tuple `(sequence_output, *extra_encoder_outputs)` when `return_dict` is falsy, otherwise a
            `TFBaseModelOutput`.

        Raises:
            ValueError: if both or neither of `input_ids` and `inputs_embeds` are given.
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            # Drop the trailing feature dimension so the shape matches that of `input_ids`.
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Default mask: every position is attended to.
        if attention_mask is None:
            attention_mask = tf.fill(dims=input_shape, value=1)

        # Default token types: every position belongs to type 0.
        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            mask=attention_mask,
            training=training,
        )

        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # The first encoder output is the sequence output.
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return TFBaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the embeddings and encoder once, each inside a name scope carrying its layer name."""
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
# 从 transformers.models.deberta.modeling_tf_deberta.TFDebertaPreTrainedModel 复制的代码，将 Deberta->DebertaV2
class TFDebertaV2PreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = DebertaV2Config
    # Prefix of the base model; it matches the "deberta" attribute/layer name that the model classes in this
    # file give to their `TFDebertaV2MainLayer`.
    base_model_prefix = "deberta"


# DEBERTA_START_DOCSTRING 的原始文档字符串
DEBERTA_START_DOCSTRING = r"""
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# NOTE(review): the input-argument docstring template `DEBERTA_INPUTS_DOCSTRING`, which the `call` methods of the
# model classes below pass to `add_start_docstrings_to_model_forward`, is not defined in this part of the file.
# Confirm that it is defined before its first use.
#
# `add_start_docstrings` is a decorator that adds a docstring template to a class; for the bare model it describes
# the DeBERTa transformer outputting raw hidden states without a specific head.
#
# `TFDebertaV2Model` (below) inherits from `TFDebertaV2PreTrainedModel`. It is initialized with a configuration
# object and optional inputs, and creates a `TFDebertaV2MainLayer` named "deberta".
#
# `unpack_inputs` is a decorator that presumably unpacks and prepares the inputs before they reach `call`
# (its implementation is not shown here).
class TFDebertaV2Model(TFDebertaV2PreTrainedModel):
    # Bare DeBERTa-v2 model: forwards every argument to the main layer and returns its output unchanged.
    def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
        # The base class receives the configuration together with any extra positional/keyword inputs.
        super().__init__(config, *inputs, **kwargs)

        # Main layer holding the embeddings and the encoder.
        self.deberta = TFDebertaV2MainLayer(config, name="deberta")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # No docstring is written here on purpose: the decorators above assemble the documentation of `call`.
    def call(
        self,
        input_ids: TFModelInputType | None = None,  # token ids
        attention_mask: np.ndarray | tf.Tensor | None = None,  # attention mask
        token_type_ids: np.ndarray | tf.Tensor | None = None,  # token-type ids
        position_ids: np.ndarray | tf.Tensor | None = None,  # position ids
        inputs_embeds: np.ndarray | tf.Tensor | None = None,  # embeddings supplied instead of `input_ids`
        output_attentions: Optional[bool] = None,  # whether to return attention weights
        output_hidden_states: Optional[bool] = None,  # whether to return hidden states
        return_dict: Optional[bool] = None,  # whether to return an output object instead of a tuple
        training: Optional[bool] = False,  # training-mode flag, forwarded to the main layer
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        # Pure delegation: the main layer's result is returned as-is.
        return self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        # Build only once; later calls are no-ops.
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "deberta", None)
        if main_layer is None:
            return
        # The main layer is built inside a name scope carrying its own layer name.
        with tf.name_scope(main_layer.name):
            main_layer.build(None)
# 给 TFDebertaV2ForMaskedLM 类添加文档字符串，描述其作为 DeBERTa 模型的一个扩展，包含语言建模头部
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaForMaskedLM with Deberta->DebertaV2
class TFDebertaV2ForMaskedLM(TFDebertaV2PreTrainedModel, TFMaskedLanguageModelingLoss):
    def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # Only a warning is emitted when the config is flagged as a decoder; construction continues.
        if config.is_decoder:
            logger.warning(
                "If you want to use `TFDebertaV2ForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        # Main transformer layer.
        self.deberta = TFDebertaV2MainLayer(config, name="deberta")
        # MLM head; it receives the main layer's embeddings as its input-embedding layer and is named "cls".
        self.mlm = TFDebertaV2OnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls")

    def get_lm_head(self) -> keras.layers.Layer:
        # The language-modeling head is the `predictions` sub-layer of the MLM head.
        return self.mlm.predictions

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first output of the main layer is the sequence output.
        sequence_output = outputs[0]
        prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
        # The loss is computed only when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)

        if not return_dict:
            # The main layer returns `(sequence_output, *extras)` with no pooled output in between, so the
            # extras start at index 1. Slicing from index 2 would silently drop the first extra output.
            output = (prediction_scores,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # Build only once; each sub-layer is built inside a name scope carrying its own layer name.
        if self.built:
            return
        self.built = True
        if getattr(self, "deberta", None) is not None:
            with tf.name_scope(self.deberta.name):
                self.deberta.build(None)
        if getattr(self, "mlm", None) is not None:
            with tf.name_scope(self.mlm.name):
                self.mlm.build(None)
"""
DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
"""

# 从transformers.models.deberta.modeling_tf_deberta.TFDebertaForSequenceClassification复制而来，将Deberta改为DebertaV2
class TFDebertaV2ForSequenceClassification(TFDebertaV2PreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels

        # Main transformer layer and the context pooler applied to its sequence output.
        self.deberta = TFDebertaV2MainLayer(config, name="deberta")
        self.pooler = TFDebertaV2ContextPooler(config, name="pooler")

        # Classifier dropout rate: `config.cls_dropout` when it is set, otherwise `hidden_dropout_prob`.
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = TFDebertaV2StableDropout(drop_out, name="cls_dropout")

        # Linear layer producing `num_labels` logits.
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )

        # Feature size reported by the pooler; used as the classifier's input size in `build`.
        self.output_dim = self.pooler.output_dim

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # Forward pass: main layer -> pooler -> dropout -> classifier, plus an optional loss.
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first output of the main layer is the sequence output.
        sequence_output = outputs[0]
        pooled_output = self.pooler(sequence_output, training=training)
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        # The loss is computed only when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            # Tuple form: logits followed by the extra outputs of the main layer, with the loss first if present.
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # Build only once; each sub-layer is built inside a name scope carrying its own layer name.
        if self.built:
            return
        self.built = True
        if getattr(self, "deberta", None) is not None:
            with tf.name_scope(self.deberta.name):
                self.deberta.build(None)
        if getattr(self, "pooler", None) is not None:
            with tf.name_scope(self.pooler.name):
                self.pooler.build(None)
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # The classifier's last input dimension is the pooler's reported output size.
                self.classifier.build([None, None, self.output_dim])
@add_start_docstrings(
    """
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
# Adapted from transformers.models.deberta.modeling_tf_deberta.TFDebertaForTokenClassification for DeBERTaV2
class TFDebertaV2ForTokenClassification(TFDebertaV2PreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # Number of labels produced per token.
        self.num_labels = config.num_labels

        # Main transformer layer.
        self.deberta = TFDebertaV2MainLayer(config, name="deberta")
        # Dropout applied to the sequence output before classification.
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # Linear layer mapping each token's hidden state to `num_labels` logits.
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        # Kept for `build`, which reads `hidden_size` from it.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # Forward pass: main layer -> dropout -> classifier, plus an optional loss.
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # The first output of the main layer is the sequence output; dropout is applied before classifying.
        token_states = self.dropout(outputs[0], training=training)
        logits = self.classifier(inputs=token_states)

        # The loss is computed only when labels are supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels=labels, logits=logits)

        if return_dict:
            return TFTokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: logits followed by the extra outputs of the main layer, with the loss first if present.
        output = (logits,) + outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    def build(self, input_shape=None):
        # Build only once; later calls are no-ops.
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "deberta", None)
        if main_layer is not None:
            with tf.name_scope(main_layer.name):
                main_layer.build(None)

        head = getattr(self, "classifier", None)
        if head is not None:
            with tf.name_scope(head.name):
                # The classifier's last input dimension is `hidden_size`.
                head.build([None, None, self.config.hidden_size])
@add_start_docstrings(
    """
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    DEBERTA_START_DOCSTRING,
)
# Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaForQuestionAnswering with Deberta->DebertaV2
class TFDebertaV2ForQuestionAnswering(TFDebertaV2PreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
        """Create the DeBERTa main layer and the dense span-prediction head.

        Args:
            config: Model configuration; `num_labels` and `initializer_range` are read here.
            *inputs: Positional arguments forwarded to the parent constructor.
            **kwargs: Keyword arguments forwarded to the parent constructor.
        """
        super().__init__(config, *inputs, **kwargs)

        # Number of output units of the QA head.
        self.num_labels = config.num_labels

        # Shared DeBERTa main layer.
        self.deberta = TFDebertaV2MainLayer(config, name="deberta")

        # Dense head whose output is later split into start and end logits.
        self.qa_outputs = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )

        # Kept so that `build` can read `hidden_size`.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
        r"""
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # NOTE: `**kwargs` is accepted for signature compatibility only; it is not forwarded to the encoder.
        # Run the DeBERTa main layer on the inputs.
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first element of the encoder output is the sequence output.
        sequence_output = outputs[0]
        # Project the sequence output through the QA head.
        logits = self.qa_outputs(inputs=sequence_output)
        # Split the last axis into two equal halves: start logits and end logits.
        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
        # Drop the trailing axis of each half; this only succeeds when that axis has size 1,
        # i.e. when the head was created with `config.num_labels == 2`.
        start_logits = tf.squeeze(input=start_logits, axis=-1)
        end_logits = tf.squeeze(input=end_logits, axis=-1)
        loss = None

        # The loss is computed only when both span labels are provided.
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))

        # Tuple output: optional loss, then the logits, then the remaining encoder outputs.
        if not return_dict:
            # NOTE(review): this skips outputs[1]; the token-classification head in this file
            # uses outputs[1:] -- confirm which index is intended.
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # Structured output.
        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the sub-layers once, each under its own name scope.

        Args:
            input_shape: Not read by this method.
        """
        # Building is a one-time operation; later calls are no-ops.
        if self.built:
            return
        self.built = True
        # Build the DeBERTa main layer with an unspecified input shape.
        if getattr(self, "deberta", None) is not None:
            with tf.name_scope(self.deberta.name):
                self.deberta.build(None)
        # Build the QA head; its last input dimension is config.hidden_size.
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])


@add_start_docstrings(
    """
    DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    DEBERTA_START_DOCSTRING,
)
class TFDebertaV2ForMultipleChoice(TFDebertaV2PreTrainedModel, TFMultipleChoiceLoss):
    """
    DeBERTa V2 model for multiple choice tasks. Extends TFDebertaV2PreTrainedModel and TFMultipleChoiceLoss.

    This class defines a model architecture with a DeBERTa V2 main layer, dropout, context pooler, and a dense
    classifier layer for multiple choice classification tasks.
    """

    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    # _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
    # _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config: DebertaV2Config, *inputs, **kwargs):
        """
        Initializes TFDebertaV2ForMultipleChoice.

        Args:
            config (DebertaV2Config): The model configuration; `hidden_dropout_prob` and `initializer_range` are
                read here.
            *inputs: Variable length argument list passed to the parent constructor.
            **kwargs: Additional keyword arguments passed to the parent constructor.
        """
        super().__init__(config, *inputs, **kwargs)

        # DeBERTa V2 main layer.
        self.deberta = TFDebertaV2MainLayer(config, name="deberta")
        # Dropout applied to the pooled output before classification.
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # Context pooler that reduces the sequence output to one vector per sequence.
        self.pooler = TFDebertaV2ContextPooler(config, name="pooler")
        # Single-unit dense layer: one score per (example, choice) pair.
        self.classifier = keras.layers.Dense(
            units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        # Output width of the pooler; used as the classifier's input width in `build`.
        self.output_dim = self.pooler.output_dim

    @unpack_inputs
    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
        """
        # NOTE: `**kwargs` is accepted for signature compatibility only; it is not forwarded to the encoder.
        # Read the number of choices (axis 1) and the sequence length (axis 2) from whichever input is given.
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

        # Merge the leading axes so every choice is encoded as an independent sequence.
        flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = (
            tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
        )
        flat_token_type_ids = (
            tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
        )
        flat_position_ids = (
            tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None
        )
        # Embeddings keep their trailing feature axis (axis 3) when flattened.
        flat_inputs_embeds = (
            tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )

        # Run the DeBERTa main layer on the flattened inputs.
        outputs = self.deberta(
            input_ids=flat_input_ids,
            attention_mask=flat_attention_mask,
            token_type_ids=flat_token_type_ids,
            position_ids=flat_position_ids,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # The first element of the encoder output is the sequence output.
        sequence_output = outputs[0]

        # Pool the sequence output, then regularize it with dropout.
        pooled_output = self.pooler(sequence_output, training=training)
        pooled_output = self.dropout(pooled_output, training=training)

        # One score per flattened (example, choice) row.
        logits = self.classifier(pooled_output)

        # Regroup the scores so each row holds the `num_choices` scores of one example.
        reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))

        # The loss is computed only when labels are provided.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)

        # Tuple output: optional loss, then the logits, then the remaining encoder outputs.
        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # Structured output.
        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the sub-layers once, each under its own name scope.

        Args:
            input_shape: Not read by this method.
        """
        # Building is a one-time operation; later calls are no-ops.
        if self.built:
            return
        self.built = True

        # Build the DeBERTa main layer with an unspecified input shape.
        if getattr(self, "deberta", None) is not None:
            with tf.name_scope(self.deberta.name):
                self.deberta.build(None)

        # Build the context pooler.
        if getattr(self, "pooler", None) is not None:
            with tf.name_scope(self.pooler.name):
                self.pooler.build(None)

        # Build the classifier under its own name scope, like the other sub-layers,
        # so its variables are created inside the "classifier" scope.
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.output_dim])

Transformers-源码解析-三十三-

Transformers 源码解析（三十三）

.\models\deberta\modeling_tf_deberta.py

.\models\deberta\tokenization_deberta.py

.\models\deberta\tokenization_deberta_fast.py

.\models\deberta\__init__.py

.\models\deberta_v2\configuration_deberta_v2.py

.\models\deberta_v2\modeling_deberta_v2.py

.\models\deberta_v2\modeling_tf_deberta_v2.py

`.\models\deberta\modeling_tf_deberta.py`

`.\models\deberta\tokenization_deberta.py`

`.\models\deberta\tokenization_deberta_fast.py`

`.\models\deberta\__init__.py`

`.\models\deberta_v2\configuration_deberta_v2.py`

`.\models\deberta_v2\modeling_deberta_v2.py`

`.\models\deberta_v2\modeling_tf_deberta_v2.py`