Transformers 源码解析（九十八）

`.\models\roformer\modeling_tf_roformer.py`

# 导入所需模块和库
import math
from typing import Dict, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

# 从内部模块导入函数和类
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFBaseModelOutputWithPooling,
    TFCausalLMOutput,
    TFMaskedLMOutput,
    TFMultipleChoiceModelOutput,
    TFQuestionAnsweringModelOutput,
    TFSequenceClassifierOutput,
    TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
    TFCausalLanguageModelingLoss,
    TFMaskedLanguageModelingLoss,
    TFModelInputType,
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
    TFQuestionAnsweringLoss,
    TFSequenceClassificationLoss,
    TFSequenceSummary,
    TFTokenClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_roformer import RoFormerConfig

# Module-level logger obtained through the library's logging helper.
logger = logging.get_logger(__name__)

# Checkpoint name and config class name used in the documentation.
_CHECKPOINT_FOR_DOC = "junnyu/roformer_chinese_base"
_CONFIG_FOR_DOC = "RoFormerConfig"

# Archive list of pretrained RoFormer model identifiers.
TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "junnyu/roformer_chinese_small",
    "junnyu/roformer_chinese_base",
    "junnyu/roformer_chinese_char_small",
    "junnyu/roformer_chinese_char_base",
    "junnyu/roformer_small_discriminator",
    "junnyu/roformer_small_generator",
    # See more RoFormer models at https://huggingface.co/models?filter=roformer
]

class TFRoFormerSinusoidalPositionalEmbedding(keras.layers.Layer):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, **kwargs):
        """
        Args:
            num_positions (`int`): number of rows of the position table.
            embedding_dim (`int`): width of every row; must be even.

        Raises:
            NotImplementedError: if `embedding_dim` is odd.
        """
        super().__init__(**kwargs)

        # The table is split into a sin half and a cos half, so an odd width is rejected.
        if embedding_dim % 2 != 0:
            raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported")

        self.embedding_dim = embedding_dim
        self.num_positions = num_positions

    def build(self, input_shape: tf.TensorShape):
        """
        Create the `embeddings` weight and fill it with the sinusoidal table.

        Shared weights logic adapted from
        https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        weight = self._init_weight(self.num_positions, self.embedding_dim)

        self.weight = self.add_weight(
            name="embeddings",
            shape=[self.num_positions, self.embedding_dim],
        )
        # The table is produced by NumPy; cast it to the variable's dtype before assigning.
        weight = tf.cast(weight, dtype=self.weight.dtype)
        self.weight.assign(weight)

        super().build(input_shape)

    @staticmethod
    def _init_weight(n_pos: int, dim: int):
        """
        Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
        the 2nd half of the vector. [dim // 2:]
        """
        # position_enc[pos, j] = pos / 10000^(2 * (j // 2) / dim)
        position_enc = np.array(
            [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
        )
        table = np.zeros_like(position_enc)
        # First half of every row: sin of the even-indexed angle columns.
        table[:, 0 : dim // 2] = np.sin(position_enc[:, 0::2])
        # Second half of every row: cos of the odd-indexed angle columns.
        table[:, dim // 2 :] = np.cos(position_enc[:, 1::2])
        table = tf.convert_to_tensor(table)
        # tf.stop_gradient returns a new tensor; its result has to be used for the call to have any effect.
        return tf.stop_gradient(table)

    def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0):
        """
        Look up the position rows for a sequence.

        Input is expected to be of size [bsz x seqlen]; only the sequence length is used.

        Args:
            input_shape: shape whose second entry is the sequence length.
            past_key_values_length (`int`): offset of the first position to return.

        Returns:
            The rows `past_key_values_length ... past_key_values_length + seq_len - 1` of the position table.
        """
        _, seq_len = input_shape[:2]

        positions = tf.range(past_key_values_length, seq_len + past_key_values_length, delta=1, name="range")
        return tf.gather(self.weight, positions)
class TFRoFormerEmbeddings(keras.layers.Layer):
    """
    Construct the embeddings from word and token_type embeddings.

    No position embedding is added here; the encoder hands sinusoidal positions to the attention layers instead.
    """

    def __init__(self, config: RoFormerConfig, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.embedding_size = config.embedding_size
        self.initializer_range = config.initializer_range

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def build(self, input_shape=None):
        """Create the word and token-type embedding tables and build the LayerNorm sub-layer."""
        # Check the guard first so that a repeated build() call cannot register the embedding tables twice.
        if self.built:
            return
        self.built = True

        with tf.name_scope("word_embeddings"):
            # Word embedding table of shape [vocab_size, embedding_size].
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("token_type_embeddings"):
            # Token-type embedding table of shape [type_vocab_size, embedding_size].
            self.token_type_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.config.type_vocab_size, self.embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.embedding_size])

    def call(
        self,
        input_ids: tf.Tensor = None,
        token_type_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Args:
            input_ids (`tf.Tensor`): token ids looked up in the word embedding table. When given, the lookup result
                replaces any `inputs_embeds` that was passed.
            token_type_ids (`tf.Tensor`): token-type ids; filled with zeros when omitted.
            inputs_embeds (`tf.Tensor`): precomputed embeddings, used when `input_ids` is `None`.
            training (`bool`): forwarded to the dropout layer.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            # Reject ids outside [0, vocab_size) before gathering.
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        # Every axis except the trailing embedding axis.
        input_shape = shape_list(inputs_embeds)[:-1]

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)

        final_embeddings = inputs_embeds + token_type_embeds
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings
    def __init__(self, config: RoFormerConfig, **kwargs):
        """
        Set up the query/key/value projections and the attention dropout.

        NOTE(review): no `class` statement precedes this method in the visible source; `TFRoFormerAttention`
        instantiates `TFRoFormerSelfAttention`, which this method presumably belongs to - confirm.

        Raises:
            ValueError: if `config.hidden_size` is not a multiple of `config.num_attention_heads`.
        """
        super().__init__(**kwargs)

        hidden_size = config.hidden_size
        head_count = config.num_attention_heads
        if hidden_size % head_count != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
                f"of attention heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = head_count
        self.attention_head_size = int(hidden_size / head_count)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # Cached divisor for scaling the attention scores.
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

        # One Dense projection per role, created in query -> key -> value order.
        for role in ("query", "key", "value"):
            projection = keras.layers.Dense(
                units=self.all_head_size,
                kernel_initializer=get_initializer(config.initializer_range),
                name=role,
            )
            setattr(self, role, projection)

        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
        # Whether the rotary embedding is applied to the value tensor as well.
        self.rotary_value = config.rotary_value
        self.config = config
    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        """Split the last axis into attention heads and move the head axis in front of the sequence axis."""
        # [batch_size, seq_length, all_head_size] -> [batch_size, seq_length, num_attention_heads, attention_head_size]
        per_head_shape = (batch_size, -1, self.num_attention_heads, self.attention_head_size)
        split_heads = tf.reshape(tensor=tensor, shape=per_head_shape)

        # -> [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(split_heads, perm=[0, 2, 1, 3])

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        sinusoidal_pos: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Run scaled dot-product self-attention with optional rotary position embeddings.

        Args:
            hidden_states (`tf.Tensor`): input fed to the query, key and value projections.
            attention_mask (`tf.Tensor`): added to the attention scores when not `None`.
            sinusoidal_pos (`tf.Tensor`): rotary position features; skipped when `None`.
            head_mask (`tf.Tensor`): multiplied into the attention probabilities when not `None`.
            output_attentions (`bool`): whether the attention probabilities are returned as well.
            training (`bool`): forwarded to the dropout layer.

        Returns:
            `(attention_output,)`, or `(attention_output, attention_probs)` when `output_attentions` is true.
        """
        batch_size = shape_list(hidden_states)[0]

        # Project the input, then reshape each projection to one slice per head.
        query_layer = self.transpose_for_scores(self.query(inputs=hidden_states), batch_size)
        key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
        value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)

        if sinusoidal_pos is not None:
            # The value tensor is only rotated when the config asks for it.
            if self.rotary_value:
                rotary_inputs = (query_layer, key_layer, value_layer)
            else:
                rotary_inputs = (query_layer, key_layer)
            rotated = self.apply_rotary_position_embeddings(sinusoidal_pos, *rotary_inputs)
            if self.rotary_value:
                query_layer, key_layer, value_layer = rotated
            else:
                query_layer, key_layer = rotated

        # Raw scores have shape (batch size, num_heads, seq_len_q, seq_len_k).
        raw_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        scale = tf.cast(self.sqrt_att_head_size, dtype=raw_scores.dtype)
        attention_scores = tf.divide(raw_scores, scale)

        if attention_mask is not None:
            # The mask is additive.
            attention_scores = tf.add(attention_scores, attention_mask)

        # Normalize the scores to probabilities, then apply dropout to them.
        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
        attention_probs = self.dropout(inputs=attention_probs, training=training)

        if head_mask is not None:
            attention_probs = tf.multiply(attention_probs, head_mask)

        # Weighted sum of the values, then merge the heads back into one axis:
        # (batch_size, seq_len_q, all_head_size)
        context = tf.matmul(attention_probs, value_layer)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        attention_output = tf.reshape(tensor=context, shape=(batch_size, -1, self.all_head_size))

        if output_attentions:
            return (attention_output, attention_probs)
        return (attention_output,)
    # 应用旋转位置嵌入到查询、键、值的层中
    @staticmethod
    def apply_rotary_position_embeddings(sinusoidal_pos, query_layer, key_layer, value_layer=None):
        """
        Apply rotary position embeddings to the query and key tensors, and to the value tensor when given.

        This is a static method because the signature has no `self` and callers invoke it as
        `self.apply_rotary_position_embeddings(sinusoidal_pos, query_layer, key_layer[, value_layer])`. Without the
        decorator the instance would be bound to `sinusoidal_pos` and every argument would shift by one.

        Args:
            sinusoidal_pos: tensor whose last axis is split in two halves, the first used as sin features and the
                second as cos features.
            query_layer: query tensor to rotate.
            key_layer: key tensor to rotate.
            value_layer: optional value tensor to rotate.

        Returns:
            `(query_layer, key_layer)`, or `(query_layer, key_layer, value_layer)` when `value_layer` is not `None`.
        """
        sin, cos = tf.split(sinusoidal_pos, num_or_size_splits=2, axis=-1)
        # Repeat every feature twice so that positions 2i and 2i + 1 of the last axis share one angle.
        sin_pos = tf.repeat(sin, 2, axis=-1)
        cos_pos = tf.repeat(cos, 2, axis=-1)

        def _rotate(layer):
            # Pairwise rotation of the last axis: [x0, x1, x2, x3, ...] -> [-x1, x0, -x3, x2, ...]
            rotate_half = tf.stack([-layer[..., 1::2], layer[..., ::2]], axis=-1)
            rotate_half = tf.reshape(rotate_half, shape_list(layer))
            return layer * cos_pos + rotate_half * sin_pos

        query_layer = _rotate(query_layer)
        key_layer = _rotate(key_layer)

        if value_layer is not None:
            return query_layer, key_layer, _rotate(value_layer)

        return query_layer, key_layer

    # 构建模型的方法
    def build(self, input_shape=None):
        """Build the query, key and value projections, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        # All three projections take inputs whose last axis is hidden_size.
        for attr in ("query", "key", "value"):
            projection = getattr(self, attr, None)
            if projection is None:
                continue
            with tf.name_scope(projection.name):
                projection.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RoFormer
# 定义了一个名为 TFRoFormerSelfOutput 的自定义层，用于 RoFormer 模型的自我输出处理

class TFRoFormerSelfOutput(keras.layers.Layer):
    """Dense projection, dropout and residual LayerNorm applied to the self-attention output."""

    def __init__(self, config: RoFormerConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dense(inputs=hidden_states)
        projected = self.dropout(inputs=projected, training=training)
        # Residual connection with the block input, followed by normalization.
        return self.LayerNorm(inputs=projected + input_tensor)

    def build(self, input_shape=None):
        """Build the dense and LayerNorm sub-layers, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        # Both sub-layers see inputs whose last axis is hidden_size.
        for attr in ("dense", "LayerNorm"):
            sublayer = getattr(self, attr, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, self.config.hidden_size])


class TFRoFormerAttention(keras.layers.Layer):
    """Self-attention followed by its output projection with a residual connection."""

    def __init__(self, config: RoFormerConfig, **kwargs):
        super().__init__(**kwargs)

        # The sub-layer names ("self", "output") differ from the attribute names.
        self.self_attention = TFRoFormerSelfAttention(config, name="self")
        self.dense_output = TFRoFormerSelfOutput(config, name="output")

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(
        self,
        input_tensor: tf.Tensor,
        attention_mask: tf.Tensor,
        sinusoidal_pos: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Apply self-attention to `input_tensor` and combine the result with `input_tensor` in the output layer.

        Returns:
            A tuple whose first item is the attention block output, followed by whatever extra items the
            self-attention layer returned (the attention probabilities when `output_attentions` is true).
        """
        attn_result = self.self_attention(
            hidden_states=input_tensor,
            attention_mask=attention_mask,
            sinusoidal_pos=sinusoidal_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            training=training,
        )
        context, *extras = attn_result

        attention_output = self.dense_output(hidden_states=context, input_tensor=input_tensor, training=training)

        return (attention_output, *extras)

    def build(self, input_shape=None):
        """Build both sub-layers, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        for attr in ("self_attention", "dense_output"):
            sublayer = getattr(self, attr, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
# 从 transformers.models.bert.modeling_tf_bert.TFBertIntermediate 复制而来，将 Bert 替换为 RoFormer
class TFRoFormerIntermediate(keras.layers.Layer):
    """Dense expansion to `intermediate_size` followed by the configured activation."""

    def __init__(self, config: RoFormerConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        self.dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # `hidden_act` is either the name of an activation or the activation callable itself.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return `intermediate_act_fn(dense(hidden_states))`."""
        projected = self.dense(inputs=hidden_states)
        return self.intermediate_act_fn(projected)

    def build(self, input_shape=None):
        """Build the dense sub-layer for inputs whose last axis is `hidden_size`."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            dense.build([None, None, self.config.hidden_size])


# 从 transformers.models.bert.modeling_tf_bert.TFBertOutput 复制而来，将 Bert 替换为 RoFormer
class TFRoFormerOutput(keras.layers.Layer):
    """Dense projection back to `hidden_size`, dropout and residual LayerNorm."""

    def __init__(self, config: RoFormerConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dense(inputs=hidden_states)
        projected = self.dropout(inputs=projected, training=training)
        # Residual connection with `input_tensor`, followed by normalization.
        return self.LayerNorm(inputs=projected + input_tensor)

    def build(self, input_shape=None):
        """Build the dense layer (input width `intermediate_size`) and the LayerNorm (width `hidden_size`)."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.intermediate_size])

        layer_norm = getattr(self, "LayerNorm", None)
        if layer_norm is not None:
            with tf.name_scope(layer_norm.name):
                layer_norm.build([None, None, self.config.hidden_size])


# 定义 RoFormer 层，包含注意力层、中间层和输出层
class TFRoFormerLayer(keras.layers.Layer):
    """One encoder layer: attention block, intermediate layer and output layer."""

    def __init__(self, config: RoFormerConfig, **kwargs):
        super().__init__(**kwargs)

        self.attention = TFRoFormerAttention(config, name="attention")
        self.intermediate = TFRoFormerIntermediate(config, name="intermediate")
        # Stored as `roformer_output`, but the sub-layer itself is named "output".
        self.roformer_output = TFRoFormerOutput(config, name="output")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        sinusoidal_pos: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Run the attention block, then the intermediate and output layers.

        Returns:
            A tuple whose first item is the layer output, followed by any extra items returned by the attention
            block (the attention probabilities when `output_attentions` is true).
        """
        attn_result = self.attention(
            input_tensor=hidden_states,
            attention_mask=attention_mask,
            sinusoidal_pos=sinusoidal_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            training=training,
        )
        attention_output, *extras = attn_result

        expanded = self.intermediate(hidden_states=attention_output)
        # The attention output is the residual input of the output layer.
        layer_output = self.roformer_output(hidden_states=expanded, input_tensor=attention_output, training=training)

        return (layer_output, *extras)

    def build(self, input_shape=None):
        """Build the three sub-layers, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        for attr in ("attention", "intermediate", "roformer_output"):
            sublayer = getattr(self, attr, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
# 定义 TFRoFormerEncoder 类，继承自 keras.layers.Layer
class TFRoFormerEncoder(keras.layers.Layer):
    """Stack of `TFRoFormerLayer` modules that share one sinusoidal position table."""

    def __init__(self, config: RoFormerConfig, **kwargs):
        super().__init__(**kwargs)

        # The position table is as wide as one attention head.
        per_head_size = config.hidden_size // config.num_attention_heads
        self.embed_positions = TFRoFormerSinusoidalPositionalEmbedding(
            config.max_position_embeddings,
            per_head_size,
            name="embed_positions",
        )

        self.layer = [TFRoFormerLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """
        Run every layer in order.

        Args:
            hidden_states (`tf.Tensor`): input to the first layer.
            attention_mask (`tf.Tensor`): forwarded unchanged to every layer.
            head_mask: indexable by layer number; entry `i` is forwarded to layer `i`.
            output_attentions (`bool`): collect the attention probabilities of every layer.
            output_hidden_states (`bool`): collect the input of every layer plus the final output.
            return_dict (`bool`): return a `TFBaseModelOutput` instead of a plain tuple.
            training (`bool`): forwarded to every layer.

        Returns:
            A `TFBaseModelOutput`, or a tuple of the values that are not `None` when `return_dict` is false.
        """
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # Two leading axes are added to the table lookup: [1, 1, sequence_length, embed_size_per_head]
        position_rows = self.embed_positions(shape_list(hidden_states)[:-1])
        sinusoidal_pos = position_rows[None, None, :, :]

        for index, layer_module in enumerate(self.layer):
            if output_hidden_states:
                # Record the input of this layer.
                all_hidden_states += (hidden_states,)

            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                sinusoidal_pos=sinusoidal_pos,
                head_mask=head_mask[index],
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        if output_hidden_states:
            # Record the output of the last layer as well.
            all_hidden_states += (hidden_states,)

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
            )

        candidates = (hidden_states, all_hidden_states, all_attentions)
        return tuple(item for item in candidates if item is not None)

    def build(self, input_shape=None):
        """Build the position table and every encoder layer, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        positions = getattr(self, "embed_positions", None)
        if positions is not None:
            with tf.name_scope(positions.name):
                positions.build(None)

        stack = getattr(self, "layer", None)
        if stack is not None:
            for encoder_layer in stack:
                with tf.name_scope(encoder_layer.name):
                    encoder_layer.build(None)
    # 初始化方法，用于创建一个新的 RoFormer 模型实例
    def __init__(self, config: RoFormerConfig, **kwargs):
        """
        Create the dense projection to `embedding_size`, the activation and the LayerNorm.

        NOTE(review): no `class` statement precedes this method in the visible source; `TFRoFormerLMPredictionHead`
        instantiates `TFRoFormerPredictionHeadTransform`, which this method presumably belongs to - confirm.
        """
        super().__init__(**kwargs)
        self.config = config

        self.dense = keras.layers.Dense(
            units=config.embedding_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # `hidden_act` is either the name of an activation or the activation callable itself.
        activation = config.hidden_act
        self.transform_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
    # 模型调用方法，用于定义模型的前向传播逻辑
    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return `LayerNorm(transform_act_fn(dense(hidden_states)))`."""
        projected = self.dense(inputs=hidden_states)
        activated = self.transform_act_fn(projected)
        return self.LayerNorm(inputs=activated)

    # 构建方法，用于构建模型的各层并初始化权重
    def build(self, input_shape=None):
        """Build the dense layer (input width `hidden_size`) and the LayerNorm (width `embedding_size`)."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.hidden_size])

        layer_norm = getattr(self, "LayerNorm", None)
        if layer_norm is not None:
            with tf.name_scope(layer_norm.name):
                layer_norm.build([None, None, self.config.embedding_size])
class TFRoFormerLMPredictionHead(keras.layers.Layer):
    """Language-modeling head: transform layer, projection onto the input embedding matrix and a per-token bias."""

    def __init__(self, config: RoFormerConfig, input_embeddings: keras.layers.Layer, **kwargs):
        """
        Args:
            config: model configuration; `vocab_size` and `embedding_size` are read from it.
            input_embeddings: layer whose `weight` attribute is reused as the output projection matrix.
        """
        super().__init__(**kwargs)

        self.config = config
        self.embedding_size = config.embedding_size

        self.transform = TFRoFormerPredictionHeadTransform(config, name="transform")

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.input_embeddings = input_embeddings

    def build(self, input_shape=None):
        """Create the output bias and build the transform sub-layer."""
        # Check the guard first so that a repeated build() call cannot register a second bias variable.
        if self.built:
            return
        self.built = True

        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

        if getattr(self, "transform", None) is not None:
            with tf.name_scope(self.transform.name):
                self.transform.build(None)

    def get_output_embeddings(self) -> keras.layers.Layer:
        """Return the embedding layer whose weight is used as the output projection."""
        return self.input_embeddings

    def set_output_embeddings(self, value: tf.Variable):
        """Replace the shared embedding weight and record its first dimension as the vocabulary size."""
        self.input_embeddings.weight = value
        self.input_embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self) -> Dict[str, tf.Variable]:
        """Return the output bias in a dict under the key "bias"."""
        return {"bias": self.bias}

    def set_bias(self, value: tf.Variable):
        """Replace the output bias with `value["bias"]` and update `config.vocab_size` to its length."""
        self.bias = value["bias"]
        self.config.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return the vocabulary scores for every position of `hidden_states`."""
        hidden_states = self.transform(hidden_states=hidden_states)
        seq_length = shape_list(hidden_states)[1]
        # Flatten to 2D so that a single matmul against the transposed embedding matrix yields the scores.
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size])
        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
        # Restore the sequence axis: [-1, seq_length, vocab_size]
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)

        return hidden_states


# 从 transformers.models.bert.modeling_tf_bert.TFBertMLMHead 复制并修改为 RoFormer
class TFRoFormerMLMHead(keras.layers.Layer):
    """Thin wrapper that owns the LM prediction head under the sub-layer name `predictions`."""

    def __init__(self, config: RoFormerConfig, input_embeddings: keras.layers.Layer, **kwargs):
        """
        Args:
            config: Model configuration forwarded to the prediction head.
            input_embeddings: Embedding layer forwarded to the prediction head.
            **kwargs: Passed through to `keras.layers.Layer`.
        """
        super().__init__(**kwargs)

        self.predictions = TFRoFormerLMPredictionHead(config, input_embeddings, name="predictions")

    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
        """Run the prediction head on `sequence_output` and return its scores."""
        return self.predictions(hidden_states=sequence_output)

    def build(self, input_shape=None):
        """Build the prediction head inside its own name scope; repeated calls do nothing."""
        if self.built:
            return
        self.built = True

        head = getattr(self, "predictions", None)
        if head is None:
            return
        with tf.name_scope(head.name):
            head.build(None)


@keras_serializable
class TFRoFormerMainLayer(keras.layers.Layer):
    """
    Core RoFormer stack: embeddings, an optional projection to the hidden size, and the encoder.

    Args:
        config: Model configuration; stored on the instance and handed to the sub-layers.
        add_pooling_layer: Accepted in the signature but not used by this layer (no pooler is created).
    """

    config_class = RoFormerConfig

    def __init__(self, config: RoFormerConfig, add_pooling_layer: bool = True, **kwargs):
        super().__init__(**kwargs)

        self.config = config

        self.embeddings = TFRoFormerEmbeddings(config, name="embeddings")
        # The projection only exists when the embedding width differs from the encoder width.
        if config.embedding_size != config.hidden_size:
            self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project")

        self.encoder = TFRoFormerEncoder(config, name="encoder")

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embeddings layer."""
        return self.embeddings

    def set_input_embeddings(self, value: tf.Variable):
        """Replace the embedding weight and update the layer's vocabulary size from its first dimension."""
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        """
        Run embeddings, the optional projection and the encoder.

        Exactly one of `input_ids` and `inputs_embeds` must be given. A missing `attention_mask` defaults to all
        ones and a missing `token_type_ids` to all zeros.

        NOTE(review): the keyword interfaces of `self.embeddings` and `self.encoder` are defined outside this
        block; the calls below follow the upstream Transformers implementation — confirm against those classes.

        Returns:
            `TFBaseModelOutput` when `return_dict` is truthy, otherwise a tuple whose first element is the
            sequence output followed by the remaining encoder outputs.

        Raises:
            ValueError: If both or neither of `input_ids` and `inputs_embeds` are provided.
            NotImplementedError: If `head_mask` is provided.
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = tf.fill(dims=input_shape, value=1)

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            training=training,
        )
        if hasattr(self, "embeddings_project"):
            embedding_output = self.embeddings_project(embedding_output, training=training)

        # Reshape the 2D mask to [batch, 1, 1, seq] so it broadcasts over heads and query positions, then turn it
        # into an additive mask: 0.0 where attention is allowed and -10000.0 where it is masked.
        extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1]))
        extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
        one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
        ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
        extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)

        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.config.num_hidden_layers

        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return TFBaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build every sub-layer inside its own name scope; repeated calls do nothing."""
        if self.built:
            return

        self.built = True

        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)

        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)

        if getattr(self, "embeddings_project", None) is not None:
            with tf.name_scope(self.embeddings_project.name):
                # The projection consumes embedding-sized vectors.
                self.embeddings_project.build([None, None, self.config.embedding_size])
# Define `TFRoFormerPreTrainedModel`, a subclass of `TFPreTrainedModel`
class TFRoFormerPreTrainedModel(TFPreTrainedModel):
    """
    Abstract base class that takes care of weight initialization and offers a simple interface for downloading and
    loading pretrained models.
    """

    # Prefix used for the base model.
    base_model_prefix = "roformer"
    # Configuration class paired with these models.
    config_class = RoFormerConfig


# Opening section of the RoFormer model docstring; passed to the `add_start_docstrings` decorators in this file.
ROFORMER_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`RoFormerConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Template for the docstring that describes the forward-pass inputs.
# NOTE(review): the template is empty in this file, yet call sites still invoke `.format(...)` on it — confirm
# whether the argument documentation was dropped unintentionally.
ROFORMER_INPUTS_DOCSTRING = r"""
"""


# 添加文档字符串说明到 `TFRoFormerModel` 类
@add_start_docstrings(
    "The bare RoFormer Model transformer outputing raw hidden-states without any specific head on top.",
    ROFORMER_START_DOCSTRING,
)
class TFRoFormerModel(TFRoFormerPreTrainedModel):
    def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
        """Initialise the base class and create the main RoFormer layer under the name `roformer`."""
        super().__init__(config, *inputs, **kwargs)

        self.roformer = TFRoFormerMainLayer(config, name="roformer")

    # Decorator stack: unpack the inputs, then attach the shared input docstring and a generated code sample.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        # Pure delegation: every argument is forwarded unchanged and the main layer's result is returned as is.
        return self.roformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        """Build the main layer inside its own name scope; repeated calls do nothing."""
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "roformer", None)
        if main_layer is None:
            return
        with tf.name_scope(main_layer.name):
            main_layer.build(None)
@add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING)
class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingLoss):
    def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
        """Create the main RoFormer layer and an MLM head that reuses the main layer's embeddings."""
        super().__init__(config, *inputs, **kwargs)

        # Only a warning: the model is still constructed when the config is flagged as a decoder.
        if config.is_decoder:
            logger.warning(
                "If you want to use `TFRoFormerForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roformer = TFRoFormerMainLayer(config, name="roformer")
        self.mlm = TFRoFormerMLMHead(config, input_embeddings=self.roformer.embeddings, name="mlm___cls")

    def get_lm_head(self) -> keras.layers.Layer:
        """Return the prediction layer held by the MLM head."""
        return self.mlm.predictions

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        outputs = self.roformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]
        prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)

        if not return_dict:
            # Keep everything after the sequence output. Slicing from index 2 would silently drop the first
            # extra output; the other task heads in this file slice from index 1 as well.
            output = (prediction_scores,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the main layer and the MLM head, each inside its own name scope; repeated calls do nothing."""
        if self.built:
            return
        self.built = True
        if getattr(self, "roformer", None) is not None:
            with tf.name_scope(self.roformer.name):
                self.roformer.build(None)
        if getattr(self, "mlm", None) is not None:
            with tf.name_scope(self.mlm.name):
                self.mlm.build(None)
@add_start_docstrings(
    """RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING
)
class TFRoFormerForCausalLM(TFRoFormerPreTrainedModel, TFCausalLanguageModelingLoss):
    def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
        """Create the main RoFormer layer and an LM head that reuses the main layer's embeddings."""
        super().__init__(config, *inputs, **kwargs)

        # Only a warning: the model is still constructed when the config is not flagged as a decoder.
        if not config.is_decoder:
            logger.warning("If you want to use `TFRoFormerForCausalLM` as a standalone, add `is_decoder=True.`")

        self.roformer = TFRoFormerMainLayer(config, name="roformer")
        self.mlm = TFRoFormerMLMHead(config, input_embeddings=self.roformer.embeddings, name="mlm___cls")

    def get_lm_head(self) -> keras.layers.Layer:
        """Return the prediction layer held by the LM head."""
        return self.mlm.predictions

    @unpack_inputs
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFCausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFCausalLMOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
            config.vocab_size - 1]`.
        """
        outputs = self.roformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]
        logits = self.mlm(sequence_output=sequence_output, training=training)
        loss = None

        if labels is not None:
            # Next-token objective: the prediction at position t is scored against the label at position t + 1,
            # so the last prediction and the first label are dropped.
            shifted_logits = logits[:, :-1]
            labels = labels[:, 1:]
            loss = self.hf_compute_loss(labels=labels, logits=shifted_logits)

        if not return_dict:
            # Keep everything after the sequence output. Slicing from index 2 would silently drop the first
            # extra output; the other task heads in this file slice from index 1 as well.
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFCausalLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the main layer and the LM head, each inside its own name scope; repeated calls do nothing."""
        if self.built:
            return
        self.built = True

        if getattr(self, "roformer", None) is not None:
            with tf.name_scope(self.roformer.name):
                self.roformer.build(None)

        if getattr(self, "mlm", None) is not None:
            with tf.name_scope(self.mlm.name):
                self.mlm.build(None)
class TFRoFormerClassificationHead(keras.layers.Layer):
    """Head for sentence-level classification tasks."""

    def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
        """Create the dense, dropout and output-projection layers and resolve the activation function."""
        super().__init__(*inputs, **kwargs)

        self.config = config

        # Hidden-size projection applied to the selected token vector.
        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        # A single dropout layer, applied twice in `call`.
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # Final projection to one score per label.
        self.out_proj = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
        )

        # `hidden_act` may be an activation name or an already-callable object.
        activation = config.hidden_act
        self.classifier_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        """
        Compute classification scores from the vector at sequence position 0.

        Args:
            hidden_states: Tensor indexed as `[:, 0, :]`, i.e. the first position along axis 1 is selected.
            training: Forwarded to both dropout applications.

        Returns:
            Output of the `out_proj` layer.
        """
        first_token = hidden_states[:, 0, :]  # take <s> token (equiv. to [CLS])
        first_token = self.dropout(inputs=first_token, training=training)
        projected = self.dense(inputs=first_token)
        activated = self.classifier_act_fn(projected)
        activated = self.dropout(inputs=activated, training=training)
        return self.out_proj(activated)

    def build(self, input_shape=None):
        """Build both dense layers with a hidden-size last dimension; repeated calls do nothing."""
        if self.built:
            return
        self.built = True

        for sublayer in (getattr(self, "dense", None), getattr(self, "out_proj", None)):
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, self.config.hidden_size])


@add_start_docstrings(
    """
    RoFormer Model transformer with a sequence classification/regression head on top e.g., for GLUE tasks.
    """,
    ROFORMER_START_DOCSTRING,
)
class TFRoFormerForSequenceClassification(TFRoFormerPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
        """Create the main RoFormer layer and the sentence-level classification head."""
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels

        self.roformer = TFRoFormerMainLayer(config, name="roformer")
        self.classifier = TFRoFormerClassificationHead(config, name="classifier")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.roformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        logits = self.classifier(hidden_states=outputs[0], training=training)
        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the main layer and the classifier, each inside its own name scope; repeated calls do nothing."""
        if self.built:
            return
        self.built = True
        if getattr(self, "roformer", None) is not None:
            with tf.name_scope(self.roformer.name):
                self.roformer.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build(None)


@add_start_docstrings(
    """
    RoFormer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    ROFORMER_START_DOCSTRING,
)
class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLoss):
    def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
        """Create the main RoFormer layer, the sequence-summary layer and a single-unit classifier."""
        super().__init__(config, *inputs, **kwargs)

        self.roformer = TFRoFormerMainLayer(config, name="roformer")
        self.sequence_summary = TFSequenceSummary(config, config.initializer_range, name="sequence_summary")
        # One score per flattened (example, choice) row.
        self.classifier = keras.layers.Dense(
            units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(
        ROFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
        """
        # Dimensions 1 and 2 of whichever input is present give the number of choices and the sequence length.
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

        # Fold the choice dimension into the leading dimension so the main layer sees ordinary sequences.
        flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = (
            tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
        )
        flat_token_type_ids = (
            tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
        )
        flat_inputs_embeds = (
            tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roformer(
            input_ids=flat_input_ids,
            attention_mask=flat_attention_mask,
            token_type_ids=flat_token_type_ids,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        logits = self.sequence_summary(inputs=outputs[0], training=training)
        logits = self.classifier(inputs=logits)

        # Unfold the scores back to one row per example with one column per choice.
        reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))

        # The loss is only computed when labels are supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)

        if not return_dict:
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build all three sub-layers, each inside its own name scope; repeated calls do nothing."""
        if self.built:
            return

        self.built = True

        if getattr(self, "roformer", None) is not None:
            with tf.name_scope(self.roformer.name):
                self.roformer.build(None)

        if getattr(self, "sequence_summary", None) is not None:
            with tf.name_scope(self.sequence_summary.name):
                self.sequence_summary.build(None)

        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # The classifier consumes hidden-size vectors.
                self.classifier.build([None, None, self.config.hidden_size])
# 定义 TFRoFormerForTokenClassification 类，用于在 RoFormer 模型的基础上增加一个标记分类头部，例如用于命名实体识别（NER）任务
@add_start_docstrings(
    """
    RoFormer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    ROFORMER_START_DOCSTRING,
)
class TFRoFormerForTokenClassification(TFRoFormerPreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
        """Create the main RoFormer layer, a dropout layer and the per-token linear classifier."""
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels

        self.roformer = TFRoFormerMainLayer(config, name="roformer")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # Linear layer producing one score per label.
        self.classifier = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        encoder_outputs = self.roformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Dropout on the sequence output, then the linear classifier.
        dropped = self.dropout(inputs=encoder_outputs[0], training=training)
        logits = self.classifier(inputs=dropped)

        # The loss is only computed when labels are supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels=labels, logits=logits)

        if return_dict:
            return TFTokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: logits followed by the remaining main-layer outputs, with the loss first when present.
        output = (logits,) + encoder_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    def build(self, input_shape=None):
        """Build the main layer and the classifier, each inside its own name scope; repeated calls do nothing."""
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "roformer", None)
        if main_layer is not None:
            with tf.name_scope(main_layer.name):
                main_layer.build(None)

        token_classifier = getattr(self, "classifier", None)
        if token_classifier is not None:
            with tf.name_scope(token_classifier.name):
                # The classifier consumes hidden-size vectors.
                token_classifier.build([None, None, self.config.hidden_size])
# 使用装饰器添加模型的起始文档字符串，描述了 RoFormer 模型及其用途，特别是在抽取式问答任务（如 SQuAD）中的应用
@add_start_docstrings(
    """
    RoFormer Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ROFORMER_START_DOCSTRING,
)
class TFRoFormerForQuestionAnswering(TFRoFormerPreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config: RoFormerConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels

        # Encoder that produces the per-token hidden states.
        self.roformer = TFRoFormerMainLayer(config, name="roformer")
        # Dense head producing `config.num_labels` values per token. `call` splits its output into two pieces
        # and squeezes each of them, which only works when the head emits exactly 2 values per token.
        self.qa_outputs = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="qa_outputs",
        )
        # Kept so that `build` can read `hidden_size`.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
        r"""
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # Run the encoder; all generic inputs are forwarded unchanged.
        outputs = self.roformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first element of the encoder output is the per-token sequence output.
        sequence_output = outputs[0]
        logits = self.qa_outputs(inputs=sequence_output)
        # Split the last axis into two equal parts (start / end) and drop that axis from each part.
        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
        start_logits = tf.squeeze(input=start_logits, axis=-1)
        end_logits = tf.squeeze(input=end_logits, axis=-1)
        loss = None

        # The loss is only computed when both span boundaries are supplied.
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions, "end_position": end_positions}
            loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))

        if not return_dict:
            # Tuple output: (loss,) is prepended only when a loss was computed.
            # NOTE(review): the token-classification head in this file forwards `outputs[1:]`; slicing from
            # index 2 here skips one more element of the encoder tuple - confirm this is intended.
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # Build each sub-layer once, under its own name scope. `input_shape` is not read.
        if self.built:
            return
        # Flag the model as built before building the children.
        self.built = True
        if getattr(self, "roformer", None) is not None:
            with tf.name_scope(self.roformer.name):
                self.roformer.build(None)
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                # The QA head is built for inputs whose last dimension is `config.hidden_size`.
                self.qa_outputs.build([None, None, self.config.hidden_size])

`.\models\roformer\tokenization_roformer.py`

# coding=utf-8
# 版权所有 2021 年 HuggingFace Inc. 团队保留所有权利。
#
# 根据 Apache 许可证版本 2.0（“许可证”）许可；
# 除非符合许可证的要求，否则不得使用此文件。
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按“原样”分发，
# 不附带任何明示或暗示的担保或条件。
# 有关许可证详细信息，请参阅许可证。

"""RoFormer 的标记化类。"""

import collections
import os
import unicodedata
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging

# 获取日志记录器实例
logger = logging.get_logger(__name__)

# File name under which each tokenizer resource is stored.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Positional-embedding size recorded for each pretrained checkpoint.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "junnyu/roformer_chinese_small": 1536,
    "junnyu/roformer_chinese_base": 1536,
    "junnyu/roformer_chinese_char_small": 512,
    "junnyu/roformer_chinese_char_base": 512,
    "junnyu/roformer_small_discriminator": 128,
    "junnyu/roformer_small_generator": 128,
}

# Download location of the vocabulary file of each pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "junnyu/roformer_chinese_small": (
            "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_chinese_base": (
            "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_chinese_char_small": (
            "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_chinese_char_base": (
            "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_small_discriminator": (
            "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_small_generator": (
            "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt"
        ),
    }
}

# Default init kwargs per pretrained checkpoint: every checkpoint listed above lowercases its input.
# A fresh dict is created for each checkpoint so the entries stay independent objects.
PRETRAINED_INIT_CONFIGURATION = {
    checkpoint: {"do_lower_case": True} for checkpoint in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
}

# Copied from transformers.models.bert.tokenization_bert.load_vocab
def load_vocab(vocab_file):
    """Load a vocabulary file (one token per line) into an ordered token -> line-index dictionary."""
    with open(vocab_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
    # Only the trailing "\n" is removed, so any other whitespace remains part of the token.
    # A token that appears more than once keeps its first position but ends up with its last index.
    return collections.OrderedDict((line.rstrip("\n"), position) for position, line in enumerate(lines))
# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
    """Strip surrounding whitespace from `text` and split it on runs of whitespace."""
    stripped = text.strip()
    # Nothing left after stripping means there are no tokens at all.
    return stripped.split() if stripped else []


# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        # Stored as a set so membership tests and unions in `tokenize` are cheap.
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # Normalize to NFC so that the same character is always represented by the same code points.
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    # When lowercasing, accents are stripped unless `strip_accents` is explicitly False.
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    # Without lowercasing, accents are stripped only when explicitly requested.
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        # Re-join and re-split so that every returned token is free of whitespace.
        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # NFD decomposes an accented character into its base character followed by combining marks, so
        # dropping the characters of category "Mn" (Mark, nonspacing) removes the accents only.
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(char for char in decomposed if unicodedata.category(char) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        # Return the text untouched when punctuation splitting is disabled or the text is protected.
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                # Every punctuation character becomes its own piece and ends the current word.
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # The ranges below are the CJK Unified Ideographs block, its extensions A-E and the two
        # CJK Compatibility Ideographs blocks.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            # Drop NUL, the U+FFFD replacement character and control characters.
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            # Every whitespace character is replaced by a plain space.
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        # vocab: container of known word pieces, queried with `in`.
        # unk_token: token emitted for any word that cannot be fully split into known pieces.
        # max_input_chars_per_word: longer words are mapped to `unk_token` without trying to split them.
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                # Overlong words are not split at all.
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                # Shrink the window from the right until the longest known piece starting at `start` is found.
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        # Pieces that do not start the word carry the "##" continuation prefix.
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    # No known piece starts here, so the whole word is unknown.
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


class RoFormerTokenizer(PreTrainedTokenizer):
    """
    RoFormer tokenizer.

    Text is first segmented into words with `rjieba`. A word that is present in the vocabulary is kept as a single
    token; any other word is tokenized with the basic tokenizer (when `do_basic_tokenize=True`) followed by
    WordPiece.

    Args:
        vocab_file (`str`):
            Path of the vocabulary file, one token per line.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Forwarded to the basic tokenizer.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether to run the basic tokenizer before WordPiece.
        never_split (`Iterable`, *optional*):
            Forwarded to the basic tokenizer.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            Token used for anything that is not in the vocabulary.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            Separator token placed after each sequence.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            Padding token.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            Token placed at the start of the model input.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            Mask token.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Forwarded to the basic tokenizer.
        strip_accents (`bool`, *optional*):
            Forwarded to the basic tokenizer.

    Raises:
        ValueError: If `vocab_file` is not an existing file.
        ImportError: If `rjieba` is not installed.
    """

    # Class-level tables built from the module constants.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # token -> id mapping read from the vocabulary file.
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping, id -> token.
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        # `basic_tokenizer` only exists when basic tokenization is enabled.
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        # rjieba is imported lazily so that the module can be imported without it.
        try:
            import rjieba
        except ImportError:
            raise ImportError(
                "You need to install rjieba to use RoFormerTokenizer. "
                "See https://pypi.org/project/rjieba/ for installation."
            )

        # The module itself is stored; `_tokenize` calls `self.jieba.cut`.
        self.jieba = rjieba

        # The vocabulary and the sub-tokenizers are set up before the base class is initialised.
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    def do_lower_case(self):
        """Lowercasing flag of the basic tokenizer."""
        # NOTE(review): `basic_tokenizer` is only created when `do_basic_tokenize=True`.
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens are not counted)."""
        return len(self.vocab)

    def __getstate__(self):
        """Return a copy of the instance state with the `jieba` module reference replaced by `None`."""
        state = self.__dict__.copy()
        # The module reference is dropped from the state; `__setstate__` re-imports it.
        state["jieba"] = None
        return state

    def __setstate__(self, d):
        """Restore the instance state and re-attach the `rjieba` module."""
        self.__dict__ = d
        import rjieba

        self.jieba = rjieba

    def get_vocab(self):
        """Return the base vocabulary merged with the added tokens (added tokens win on key clashes)."""
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text, use_jieba=True):
        """
        Tokenize `text` into a list of tokens.

        With `use_jieba=True` the text is first segmented with `rjieba`; segments found in the vocabulary are kept
        whole and the others are tokenized again with `use_jieba=False`, i.e. with the basic tokenizer (if enabled)
        followed by WordPiece.
        """
        split_tokens = []
        if use_jieba:
            # NOTE(review): the second positional argument is presumably rjieba's `hmm` switch - confirm.
            for whole_word in self.jieba.cut(text, False):
                if whole_word in self.vocab:
                    split_tokens.append(whole_word)
                else:
                    # Unknown segment: fall back to the BERT-style tokenization below.
                    char_list = self._tokenize(whole_word, use_jieba=False)
                    split_tokens.extend(char_list)
        else:
            if self.do_basic_tokenize:
                for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                    # Tokens in the basic tokenizer's `never_split` set bypass WordPiece.
                    if token in self.basic_tokenizer.never_split:
                        split_tokens.append(token)
                    else:
                        split_tokens += self.wordpiece_tokenizer.tokenize(token)
            else:
                split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Unknown tokens map to the id of `unk_token`.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Unknown ids map to `unk_token`.
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Removing " ##" glues WordPiece continuation pieces back onto the preceding piece.
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A RoFormer sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            # If the tokens already have special tokens, delegate to the superclass method
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            # Generate a mask for sequence pairs with special tokens added
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        # Generate a mask for a single sequence with special tokens added
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Define special tokens for separation and classification
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            # If only one sequence is provided, return a mask for the first sequence only
            return len(cls + token_ids_0 + sep) * [0]

        # If two sequences are provided, return a mask for both sequences concatenated with special tokens
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the base vocabulary to a file, one token per line, ordered by token index.

        Args:
            save_directory (`str`):
                Existing directory in which the vocabulary file is created. If it is not a directory, the value is
                used as the output file path itself.
            filename_prefix (`str`, *optional*):
                Prefix, followed by "-", prepended to the file name (or to `save_directory` when it is used as the
                file path).

        Returns:
            `Tuple[str]`: One-element tuple holding the path of the written file.
        """
        # Next index expected while writing; used to detect gaps in the vocabulary indices.
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    # Resynchronise with the index found in the vocabulary.
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)

`.\models\roformer\tokenization_roformer_fast.py`

# 导入必要的模块和库
import json  # 导入 json 模块，用于处理 JSON 格式数据
from typing import List, Optional, Tuple  # 导入类型提示相关的模块

from tokenizers import normalizers  # 导入 tokenizers 库中的 normalizers 模块
from tokenizers.pre_tokenizers import BertPreTokenizer, PreTokenizer  # 导入 tokenizers 库中的预分词器类

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 从上级目录导入 PreTrainedTokenizerFast 类
from ...utils import logging  # 从上级目录导入 logging 模块
from .tokenization_roformer import RoFormerTokenizer  # 从当前目录导入 RoFormerTokenizer 类
from .tokenization_utils import JiebaPreTokenizer  # 从当前目录导入 JiebaPreTokenizer 类

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# File names under which the vocabulary and the serialized fast tokenizer are stored.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# Positional-embedding size recorded for each pretrained checkpoint.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "junnyu/roformer_chinese_small": 1536,
    "junnyu/roformer_chinese_base": 1536,
    "junnyu/roformer_chinese_char_small": 512,
    "junnyu/roformer_chinese_char_base": 512,
    "junnyu/roformer_small_discriminator": 128,
    "junnyu/roformer_small_generator": 128,
}

# Download location of the vocabulary file of each pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "junnyu/roformer_chinese_small": (
            "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_chinese_base": (
            "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_chinese_char_small": (
            "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_chinese_char_base": (
            "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_small_discriminator": (
            "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_small_generator": (
            "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt"
        ),
    }
}

# Default init kwargs per pretrained checkpoint: every checkpoint listed above lowercases its input.
# A fresh dict is created for each checkpoint so the entries stay independent objects.
PRETRAINED_INIT_CONFIGURATION = {
    checkpoint: {"do_lower_case": True} for checkpoint in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
}


class RoFormerTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" RoFormer tokenizer (backed by HuggingFace's *tokenizers* library).

    [`RoFormerTokenizerFast`] is almost identical to [`BertTokenizerFast`] and runs end-to-end tokenization:
    punctuation splitting and WordPiece. The two differ in how they tokenize Chinese.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Example:

    ```python
    >>> from transformers import RoFormerTokenizerFast

    >>> tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
    >>> tokenizer.tokenize("今天天气非常好。")
    ['今', '天', '天', '气', '非常', '好', '。']
    ```"""

    # Class-level configuration, taken from the module-level constants.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    slow_tokenizer_class = RoFormerTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        """
        Build the backend tokenizer, align its normalizer with the requested options and install the Jieba-based
        pre-tokenizer.

        All arguments are forwarded to the superclass constructor. `do_lower_case` and `strip_accents` are additionally
        compared against the state of the backend normalizer, which is rebuilt when they disagree.
        """
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # The serialized normalizer may carry different `lowercase` / `strip_accents` settings than the ones
        # requested here; in that case rebuild it with the requested values.
        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        if (
            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
        ):
            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
            normalizer_state["lowercase"] = do_lower_case
            normalizer_state["strip_accents"] = strip_accents
            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)

        # Always install the custom pre-tokenizer, built from the backend vocabulary.
        vocab = self.backend_tokenizer.get_vocab()
        self.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))

        self.do_lower_case = do_lower_case

    def __getstate__(self):
        """Return the instance state with the custom pre-tokenizer replaced by a `BertPreTokenizer`."""
        state = self.__dict__.copy()
        # Presumably the custom Python pre-tokenizer cannot be serialized, hence the swap - confirm against the
        # `tokenizers` documentation.
        # NOTE(review): `state` is a shallow copy, so this assignment also replaces the pre-tokenizer of the
        # backend tokenizer held by this live instance.
        state["_tokenizer"].pre_tokenizer = BertPreTokenizer()
        return state

    def __setstate__(self, d):
        """Restore the instance state and re-install the Jieba pre-tokenizer built from the restored vocabulary."""
        self.__dict__ = d
        vocab = self.__dict__["_tokenizer"].get_vocab()
        self.__dict__["_tokenizer"].pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A RoFormer sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # `[CLS] X [SEP]`
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        # Append `B [SEP]` for a sequence pair.
        if token_ids_1 is not None:
            output += token_ids_1 + [self.sep_token_id]

        return output

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Single sequence: zeros covering `[CLS] X [SEP]`.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # Pair: zeros for `[CLS] A [SEP]`, ones for `B [SEP]`.
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the tokenizer's vocabulary to a directory.

        Args:
            save_directory (str):
                Directory to save the vocabulary files.
            filename_prefix (str, *optional*):
                Prefix for the vocabulary files.

        Returns:
            `Tuple[str]`: Tuple of file paths where the vocabulary was saved.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

    def save_pretrained(
        self,
        save_directory,
        legacy_format=None,
        filename_prefix=None,
        push_to_hub=False,
        **kwargs,
    ):
        """
        Save the tokenizer, temporarily swapping the custom pre-tokenizer for a `BertPreTokenizer`.

        Args:
            save_directory (str):
                Directory to save the tokenizer files.
            legacy_format (*optional*):
                Forwarded unchanged to the superclass method.
            filename_prefix (str, *optional*):
                Prefix for the saved files.
            push_to_hub (bool):
                Forwarded unchanged to the superclass method.
            **kwargs:
                Additional arguments passed to the superclass method.

        Returns:
            The output of the superclass's `save_pretrained` method.
        """
        # Save with a `BertPreTokenizer` in place of the custom Jieba pre-tokenizer.
        self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
        try:
            return super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
        finally:
            # Re-install the Jieba pre-tokenizer so this instance keeps tokenizing the same way after saving
            # (previously the instance was left with the `BertPreTokenizer`).
            vocab = self.backend_tokenizer.get_vocab()
            self.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))

`.\models\roformer\tokenization_utils.py`

# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization utils for RoFormer."""

from typing import List

from tokenizers import NormalizedString, PreTokenizedString, normalizers


class JiebaPreTokenizer:
    """Pre-tokenizer that segments text with `rjieba` and keeps whole words that are present in the vocabulary."""

    def __init__(self, vocab) -> None:
        """
        Store the vocabulary, create the fallback normalizer and bind the `rjieba` module.

        Raises:
            ImportError: If `rjieba` is not installed.
        """
        self.vocab = vocab
        # BERT-style normalizer with text cleaning, accent stripping and lowercasing disabled and Chinese character
        # handling enabled; only applied to words that are not in the vocabulary.
        self.normalizers = normalizers.BertNormalizer(
            clean_text=False,
            handle_chinese_chars=True,
            strip_accents=False,
            lowercase=False,
        )
        try:
            import rjieba
        except ImportError:
            raise ImportError(
                "You need to install rjieba to use RoFormerTokenizer. "
                "See https://pypi.org/project/rjieba/ for installation."
            )
        self.jieba = rjieba

    def jieba_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """
        Split `normalized_string` into pieces along the word boundaries reported by `rjieba`.

        Args:
            i: Unused by this method (presumably the split index supplied by `PreTokenizedString.split`).
            normalized_string: The string to split; sliced with the offsets returned by `rjieba`.

        Returns:
            The list of slices of `normalized_string`.
        """
        pieces = []

        # `hmm=False` is passed through to `rjieba.tokenize`.
        for word, begin, stop in self.jieba.tokenize(str(normalized_string), hmm=False):
            if word in self.vocab:
                # Known word: keep it as a single piece.
                pieces.append(normalized_string[begin:stop])
                continue

            # Unknown word: normalize it, split on whitespace and emit consecutive slices whose lengths
            # match the resulting fragments, starting at the word's start offset.
            cursor = begin
            for fragment in self.normalizers.normalize_str(word).split():
                if not fragment:
                    continue
                next_cursor = cursor + len(fragment)
                pieces.append(normalized_string[cursor:next_cursor])
                cursor = next_cursor

        return pieces

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Apply `jieba_split` to `pretok` in place via `PreTokenizedString.split`."""
        pretok.split(self.jieba_split)

`.\models\roformer\__init__.py`

# Import the helpers used to detect which optional dependencies are installed.
from typing import TYPE_CHECKING
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Maps each submodule name to the public names it exports. These two entries are
# registered unconditionally; the others below depend on an optional library.
_import_structure = {
    "configuration_roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerOnnxConfig"],
    "tokenization_roformer": ["RoFormerTokenizer"],
}

# Register the fast tokenizer only when the `tokenizers` library is available.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_roformer_fast"] = ["RoFormerTokenizerFast"]

# Register the PyTorch models only when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_roformer"] = [
        "ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "RoFormerForCausalLM",
        "RoFormerForMaskedLM",
        "RoFormerForMultipleChoice",
        "RoFormerForQuestionAnswering",
        "RoFormerForSequenceClassification",
        "RoFormerForTokenClassification",
        "RoFormerLayer",
        "RoFormerModel",
        "RoFormerPreTrainedModel",
        "load_tf_weights_in_roformer",
    ]

# Register the TensorFlow models only when TensorFlow is available.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_roformer"] = [
        "TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFRoFormerForCausalLM",
        "TFRoFormerForMaskedLM",
        "TFRoFormerForMultipleChoice",
        "TFRoFormerForQuestionAnswering",
        "TFRoFormerForSequenceClassification",
        "TFRoFormerForTokenClassification",
        "TFRoFormerLayer",
        "TFRoFormerModel",
        "TFRoFormerPreTrainedModel",
    ]

# Register the Flax models only when Flax is available.
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_flax_roformer"] = [
        "FLAX_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "FlaxRoFormerForMaskedLM",
        "FlaxRoFormerForMultipleChoice",
        "FlaxRoFormerForQuestionAnswering",
        "FlaxRoFormerForSequenceClassification",
        "FlaxRoFormerForTokenClassification",
        "FlaxRoFormerModel",
        "FlaxRoFormerPreTrainedModel",
    ]

# Under static type checking, import every name eagerly (mirroring the availability
# checks used to build `_import_structure`); at runtime, install a `_LazyModule` instead.
if TYPE_CHECKING:
    # NOTE: this `pass` is redundant; the imports below form the body of the branch.
    pass
    # Configuration names.
    from .configuration_roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerOnnxConfig
    # Slow tokenizer.
    from .tokenization_roformer import RoFormerTokenizer

    # Fast tokenizer, only when the `tokenizers` library is available.
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_roformer_fast import RoFormerTokenizerFast

    # PyTorch models, only when torch is available.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_roformer import (
            ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            RoFormerForCausalLM,
            RoFormerForMaskedLM,
            RoFormerForMultipleChoice,
            RoFormerForQuestionAnswering,
            RoFormerForSequenceClassification,
            RoFormerForTokenClassification,
            RoFormerLayer,
            RoFormerModel,
            RoFormerPreTrainedModel,
            load_tf_weights_in_roformer,
        )

    # TensorFlow models, only when TensorFlow is available.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_roformer import (
            TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFRoFormerForCausalLM,
            TFRoFormerForMaskedLM,
            TFRoFormerForMultipleChoice,
            TFRoFormerForQuestionAnswering,
            TFRoFormerForSequenceClassification,
            TFRoFormerForTokenClassification,
            TFRoFormerLayer,
            TFRoFormerModel,
            TFRoFormerPreTrainedModel,
        )

    # Flax models, only when Flax is available.
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_flax_roformer import (
            FLAX_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            FlaxRoFormerForMaskedLM,
            FlaxRoFormerForMultipleChoice,
            FlaxRoFormerForQuestionAnswering,
            FlaxRoFormerForSequenceClassification,
            FlaxRoFormerForTokenClassification,
            FlaxRoFormerModel,
            FlaxRoFormerPreTrainedModel,
        )
else:
    import sys

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\rwkv\configuration_rwkv.py`

# coding=utf-8
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" RWKV configuration"""

# 导入配置基类 PretrainedConfig 和日志工具 logging
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Logger for this module.
logger = logging.get_logger(__name__)

# Maps each listed RWKV checkpoint to the URL of its `config.json` on the Hub.
RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/config.json"
    for checkpoint in (
        "RWKV/rwkv-4-169m-pile",
        "RWKV/rwkv-4-430m-pile",
        "RWKV/rwkv-4-1b5-pile",
        "RWKV/rwkv-4-3b-pile",
        "RWKV/rwkv-4-7b-pile",
        "RWKV/rwkv-4-14b-pile",
        "RWKV/rwkv-raven-1b5",
        "RWKV/rwkv-raven-3b",
        "RWKV/rwkv-raven-7b",
        "RWKV/rwkv-raven-14b",
    )
}

class RwkvConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the RWKV-4
    [RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50277):
            Vocabulary size of the model.
        context_length (`int`, *optional*, defaults to 1024):
            Maximum sequence length the model can process.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the embeddings and hidden states.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the model.
        attention_hidden_size (`int`, *optional*):
            Dimensionality of the attention hidden states. Defaults to `hidden_size` when `None`.
        intermediate_size (`int`, *optional*):
            Dimensionality of the inner feed-forward layers. Defaults to `4 * hidden_size` when `None`.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            Epsilon used by the layer normalization layers.
        bos_token_id (`int`, *optional*, defaults to 0):
            Id of the beginning-of-sentence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 0):
            Id of the end-of-sentence token in the vocabulary.
        rescale_every (`int`, *optional*, defaults to 6):
            At inference, the hidden states and the weights of the corresponding output layers are divided by 2
            every `rescale_every` layers.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the word embeddings with the input token embeddings.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last state.
        **kwargs:
            Additional arguments forwarded to [`PretrainedConfig`].
    """

    model_type = "rwkv"
    # Expose `context_length` under the generic `max_position_embeddings` attribute name.
    attribute_map = {"max_position_embeddings": "context_length"}

    def __init__(
        self,
        vocab_size=50277,
        context_length=1024,
        hidden_size=4096,
        num_hidden_layers=32,
        attention_hidden_size=None,
        intermediate_size=None,
        layer_norm_epsilon=1e-5,
        bos_token_id=0,
        eos_token_id=0,
        rescale_every=6,
        tie_word_embeddings=False,
        use_cache=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.context_length = context_length
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        # Unset sizes are derived from `hidden_size`.
        self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
        self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size
        self.layer_norm_epsilon = layer_norm_epsilon
        self.rescale_every = rescale_every
        self.use_cache = use_cache

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # The special-token ids and the embedding-tying flag are also forwarded to the base class.
        super().__init__(
            tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
        )

`.\models\rwkv\convert_rwkv_checkpoint_to_hf.py`

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert a RWKV checkpoint from BlinkDL to the Hugging Face format."""

import argparse  # 导入处理命令行参数的模块
import gc  # 导入垃圾回收模块
import json  # 导入处理 JSON 格式的模块
import os  # 导入与操作系统交互的模块
import re  # 导入处理正则表达式的模块

import torch  # 导入 PyTorch 深度学习框架
from huggingface_hub import hf_hub_download  # 导入从 HF Hub 下载模型的功能

from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint

# Number of hidden layers for each supported model size.
NUM_HIDDEN_LAYERS_MAPPING = {
    "169M": 12,
    "430M": 24,
    "1B5": 24,
    "3B": 32,
    "7B": 32,
    "14B": 40,
}

# Hidden size for each supported model size (same keys as above).
HIDEN_SIZE_MAPPING = {
    "169M": 768,
    "430M": 1024,
    "1B5": 2048,
    "3B": 2560,
    "7B": 4096,
    "14B": 5120,
}

def convert_state_dict(state_dict):
    """
    Rename the keys of a RWKV `state_dict` in place to the Hugging Face naming scheme.

    Every entry is popped and re-inserted under its new name, so the same mapping object is returned.

    Args:
        state_dict: Mapping from parameter name to weight, modified in place.

    Returns:
        The same `state_dict` object with renamed keys.
    """
    # Abbreviated `time_mix_*` suffixes and their spelled-out replacements.
    suffix_renames = (
        (".time_mix_k", ".time_mix_key"),
        (".time_mix_v", ".time_mix_value"),
        (".time_mix_r", ".time_mix_receptance"),
    )

    for old_name in list(state_dict):
        tensor = state_dict.pop(old_name)
        new_name = old_name

        # emb -> embeddings
        if new_name.startswith("emb."):
            new_name = new_name.replace("emb.", "embeddings.")
        # ln0 of block 0 -> pre_ln
        if new_name.startswith("blocks.0.ln0"):
            new_name = new_name.replace("blocks.0.ln0", "blocks.0.pre_ln")
        # att -> attention, ffn -> feed_forward
        new_name = re.sub(r"blocks\.(\d+)\.att", r"blocks.\1.attention", new_name)
        new_name = re.sub(r"blocks\.(\d+)\.ffn", r"blocks.\1.feed_forward", new_name)
        for short, spelled_out in suffix_renames:
            if new_name.endswith(short):
                new_name = new_name.replace(short, spelled_out)

        # Everything except the LM head gets the `rwkv.` prefix.
        prefix = "" if new_name == "head.weight" else "rwkv."
        state_dict[prefix + new_name] = tensor

    return state_dict

def convert_rmkv_checkpoint_to_hf_format(
    repo_id, checkpoint_file, output_dir, size=None, tokenizer_file=None, push_to_hub=False, model_name=None
):
    """
    Download a RWKV checkpoint from the Hub and save it to `output_dir` in the Hugging Face format.

    Args:
        repo_id: Hub repository to download the checkpoint from.
        checkpoint_file: Name of the checkpoint file inside `repo_id`.
        output_dir: Directory receiving the tokenizer, the config and the weight shards.
        size: Model size, one of the keys of `NUM_HIDDEN_LAYERS_MAPPING`. Inferred from `checkpoint_file`
            when `None`.
        tokenizer_file: Path to a tokenizer file. When `None`, the `EleutherAI/gpt-neox-20b` tokenizer is used
            with a vocabulary size of 50277.
        push_to_hub: Whether to push the converted model and tokenizer to the Hub.
        model_name: Name of the pushed model; required when `push_to_hub` is set.

    Raises:
        ValueError: If `push_to_hub` is set without `model_name`, if the size cannot be inferred, or if `size`
            is not a supported value.
    """
    # Validate up front so a missing name is reported before any download or conversion work is done.
    if push_to_hub and model_name is None:
        raise ValueError("Please provide a `model_name` to push the model to the Hub.")

    # 1. Build and save the tokenizer.
    if tokenizer_file is None:
        print("No `--tokenizer_file` provided, we will use the default tokenizer.")
        vocab_size = 50277
        tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    else:
        # A tokenizer file was provided: load it and take the vocabulary size from it.
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)
        vocab_size = len(tokenizer)
    tokenizer.save_pretrained(output_dir)

    # 2. Build the config.
    possible_sizes = list(NUM_HIDDEN_LAYERS_MAPPING.keys())
    if size is None:
        # Infer the size from the first size key contained in the checkpoint file name.
        for candidate in possible_sizes:
            if candidate in checkpoint_file:
                size = candidate
                break
        if size is None:
            raise ValueError("Could not infer the size, please provide it with the `--size` argument.")
    if size not in possible_sizes:
        raise ValueError(f"`size` should be one of {possible_sizes}, got {size}.")

    config = RwkvConfig(
        vocab_size=vocab_size,
        num_hidden_layers=NUM_HIDDEN_LAYERS_MAPPING[size],
        hidden_size=HIDEN_SIZE_MAPPING[size],
    )
    config.save_pretrained(output_dir)

    # 3. Download the checkpoint and convert its state dict.
    model_file = hf_hub_download(repo_id, checkpoint_file)
    # NOTE(security): `torch.load` unpickles the downloaded file; only convert checkpoints from trusted repos.
    state_dict = torch.load(model_file, map_location="cpu")
    state_dict = convert_state_dict(state_dict)

    # 4. Split into shards and save them.
    shards, index = shard_checkpoint(state_dict)
    for shard_file, shard in shards.items():
        torch.save(shard, os.path.join(output_dir, shard_file))

    if index is not None:
        # Write the shard index next to the shards.
        save_index_file = os.path.join(output_dir, WEIGHTS_INDEX_NAME)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)

        # 5. Clean up the shards (sometimes a file saved by PyTorch takes the same space as the full state dict).
        print(
            "Cleaning up shards. This may error with an OOM error, it this is the case don't worry you still have converted the model."
        )
        shard_files = list(shards.keys())

        # Free the in-memory copies before reloading the shards one at a time.
        del state_dict
        del shards
        gc.collect()

        # Reload each shard and save it again from cloned CPU tensors.
        for shard_file in shard_files:
            state_dict = torch.load(os.path.join(output_dir, shard_file))
            torch.save({k: v.cpu().clone() for k, v in state_dict.items()}, os.path.join(output_dir, shard_file))

    del state_dict
    gc.collect()

    if push_to_hub:
        # `model_name` was validated at the top of the function.
        model = AutoModelForCausalLM.from_pretrained(output_dir)
        model.push_to_hub(model_name, max_shard_size="2GB")
        tokenizer.push_to_hub(model_name)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the conversion.

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--repo_id", default=None, type=str, required=True, help="Repo ID from which to pull the checkpoint."
    )
    # --repo_id: Hub repository that hosts the checkpoint.

    parser.add_argument(
        "--checkpoint_file", default=None, type=str, required=True, help="Name of the checkpoint file in the repo."
    )
    # --checkpoint_file: name of the checkpoint file inside the repository.

    parser.add_argument(
        "--output_dir", default=None, type=str, required=True, help="Where to save the converted model."
    )
    # --output_dir: directory in which the converted model is saved.

    parser.add_argument(
        "--tokenizer_file",
        default=None,
        type=str,
        help="Path to the tokenizer file to use (if not provided, only the model is converted).",
    )
    # --tokenizer_file: optional path to the tokenizer file to use.

    parser.add_argument(
        "--size",
        default=None,
        type=str,
        help="Size of the model. Will be inferred from the `checkpoint_file` if not passed.",
    )
    # --size: optional model size; inferred from the checkpoint file name when omitted.

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Push to the Hub the converted model.",
    )
    # --push_to_hub: flag requesting that the converted model be pushed to the Hub.

    parser.add_argument(
        "--model_name",
        default=None,
        type=str,
        help="Name of the pushed model on the Hub, including the username / organization.",
    )
    # --model_name: name of the pushed model, including the username or organization.

    args = parser.parse_args()

    convert_rmkv_checkpoint_to_hf_format(
        args.repo_id,
        args.checkpoint_file,
        args.output_dir,
        size=args.size,
        tokenizer_file=args.tokenizer_file,
        push_to_hub=args.push_to_hub,
        model_name=args.model_name,
    )
    # Runs the conversion with the parsed arguments.

`.\models\rwkv\modeling_rwkv.py`

# 设置文件编码为 UTF-8
# 版权声明：2023 年 Bo Peng 和 HuggingFace 公司团队版权所有
# 版权声明：2018 年 NVIDIA 公司版权所有
#
# 根据 Apache 许可证 2.0 版本许可，除非符合许可协议，否则不得使用本文件
# 您可以在以下网址获取许可协议的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“原样”分发软件
# 不提供任何形式的担保或条件，无论是明示的还是默示的
# 有关详细信息，请参阅许可协议

"""PyTorch RWKV 模型."""

import math
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

# 从模型工具中导入预训练模型类
from ...modeling_utils import PreTrainedModel
# 从工具中导入文档字符串生成函数和其它实用函数
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_bitsandbytes_available,
    is_ninja_available,
    is_torch_cuda_available,
    logging,
)
# 从相应模块导入 RWKV 配置类
from .configuration_rwkv import RwkvConfig

# Logger for this module.
logger = logging.get_logger(__name__)

# Checkpoint and configuration names used in the documentation.
_CHECKPOINT_FOR_DOC = "RWKV/rwkv-4-169m-pile"
_CONFIG_FOR_DOC = "RwkvConfig"

# List of pretrained RWKV checkpoints.
RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "RWKV/rwkv-4-169m-pile",
    "RWKV/rwkv-4-430m-pile",
    "RWKV/rwkv-4-1b5-pile",
    "RWKV/rwkv-4-3b-pile",
    "RWKV/rwkv-4-7b-pile",
    "RWKV/rwkv-4-14b-pile",
    "RWKV/rwkv-raven-1b5",
    "RWKV/rwkv-raven-3b",
    "RWKV/rwkv-raven-7b",
    "RWKV/rwkv-raven-14b",
    # See all RWKV models at https://huggingface.co/models?filter=rwkv
]

# Loaded CUDA kernel module; stays `None` until `load_wkv_cuda_kernel` has run.
rwkv_cuda_kernel = None


def load_wkv_cuda_kernel(context_length):
    """
    Load the RWKV CUDA kernel for `context_length` into the module-level `rwkv_cuda_kernel`.

    Does nothing when a kernel is already loaded for the same `context_length`. Otherwise the kernel sources are
    built and loaded with `torch.utils.cpp_extension.load`, and the loaded module is tagged with
    `max_seq_length = context_length`.

    Args:
        context_length: Value passed to the kernel build as `-DTmax` and recorded as `max_seq_length`.
    """
    from torch.utils.cpp_extension import load as load_kernel

    global rwkv_cuda_kernel

    # Reuse the loaded kernel when it was built for the same context length.
    already_loaded = rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == context_length
    if already_loaded:
        return

    # The kernel sources live in `kernels/rwkv`, three directories above this file.
    source_dir = Path(__file__).resolve().parent.parent.parent / "kernels" / "rwkv"
    sources = [source_dir / filename for filename in ("wkv_op.cpp", "wkv_cuda.cu", "wkv_cuda_bf16.cu")]

    logger.info(f"Loading CUDA kernel for RWKV at context length of {context_length}.")

    # Flags forwarded to the CUDA compiler; `Tmax` bakes the context length into the kernel.
    nvcc_flags = [
        "-res-usage",
        "--maxrregcount 60",
        "--use_fast_math",
        "-O3",
        "-Xptxas -O3",
        "--extra-device-vectorization",
        f"-DTmax={context_length}",
    ]
    be_verbose = logging.get_verbosity() == logging.DEBUG
    rwkv_cuda_kernel = load_kernel(
        name=f"wkv_{context_length}",
        sources=sources,
        verbose=be_verbose,
        extra_cuda_cflags=nvcc_flags,
    )
    rwkv_cuda_kernel.max_seq_length = context_length


class RwkvLinearAttention(torch.autograd.Function):
    """
    Autograd function that evaluates the WKV linear attention through the compiled `rwkv_cuda_kernel` extension,
    with a matching backward pass implemented by the same extension.
    """

    @staticmethod
    # Forward pass: validates the inputs, then lets the CUDA kernel fill `output` (and `state` when one is used).
    def forward(ctx, time_decay, time_first, key, value, state=None, return_state=False):
        """
        Run the WKV CUDA kernel.

        Returns:
            A tuple `(output, state)`: `output` is cast back to the dtype `key` had on entry; `state` is `None` when
            no state was given and `return_state` is false, otherwise a list of three tensors.

        Raises:
            ValueError: if the sequence is longer than the kernel's `max_seq_length`, if `batch_size * hidden_size`
                is not a multiple of `min(hidden_size, 32)`, or if any of the four input tensors is not on a CUDA
                device.
        """
        # `key` must be 3-D; its sizes are used as batch size, sequence length and hidden size
        batch_size, seq_len, hidden_size = key.size()
        # The kernel was compiled for a maximum sequence length; reject longer inputs
        if seq_len > rwkv_cuda_kernel.max_seq_length:
            raise ValueError(
                f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of "
                f"{rwkv_cuda_kernel.max_seq_length} with this model."
            )
        # batch_size * hidden_size must be a multiple of min(hidden_size, 32)
        if batch_size * hidden_size % min(hidden_size, 32) != 0:
            raise ValueError(
                f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round "
                f"multiple of {min(hidden_size, 32)}."
            )

        # Remember the dtype of `key` so the outputs and gradients can be cast back to it
        ctx.input_dtype = key.dtype

        # All four tensors must live on a CUDA device
        if (
            time_decay.device.type != "cuda"
            or time_first.device.type != "cuda"
            or key.device.type != "cuda"
            or value.device.type != "cuda"
        ):
            raise ValueError("Calling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.")

        # time_decay is cast to float, made contiguous, exponentiated and negated
        time_decay = -torch.exp(time_decay.float().contiguous())
        # float16 inputs are upcast to float32 before being handed to the kernel
        if key.dtype == torch.float16:
            time_first = time_first.float()
            key = key.float()
            value = value.float()
        # Make sure time_first, key and value are contiguous
        time_first = time_first.contiguous()
        key = key.contiguous()
        value = value.contiguous()

        # Allocate a contiguous output tensor shaped like `key`.
        # The CUDA kernel will fill this tensor.
        output = torch.empty_like(key, memory_format=torch.contiguous_format)

        # Stateful path: taken when a state was provided or the caller asked for one back
        if return_state or state is not None:
            # No incoming state: start from zeros, with index 2 of the last dimension set to -1e38
            # (presumably the running-max slot, mirroring the CPU implementation — TODO confirm)
            # NOTE(review): `memory_format` is not among torch.zeros' documented keyword arguments — confirm this
            # call is accepted by the supported torch versions.
            if state is None:
                state = torch.zeros(
                    batch_size,
                    hidden_size,
                    3,
                    dtype=torch.float32,
                    device=key.device,
                    memory_format=torch.contiguous_format,
                )
                state[:, :, 2] -= 1e38
            else:
                # Incoming state: stack its tensors along a new last dimension and make the result contiguous
                state = torch.cat([s.unsqueeze(2) for s in state], dim=2).contiguous()
            # Pick the bf16 or default stateful kernel entry point based on the dtype of `key`
            if key.dtype == torch.bfloat16:
                forward_func = rwkv_cuda_kernel.forward_with_state_bf16
            else:
                forward_func = rwkv_cuda_kernel.forward_with_state
            # Run the kernel, passing the state along
            forward_func(time_decay, time_first, key, value, output, state)
        else:
            # Stateless path: pick the bf16 or default kernel entry point based on the dtype of `key`
            forward_func = rwkv_cuda_kernel.forward_bf16 if key.dtype == torch.bfloat16 else rwkv_cuda_kernel.forward
            # Run the kernel without a state
            forward_func(time_decay, time_first, key, value, output)

        # Save the (converted) inputs and the output for the backward pass
        ctx.save_for_backward(time_decay, time_first, key, value, output)

        # Split the stacked state back into three tensors along its last dimension
        if state is not None:
            state = [s.squeeze(2) for s in torch.chunk(state, 3, dim=2)]

        # Cast the output back to the input dtype and return it together with the state
        return output.to(ctx.input_dtype), state

    @staticmethod
    # g stands for grad
    def backward(ctx, g_output, g_state=None):
        """
        Compute the gradients with the CUDA kernel.

        Returns:
            Gradients for `time_decay`, `time_first`, `key` and `value`, cast to the forward input dtype, followed
            by `None` for `state` and `return_state`.
        """
        # dtype recorded during the forward pass
        input_dtype = ctx.input_dtype

        # Recover the tensors saved by forward
        time_decay, time_first, key, value, output = ctx.saved_tensors
        # The CUDA kernel will fill those tensors.

        # Gradient buffers; g_time_decay is bfloat16 for bfloat16 inputs and float32 otherwise
        g_time_decay = torch.empty_like(
            time_decay,
            memory_format=torch.contiguous_format,
            dtype=torch.bfloat16 if input_dtype == torch.bfloat16 else torch.float32,
        )
        g_time_first = torch.empty_like(time_first, memory_format=torch.contiguous_format)
        g_key = torch.empty_like(key, memory_format=torch.contiguous_format)
        g_value = torch.empty_like(value, memory_format=torch.contiguous_format)

        # For float16 inputs the incoming gradient is upcast to float32
        if input_dtype == torch.float16:
            g_output = g_output.float()

        # Pick the bf16 or default backward entry point
        backward_func = rwkv_cuda_kernel.backward_bf16 if input_dtype == torch.bfloat16 else rwkv_cuda_kernel.backward
        backward_func(
            time_decay,
            time_first,
            key,
            value,
            output,
            g_output.contiguous(),  # contiguous version of g_output
            g_time_decay,
            g_time_first,
            g_key,
            g_value,
        )

        # Cast the gradients back to the input dtype; `state` and `return_state` get no gradient
        return (
            g_time_decay.to(input_dtype),
            g_time_first.to(input_dtype),
            g_key.to(input_dtype),
            g_value.to(input_dtype),
            None,
            None,
        )
# Pure-PyTorch fallback for the WKV linear attention. When not run under torch.no_grad it may be slower and use
# more memory than the custom CUDA kernel.
def rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=None, return_state=False):
    """
    Evaluate the WKV linear attention step by step along dimension 1 of `key`.

    Args:
        time_decay: tensor turned into a per-step decay via `-exp(time_decay)`.
        time_first: tensor added to the current key when computing the output of a step.
        key: 3-D tensor; iterated over its second dimension.
        value: tensor indexed along its second dimension in lockstep with `key`.
        state: optional `(num_state, den_state, max_state)` triple to resume from.
        return_state: when true, the updated state is returned even if none was passed in.

    Returns:
        A tuple `(output, state)`. `output` has the shape and dtype of `key`. `state` is the list
        `[num_state, den_state, max_state]` when a state was given or requested, otherwise whatever was passed in
        (i.e. `None`).
    """
    _, num_steps, _ = key.size()
    output = torch.zeros_like(key)

    resuming = state is not None
    if resuming:
        num_state, den_state, max_state = state
    else:
        # Fresh start: zero numerator/denominator and a very negative running maximum, all in float32.
        first_step = key[:, 0]
        num_state = torch.zeros_like(first_step, dtype=torch.float32)
        den_state = torch.zeros_like(first_step, dtype=torch.float32)
        max_state = torch.zeros_like(first_step, dtype=torch.float32) - 1e38

    # The states are kept relative to `max_state` (exponent subtracted) for numerical stability.
    decay = -torch.exp(time_decay)

    for step in range(num_steps):
        key_t = key[:, step].float()
        value_t = value[:, step]

        # Output at this step: past state and current token weighted relative to their common maximum.
        boosted_key = key_t + time_first
        output_max = torch.maximum(max_state, boosted_key)
        past_weight = torch.exp(max_state - output_max)
        current_weight = torch.exp(boosted_key - output_max)
        wkv = (past_weight * num_state + current_weight * value_t) / (past_weight * den_state + current_weight)
        output[:, step] = wkv.to(output.dtype)

        # State carried to the next step: decay the past, then fold in the current token.
        decayed_max = max_state + decay
        next_max = torch.maximum(decayed_max, key_t)
        past_weight = torch.exp(decayed_max - next_max)
        current_weight = torch.exp(key_t - next_max)
        num_state = past_weight * num_state + current_weight * value_t
        den_state = past_weight * den_state + current_weight
        max_state = next_max

    if return_state or resuming:
        state = [num_state, den_state, max_state]

    return output, state


# Entry point for the WKV linear attention: dispatches to the pure-PyTorch path or to the custom CUDA kernel.
def rwkv_linear_attention(time_decay, time_first, key, value, state=None, return_state=False):
    """
    Compute the WKV linear attention with the best available implementation.

    The CUDA kernel is used only when it has been loaded, all four tensors are on a CUDA device and `key` holds more
    than one step along dimension 1; in every other case `rwkv_linear_attention_cpu` is called.

    Returns:
        The `(output, state)` pair produced by the selected implementation.
    """
    all_on_cuda = all(t.device.type == "cuda" for t in (time_decay, time_first, key, value))
    single_token = key.size(1) == 1

    use_cuda_kernel = rwkv_cuda_kernel is not None and all_on_cuda and not single_token
    if use_cuda_kernel:
        return RwkvLinearAttention.apply(time_decay, time_first, key, value, state, return_state)
    return rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=state, return_state=return_state)
    # Constructor of the RWKV attention module.
    # NOTE(review): no `class` statement precedes this method in this file. RwkvBlock instantiates
    # `RwkvSelfAttention(config, layer_id)`, so presumably that class header was lost here — confirm and restore.
    def __init__(self, config, layer_id=0):
        """
        Set up the parameters and projections of the attention module.

        Args:
            config: configuration object; `context_length`, `hidden_size` and `attention_hidden_size` are read.
            layer_id: index of the layer this module belongs to (stored as `self.layer_id`).
        """
        super().__init__()
        self.config = config
        self.layer_id = layer_id

        # Best effort: build the custom CUDA kernel when ninja and CUDA are available and no kernel matching
        # this context length is loaded yet. A failure is only logged.
        kernel_loaded = rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == config.context_length
        if is_ninja_available() and is_torch_cuda_available() and not kernel_loaded:
            try:
                load_wkv_cuda_kernel(config.context_length)
            except Exception:
                logger.info("Could not load the custom CUDA kernel for RWKV attention.")

        hidden_size = config.hidden_size
        # The attention width falls back to the hidden size when the config leaves it unset.
        attention_hidden_size = config.attention_hidden_size
        if attention_hidden_size is None:
            attention_hidden_size = hidden_size
        self.attention_hidden_size = attention_hidden_size

        def uninitialized(*shape):
            # Parameter backed by uninitialized memory (torch.empty); values must be set elsewhere.
            return nn.Parameter(torch.empty(*shape))

        def projection(in_features, out_features):
            # Bias-free linear layer.
            return nn.Linear(in_features, out_features, bias=False)

        # Per-channel decay and "first token" parameters of the attention.
        self.time_decay = uninitialized(attention_hidden_size)
        self.time_first = uninitialized(attention_hidden_size)

        # Mixing coefficients between the current hidden state and the shifted one.
        self.time_mix_key = uninitialized(1, 1, hidden_size)
        self.time_mix_value = uninitialized(1, 1, hidden_size)
        self.time_mix_receptance = uninitialized(1, 1, hidden_size)

        # ZeroPad2d with (left, right, top, bottom) = (0, 0, 1, -1).
        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
        # hidden_size -> attention_hidden_size projections, then the projection back to hidden_size.
        self.key = projection(hidden_size, attention_hidden_size)
        self.value = projection(hidden_size, attention_hidden_size)
        self.receptance = projection(hidden_size, attention_hidden_size)
        self.output = projection(attention_hidden_size, hidden_size)

    # TODO: maybe jit, otherwise move inside forward
    def extract_key_value(self, hidden, state=None):
        """
        Mix `hidden` with its time-shifted version and project the result to receptance, key and value.

        When `state` is given, `state[1][:, :, self.layer_id]` supplies the hidden state preceding `hidden`, and is
        overwritten in place with `hidden[:, -1]` before returning.

        Returns:
            A tuple `(receptance, key, value, state)`; `receptance` has gone through a sigmoid.
        """
        has_state = state is not None
        single_step = hidden.size(1) == 1

        if single_step and has_state:
            # One step with a state: the previous hidden state comes straight from the state.
            shifted = state[1][:, :, self.layer_id]
        else:
            shifted = self.time_shift(hidden)
            if has_state:
                # The first position is replaced by the hidden state stored in the state.
                shifted[:, 0] = state[1][:, :, self.layer_id]

        def blend(coefficient):
            # Interpolate between the current and the shifted hidden states.
            return hidden * coefficient + shifted * (1 - coefficient)

        key = self.key(blend(self.time_mix_key))
        value = self.value(blend(self.time_mix_value))
        receptance = torch.sigmoid(self.receptance(blend(self.time_mix_receptance)))

        if has_state:
            # Store the last hidden state for the next call.
            state[1][:, :, self.layer_id] = hidden[:, -1]
        return receptance, key, value, state
    # Forward pass of the attention module; optionally threads the recurrent state through.
    def forward(self, hidden, state=None, use_cache=False):
        """
        Apply the attention to `hidden`.

        Args:
            hidden: input hidden states.
            state: optional sequence of state tensors; entries 2, 3 and 4 are read and updated in place at index
                `self.layer_id` of their last dimension (entry 1 is handled by `extract_key_value`).
            use_cache: forwarded to `rwkv_linear_attention` as `return_state`.

        Returns:
            A tuple `(output, state)` where `output` is the output projection of `receptance * rwkv`.
        """
        receptance, key, value, state = self.extract_key_value(hidden, state=state)

        # Slice this layer's attention state out of the model-wide state, if there is one.
        if state is None:
            layer_state = None
        else:
            layer_state = tuple(s[:, :, self.layer_id] for s in state[2:])

        rwkv, layer_state = rwkv_linear_attention(
            self.time_decay,
            self.time_first,
            key,
            value,
            state=layer_state,
            return_state=use_cache,
        )

        # Write the three updated per-layer tensors back into entries 2..4 of the state.
        if layer_state is not None:
            for offset in range(3):
                state[2 + offset][:, :, self.layer_id] = layer_state[offset]

        return self.output(receptance * rwkv), state
# Feed-forward module of an RWKV block, implemented as an nn.Module.
class RwkvFeedForward(nn.Module):
    """
    Gated feed-forward module: a squared-ReLU key projection followed by a value projection, multiplied by a
    sigmoid receptance gate. Inputs are first mixed with their time-shifted version.
    """

    def __init__(self, config, layer_id=0):
        """
        Args:
            config: configuration object; `hidden_size` and `intermediate_size` are read.
            layer_id: index of the layer this module belongs to (stored as `self.layer_id`).
        """
        super().__init__()
        self.config = config
        self.layer_id = layer_id

        hidden_size = config.hidden_size
        # The intermediate width defaults to four times the hidden size when the config leaves it unset.
        intermediate_size = config.intermediate_size
        if intermediate_size is None:
            intermediate_size = 4 * config.hidden_size

        # ZeroPad2d with (left, right, top, bottom) = (0, 0, 1, -1).
        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
        # Mixing coefficients; created uninitialized (torch.empty), values must be set elsewhere.
        self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
        self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))

        # Bias-free projections: hidden -> intermediate, hidden -> hidden, intermediate -> hidden.
        self.key = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.receptance = nn.Linear(hidden_size, hidden_size, bias=False)
        self.value = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, hidden, state=None):
        """
        Apply the feed-forward module to `hidden`.

        When `state` is given, `state[0][:, :, self.layer_id]` supplies the hidden state preceding `hidden`, and is
        overwritten in place with `hidden[:, -1]` before returning.

        Returns:
            A tuple `(receptance * value, state)`.
        """
        has_state = state is not None

        if hidden.size(1) == 1 and has_state:
            # One step with a state: the previous hidden state comes straight from the state.
            shifted = state[0][:, :, self.layer_id]
        else:
            shifted = self.time_shift(hidden)
            if has_state:
                shifted[:, 0] = state[0][:, :, self.layer_id]

        # Interpolate between the current and the shifted hidden states.
        key_input = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
        gate_input = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)

        # Squared ReLU on the key projection, then the value projection.
        activated = torch.square(torch.relu(self.key(key_input)))
        value = self.value(activated)
        receptance = torch.sigmoid(self.receptance(gate_input))

        if has_state:
            # Store the last hidden state for the next call.
            state[0][:, :, self.layer_id] = hidden[:, -1]

        return receptance * value, state


# One RWKV layer, implemented as an nn.Module.
class RwkvBlock(nn.Module):
    """
    A single RWKV layer: an attention branch and a feed-forward branch, each applied to a LayerNorm of the hidden
    states and added back residually. The block with `layer_id == 0` additionally normalizes its input first.
    """

    def __init__(self, config, layer_id):
        """
        Args:
            config: configuration object; `hidden_size` and `layer_norm_epsilon` are read here.
            layer_id: index of this block; also handed to the attention and feed-forward sub-modules.
        """
        super().__init__()
        self.config = config
        self.layer_id = layer_id

        def layer_norm():
            return nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)

        # Only the first block owns the extra input normalization.
        if layer_id == 0:
            self.pre_ln = layer_norm()

        # Normalizations applied before the attention and before the feed-forward branch.
        self.ln1 = layer_norm()
        self.ln2 = layer_norm()

        self.attention = RwkvSelfAttention(config, layer_id)
        self.feed_forward = RwkvFeedForward(config, layer_id)

    def forward(self, hidden, state=None, use_cache=False, output_attentions=False):
        """
        Run the block.

        Returns:
            A 3-tuple `(hidden, state, attention)`; the last entry is the attention branch output when
            `output_attentions` is truthy and `None` otherwise.
        """
        if self.layer_id == 0:
            hidden = self.pre_ln(hidden)

        # Attention branch with residual connection.
        attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache)
        hidden = hidden + attention

        # Feed-forward branch with residual connection.
        feed_forward, state = self.feed_forward(self.ln2(hidden), state=state)
        hidden = hidden + feed_forward

        return (hidden, state, attention if output_attentions else None)


# Abstract base class of the RWKV models, derived from PreTrainedModel.
class RwkvPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with these models
    config_class = RwkvConfig
    # Attribute prefix of the base model
    base_model_prefix = "rwkv"
    # Modules listed as not to be split
    _no_split_modules = ["RwkvBlock"]
    # Names listed to be kept in float32
    _keep_in_fp32_modules = ["time_decay", "time_first"]
    # Gradient checkpointing is supported
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """
        Initialize the RWKV-specific parameters of `module`.

        Only `RwkvSelfAttention` and `RwkvFeedForward` instances are handled; any other module is left untouched.
        The values depend on the module's `layer_id` relative to `config.num_hidden_layers`.
        """
        if isinstance(module, RwkvSelfAttention):
            layer_id = module.layer_id
            num_hidden_layers = module.config.num_hidden_layers
            hidden_size = module.config.hidden_size
            attention_hidden_size = module.attention_hidden_size

            # Position of the layer in the stack, from 0 (first) to 1 (last). The max(..., 1) keeps a
            # single-layer model from dividing by zero (the ratio is then 0).
            ratio_0_to_1 = layer_id / max(num_hidden_layers - 1, 1)  # 0 to 1
            ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0

            # Per-channel ramp i / hidden_size, broadcastable against the (1, 1, hidden_size) mixing parameters
            time_weight = torch.tensor(
                [i / hidden_size for i in range(hidden_size)],
                dtype=module.time_mix_key.dtype,
                device=module.time_mix_key.device,
            )
            time_weight = time_weight[None, None, :]

            # Per-channel decay from -5 up to 3, with a layer-dependent exponent. The max(..., 1) keeps a
            # single-channel attention from dividing by zero.
            decay_speed = [
                -5 + 8 * (h / max(attention_hidden_size - 1, 1)) ** (0.7 + 1.3 * ratio_0_to_1)
                for h in range(attention_hidden_size)
            ]
            decay_speed = torch.tensor(decay_speed, dtype=module.time_decay.dtype, device=module.time_decay.device)
            # Repeating 0, 0.5, -0.5 offset pattern added to time_first
            zigzag = (
                torch.tensor(
                    [(i + 1) % 3 - 1 for i in range(attention_hidden_size)],
                    dtype=module.time_first.dtype,
                    device=module.time_first.device,
                )
                * 0.5
            )

            with torch.no_grad():
                module.time_decay.data = decay_speed
                # time_first = log(0.3) + zigzag. The scaling and offset must stay outside ones_like, which only
                # copies shape/dtype/device and would otherwise discard them and produce all ones.
                module.time_first.data = torch.ones_like(module.time_first) * math.log(0.3) + zigzag

                module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
                module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1
                module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0)

        elif isinstance(module, RwkvFeedForward):
            layer_id = module.layer_id
            num_hidden_layers = module.config.num_hidden_layers
            hidden_size = module.config.hidden_size

            ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0

            # Per-channel ramp i / hidden_size, broadcastable against the (1, 1, hidden_size) mixing parameters
            time_weight = torch.tensor(
                [i / hidden_size for i in range(hidden_size)],
                dtype=module.time_mix_key.dtype,
                device=module.time_mix_key.device,
            )
            time_weight = time_weight[None, None, :]

            with torch.no_grad():
                module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
                module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0)
# Dataclass (via the @dataclass decorator) wrapping the outputs of the RWKV model
@dataclass
class RwkvOutput(ModelOutput):
    """
    Class for the RWKV model outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Output fields of the RWKV model
    last_hidden_state: torch.FloatTensor = None  # hidden states at the output of the last layer
    state: Optional[List[torch.FloatTensor]] = None  # state of the model at the last time step
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None  # hidden states of each layer
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None  # per-layer attention outputs
    # NOTE(review): the string and fields below document `loss` and `logits`, i.e. a language-modeling output, but
    # no `@dataclass` / `class` header precedes them in this file. Presumably the header of a separate causal-LM
    # output dataclass was lost here — confirm and restore it.
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Language modeling loss (next-token prediction); returned when `labels` is provided
    loss: Optional[torch.FloatTensor] = None
    # Prediction scores of the language modeling head (per-token vocabulary scores before SoftMax)
    logits: torch.FloatTensor = None
    # State of the model at the last time step; can be passed along with the next `input_ids`
    state: Optional[List[torch.FloatTensor]] = None
    # Hidden states at the output of each layer (plus the embedding output, if any)
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Per-layer attention outputs
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
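# RWKV_START_DOCSTRING is a multi-line string used as the shared docstring of the model classes below.
# It explains that the model inherits from PreTrainedModel and points to the generic methods the library implements
# for all models (downloading or saving a model, resizing the input embeddings, pruning heads, etc.).
# It also states that the model is a PyTorch torch.nn.Module subclass that can be used like any regular PyTorch
# module; refer to the PyTorch documentation for everything related to general usage and behavior.
# NOTE: the assignment of RWKV_START_DOCSTRING itself does not appear in this file, although the name is used below.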

# Docstring template describing the arguments of the RWKV `forward` methods. Everything inside the literal is
# rendered into the generated documentation, so it must not contain `#` annotation lines.
RWKV_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            This is currently not used by `RwkvModel`, but will be supported in the future.

            [What are attention masks?](../glossary#attention-mask)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# RwkvModel: the bare RWKV model class, which inherits from RwkvPreTrainedModel.

@add_start_docstrings(
    "The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.",
    RWKV_START_DOCSTRING,
)
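# The decorator above attaches the docstring describing this as the bare RWKV model, which outputs raw hidden
# states without any specific head on top.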

class RwkvModel(RwkvPreTrainedModel):
    def __init__(self, config):
        """
        Build the token embeddings, the stack of RWKV blocks and the output LayerNorm.

        Args:
            config: model configuration; `vocab_size`, `hidden_size` and `num_hidden_layers` are read here.
        """
        super().__init__(config)

        # Token embedding table with `vocab_size` entries of width `hidden_size`.
        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)

        # One RwkvBlock per hidden layer; each block is told its own index through `layer_id`.
        layers = [RwkvBlock(config, layer_id=layer_idx) for layer_idx in range(config.num_hidden_layers)]
        self.blocks = nn.ModuleList(layers)

        # LayerNorm over the hidden dimension.
        self.ln_out = nn.LayerNorm(config.hidden_size)

        # Tracks whether the block weights are currently in their rescaled form (see `_rescale_layers`).
        self.layers_are_rescaled = False

        # Gradient checkpointing flag, off by default.
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    # Return the module stored as the input embedding layer
    def get_input_embeddings(self):
        return self.embeddings

    # Replace the input embedding layer with `new_embeddings`
    def set_input_embeddings(self, new_embeddings):
        self.embeddings = new_embeddings

    @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=RwkvOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # The forward method: takes the inputs described in RWKV_INPUTS_DOCSTRING and runs the model's forward pass.

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,  # noqa
        inputs_embeds: Optional[torch.FloatTensor] = None,
        state: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    def _rescale_layers(self):
        """
        Bring the block weights into the form matching the current mode (rescaled for inference, original for
        training).

        For block `block_id` the factor is `2 ** (block_id // config.rescale_every)`. It is applied to the weights
        of `attention.output` and `feed_forward.value`: divided out when not training, multiplied back in when
        training. Nothing is rescaled when `config.rescale_every` is not positive, and the call is a no-op when the
        weights are already in the right form.
        """
        # Layers should be rescaled for inference only.
        should_be_rescaled = not self.training
        if self.layers_are_rescaled == should_be_rescaled:
            return

        rescale_every = self.config.rescale_every
        if rescale_every > 0:
            with torch.no_grad():
                for block_id, block in enumerate(self.blocks):
                    factor = 2 ** int(block_id // rescale_every)
                    attention_output = block.attention.output
                    feed_forward_value = block.feed_forward.value

                    if self.training:
                        # Going back to training: undo the division.
                        attention_output.weight.mul_(factor)
                        feed_forward_value.weight.mul_(factor)
                    elif hasattr(attention_output.weight, "SCB"):
                        # Weights carrying an `SCB` attribute: rescale that attribute instead.
                        attention_output.weight.SCB.div_(factor)
                        feed_forward_value.weight.SCB.div_(factor)
                    elif hasattr(attention_output.weight, "quant_state"):
                        # Weights carrying a `quant_state`: dequantize, rescale and quantize again.
                        self._bnb_4bit_dequantize_and_rescale(attention_output, block_id)
                        self._bnb_4bit_dequantize_and_rescale(feed_forward_value, block_id)
                    else:
                        # Plain weights: rescale in place.
                        attention_output.weight.div_(factor)
                        feed_forward_value.weight.div_(factor)

        self.layers_are_rescaled = should_be_rescaled

    def _bnb_4bit_dequantize_and_rescale(self, target_layer, block_id):
        r"""
        Dequantize the 4-bit weight of `target_layer`, divide it by `2 ** (block_id // config.rescale_every)` and
        store it back on the layer as a new `bnb.nn.Params4bit`, so the layer ends up quantized again.

        Raises:
            ImportError: if bitsandbytes is not available.
        """
        if not is_bitsandbytes_available():
            raise ImportError("Please install bitsandbytes to use this method.")
        import bitsandbytes as bnb

        packed_weight = target_layer.weight
        rescaled = bnb.functional.dequantize_4bit(packed_weight.data, packed_weight.quant_state)
        rescaled.div_(2 ** int(block_id // self.config.rescale_every))

        # Re-quantize the weights: the Params4bit is built from a CPU copy and then moved back to the device the
        # dequantized weights were on, which is how the original code handles quantization.
        requantized = bnb.nn.Params4bit(rescaled.to("cpu"), requires_grad=False).to(rescaled.device)
        target_layer.weight = requantized
@add_start_docstrings(
    """
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    RWKV_START_DOCSTRING,
)
class RwkvForCausalLM(RwkvPreTrainedModel):
    _tied_weights_keys = ["head.weight"]

    def __init__(self, config):
        super().__init__(config)
        # Backbone that produces the hidden states, plus a bias-free projection onto the vocabulary.
        self.rwkv = RwkvModel(config)
        self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        # Returns the output projection module itself (not only its weight tensor).
        return self.head

    def set_output_embeddings(self, new_embeddings):
        # Replaces the output projection module.
        self.head = new_embeddings

    def generate(self, *args, **kwargs):
        # Thin wrapper to raise exceptions when trying to generate with methods that manipulate `past_key_values`.
        # RWKV is one of the few models that don't have it (it has `state` instead, which has different properties and
        # usage).
        try:
            gen_output = super().generate(*args, **kwargs)
        except AttributeError as exc:
            # Expected exception: "AttributeError: '(object name)' object has no attribute 'past_key_values'"
            if "past_key_values" in str(exc):
                # Chain the original error so the failing attribute access stays visible in the traceback.
                raise AttributeError(
                    "You tried to call `generate` with a decoding strategy that manipulates `past_key_values`. RWKV "
                    "doesn't have that attribute, try another generation strategy instead. For the available "
                    "generation strategies, check this doc: https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies"
                ) from exc
            # Any other AttributeError is unrelated: re-raise it untouched.
            raise
        return gen_output

    def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, **kwargs):
        # only last token for inputs_ids if the state is passed along.
        if state is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and state is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        # The state (possibly None) is always forwarded alongside the token inputs.
        model_inputs["state"] = state
        return model_inputs

    @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=RwkvCausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,  # noqa
        inputs_embeds: Optional[torch.FloatTensor] = None,
        state: Optional[List[torch.FloatTensor]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, RwkvCausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        # Fall back to the configuration default when `return_dict` is not given.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # NOTE: `attention_mask` is part of the signature but is not passed on to `self.rwkv` below.
        rwkv_outputs = self.rwkv(
            input_ids,
            inputs_embeds=inputs_embeds,
            state=state,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # The first output element holds the hidden states.
        hidden_states = rwkv_outputs[0]

        # Project the hidden states onto the vocabulary.
        logits = self.head(hidden_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            # Shift so that the logits at position i are scored against the label at position i + 1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens before computing the cross-entropy loss.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        # Tuple output: (loss,) is prepended only when a loss was computed.
        if not return_dict:
            output = (logits,) + rwkv_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return RwkvCausalLMOutput(
            loss=loss,
            logits=logits,
            state=rwkv_outputs.state,
            hidden_states=rwkv_outputs.hidden_states,
            attentions=rwkv_outputs.attentions,
        )

`.\models\rwkv\__init__.py`

# 版权声明和许可证信息，指出此代码版权归HuggingFace团队所有，并遵循Apache License, Version 2.0。
#
# 如果不满足许可证的要求，禁止使用此文件。可以从以下链接获取许可证的副本：
# http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按"原样"提供，不附带任何明示或暗示的保证或条件。
# 有关详细信息，请参阅许可证。

from typing import TYPE_CHECKING

# 从utils模块中导入所需的类和函数
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)

# Import structure handed to `_LazyModule` below: submodule name -> public names that submodule provides.
_import_structure = {
    "configuration_rwkv": ["RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP", "RwkvConfig", "RwkvOnnxConfig"],
}

# The modeling submodule is only registered when torch is available; otherwise the
# OptionalDependencyNotAvailable raised here is swallowed and the entry is simply left out.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: add the names exported by `modeling_rwkv` to the import structure.
    _import_structure["modeling_rwkv"] = [
        "RWKV_PRETRAINED_MODEL_ARCHIVE_LIST",
        "RwkvForCausalLM",
        "RwkvModel",
        "RwkvPreTrainedModel",
    ]

# Under static type checking, import the names eagerly (mirrors `_import_structure` above).
if TYPE_CHECKING:
    from .configuration_rwkv import RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP, RwkvConfig, RwkvOnnxConfig

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_rwkv import (
            RWKV_PRETRAINED_MODEL_ARCHIVE_LIST,
            RwkvForCausalLM,
            RwkvModel,
            RwkvPreTrainedModel,
        )
# At runtime, replace this module's entry in `sys.modules` with a `_LazyModule` built from `_import_structure`.
else:
    import sys

    # `_LazyModule` receives this module's name, file, import structure and spec.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\sam\configuration_sam.py`

# 设置文件编码为 UTF-8
# 版权声明，指出版权归 HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证 2.0 版本授权使用本文件，除非遵守许可证的条款，否则不得使用此文件
# 可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，本软件是基于“原样”分发的，不提供任何形式的担保或条件，无论是明示的还是隐含的
# 请参阅许可证了解具体的法律规定
""" SAM 模型配置"""


# 从配置工具中导入预训练配置类
from ...configuration_utils import PretrainedConfig
# 从工具包中导入日志记录模块
from ...utils import logging


# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Maps each SAM checkpoint identifier to the URL of its `config.json`.
SAM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/sam-vit-huge": "https://huggingface.co/facebook/sam-vit-huge/resolve/main/config.json",
    "facebook/sam-vit-large": "https://huggingface.co/facebook/sam-vit-large/resolve/main/config.json",
    "facebook/sam-vit-base": "https://huggingface.co/facebook/sam-vit-base/resolve/main/config.json",
}


# Configuration of the SAM prompt encoder; inherits from `PretrainedConfig`.
class SamPromptEncoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamPromptEncoder`]. The [`SamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes. Instantiating a configuration with the defaults
    will yield a similar configuration to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon value for layer normalization.

    In addition to the arguments above, `image_embedding_size` is derived as `image_size // patch_size`.
    """

    def __init__(
        self,
        hidden_size=256,
        image_size=1024,
        patch_size=16,
        mask_input_channels=16,
        num_point_embeddings=4,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        **kwargs,
    ):
        # Remaining keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)

        # Geometry: the embedding grid has one cell per patch along each image side.
        self.hidden_size = hidden_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.image_embedding_size = image_size // patch_size

        # Prompt-specific settings.
        self.mask_input_channels = mask_input_channels
        self.num_point_embeddings = num_point_embeddings
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
# `SamMaskDecoderConfig` stores the configuration of a `SamMaskDecoder`.
# It inherits from `PretrainedConfig`.
# It is used to instantiate a `SamMaskDecoder` and defines the model architecture.
# With the defaults, the resulting configuration is similar to the `facebook/sam-vit-huge` architecture.

class SamMaskDecoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
    mask decoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            The non-linear activation function used inside the `SamMaskDecoder` module.
        mlp_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsampling rate of the attention layer.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The number of layers in the IoU head module.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The dimensionality of the hidden states in the IoU head module.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.

    """

    # Stores every constructor argument as an attribute of the same name.
    def __init__(
        self,
        hidden_size=256,
        hidden_act="relu",
        mlp_dim=2048,
        num_hidden_layers=2,
        num_attention_heads=8,
        attention_downsample_rate=2,
        num_multimask_outputs=3,
        iou_head_depth=3,
        iou_head_hidden_dim=256,
        layer_norm_eps=1e-6,
        **kwargs,
    ):
        # Remaining keyword arguments are forwarded to the base class constructor.
        super().__init__(**kwargs)
        # Plain one-to-one copies of the arguments; nothing is derived here.
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_dim = mlp_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_downsample_rate = attention_downsample_rate
        self.num_multimask_outputs = num_multimask_outputs
        self.iou_head_depth = iou_head_depth
        self.iou_head_hidden_dim = iou_head_hidden_dim
        self.layer_norm_eps = layer_norm_eps


class SamVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of the SAM ViT-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        output_channels (`int`, *optional*, defaults to 256):
            Dimensionality of the output channels in the Patch Encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input image.
        image_size (`int`, *optional*, defaults to 1024):
            Expected resolution of the input image.
        patch_size (`int`, *optional*, defaults to 16):
            Size of the patches to be extracted from the input image.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 1e-10):
            The standard deviation of the truncated normal distribution used to initialize all weight matrices.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the query, key and value projections.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of the MLP hidden dimension to the embedding dimension.
        use_abs_pos (`bool`, *optional*, defaults to `True`):
            Whether to use absolute position embeddings.
        use_rel_pos (`bool`, *optional*, defaults to `True`):
            Whether to use relative position embeddings.
        window_size (`int`, *optional*, defaults to 14):
            Window size for relative position.
        global_attn_indexes (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
            The indexes of the global attention layers.
        num_pos_feats (`int`, *optional*, defaults to 128):
            The dimensionality of the position embedding.
        mlp_dim (`int`, *optional*):
            The dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio *
            hidden_size`.
    """

    def __init__(
        self,
        hidden_size=768,
        output_channels=256,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=1024,
        patch_size=16,
        hidden_act="gelu",
        layer_norm_eps=1e-06,
        attention_dropout=0.0,
        initializer_range=1e-10,
        qkv_bias=True,
        mlp_ratio=4.0,
        use_abs_pos=True,
        use_rel_pos=True,
        window_size=14,
        global_attn_indexes=[2, 5, 8, 11],
        num_pos_feats=128,
        mlp_dim=None,
        **kwargs,
    ):
        # Remaining keyword arguments are handled by the base configuration class.
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.output_channels = output_channels
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.qkv_bias = qkv_bias
        self.mlp_ratio = mlp_ratio
        self.use_abs_pos = use_abs_pos
        self.use_rel_pos = use_rel_pos
        self.window_size = window_size
        # The signature default is a list object shared by every call. Store a copy so that mutating one
        # config's `global_attn_indexes` can never leak into other instances. Non-list values are kept as-is.
        self.global_attn_indexes = (
            list(global_attn_indexes) if isinstance(global_attn_indexes, list) else global_attn_indexes
        )
        self.num_pos_feats = num_pos_feats
        # An explicit `mlp_dim` wins; otherwise it is derived from the hidden size and the MLP ratio.
        self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim
# `SamConfig` stores the configuration of a `SamModel`; it inherits from `PretrainedConfig`.
class SamConfig(PretrainedConfig):
    r"""
    [`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
    SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
    configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
    SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            Initializer range stored on the top-level configuration.

        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     SamConfig,
    ...     SamVisionConfig,
    ...     SamPromptEncoderConfig,
    ...     SamMaskDecoderConfig,
    ...     SamModel,
    ... )

    >>> # Initializing a SamConfig with `"facebook/sam-vit-huge"` style configuration
    >>> configuration = SamConfig()

    >>> # Initializing a SamModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
    >>> model = SamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a SamConfig from a SamVisionConfig, SamPromptEncoderConfig, and SamMaskDecoderConfig

    >>> # Initializing SAM vision, prompt encoder and mask decoder configurations
    >>> vision_config = SamVisionConfig()
    >>> prompt_encoder_config = SamPromptEncoderConfig()
    >>> mask_decoder_config = SamMaskDecoderConfig()

    >>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```"""

    model_type = "sam"

    def __init__(
        self,
        vision_config=None,
        prompt_encoder_config=None,
        mask_decoder_config=None,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Normalize each sub-config to a plain dict: `None` becomes an empty dict, a config
        # instance is converted through `to_dict()`, and anything else is used unchanged.
        if vision_config is None:
            vision_config = {}
        elif isinstance(vision_config, SamVisionConfig):
            vision_config = vision_config.to_dict()

        if prompt_encoder_config is None:
            prompt_encoder_config = {}
        elif isinstance(prompt_encoder_config, SamPromptEncoderConfig):
            prompt_encoder_config = prompt_encoder_config.to_dict()

        if mask_decoder_config is None:
            mask_decoder_config = {}
        elif isinstance(mask_decoder_config, SamMaskDecoderConfig):
            mask_decoder_config = mask_decoder_config.to_dict()

        # Always rebuild fresh sub-config objects from the normalized dicts.
        self.vision_config = SamVisionConfig(**vision_config)
        self.prompt_encoder_config = SamPromptEncoderConfig(**prompt_encoder_config)
        self.mask_decoder_config = SamMaskDecoderConfig(**mask_decoder_config)
        self.initializer_range = initializer_range

`.\transformers\models\sam\convert_sam_original_to_hf_format.py`

# 指定编码格式为 UTF-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
# 版权声明，保留所有权利
#
# 根据 Apache 许可证 2.0 版本授权
# 除非符合许可证的规定，否则不得使用此文件
# 您可以在以下网址获取许可证的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按"原样"分发软件
# 软件按"原样"分发，不附带任何形式的担保或条件，
# 包括但不限于默示担保或适销性或特定用途的适用性
# 有关许可证的详细信息，请参见许可证
"""
从原始存储库转换 SAM 检查点。
"""
# 导入所需模块
import argparse  # 导入命令行解析模块
import re  # 导入正则表达式模块

import numpy as np  # 导入 NumPy 库
import requests  # 导入 requests 库
import torch  # 导入 PyTorch 库
from huggingface_hub import hf_hub_download  # 导入 Hugging Face Hub 模块
from PIL import Image  # 导入 Python Imaging Library（PIL）模块

# 从 transformers 模块导入所需内容
from transformers import (
    SamConfig,  # 导入 SAM 配置类
    SamImageProcessor,  # 导入 SAM 图像处理器类
    SamModel,  # 导入 SAM 模型类
    SamProcessor,  # 导入 SAM 处理器类
    SamVisionConfig,  # 导入 SAM 视觉配置类
)

# Substring renames applied to every checkpoint key: original SAM name fragment -> Hugging Face name fragment.
# `replace_keys` applies the entries one after another in this order, so the order is significant
# (e.g. "iou_prediction_head.layers.0" must be renamed before "layers.1" is turned into "layers.0").
KEYS_TO_MODIFY_MAPPING = {
    "iou_prediction_head.layers.0": "iou_prediction_head.proj_in",
    "iou_prediction_head.layers.1": "iou_prediction_head.layers.0",
    "iou_prediction_head.layers.2": "iou_prediction_head.proj_out",
    "mask_decoder.output_upscaling.0": "mask_decoder.upscale_conv1",
    "mask_decoder.output_upscaling.1": "mask_decoder.upscale_layer_norm",
    "mask_decoder.output_upscaling.3": "mask_decoder.upscale_conv2",
    "mask_downscaling.0": "mask_embed.conv1",
    "mask_downscaling.1": "mask_embed.layer_norm1",
    "mask_downscaling.3": "mask_embed.conv2",
    "mask_downscaling.4": "mask_embed.layer_norm2",
    "mask_downscaling.6": "mask_embed.conv3",
    "point_embeddings": "point_embed",
    "pe_layer.positional_encoding_gaussian_matrix": "shared_embedding.positional_embedding",
    "image_encoder": "vision_encoder",
    "neck.0": "neck.conv1",
    "neck.1": "neck.layer_norm1",
    "neck.2": "neck.conv2",
    "neck.3": "neck.layer_norm2",
    "patch_embed.proj": "patch_embed.projection",
    ".norm": ".layer_norm",
    "blocks": "layers",
}


def replace_keys(state_dict):
    """
    Build a new state dict whose keys follow the Hugging Face SAM naming.

    Args:
        state_dict: Mapping from original parameter names to values. It is not modified.

    Returns:
        A new dict with the renamed keys. The entries `"pixel_mean"` and `"pixel_std"` are left out, and the value
        stored under `"prompt_encoder.shared_embedding.positional_embedding"` is additionally exposed under
        `"shared_image_embedding.positional_embedding"`.

    Raises:
        KeyError: If no key maps to `"prompt_encoder.shared_embedding.positional_embedding"`.
    """
    # Keys that are dropped from the converted state dict.
    dropped_keys = ("pixel_mean", "pixel_std")

    # Dots are escaped so they only match literal separators; compiled once, outside the loop.
    hypernetwork_layer_pattern = re.compile(r".*\.output_hypernetworks_mlps\.(\d+)\.layers\.(\d+).*")
    # Layer index inside a hypernetwork MLP -> (old fragment, new fragment).
    hypernetwork_layer_renames = {
        0: ("layers.0", "proj_in"),
        1: ("layers.1", "layers.0"),
        2: ("layers.2", "proj_out"),
    }

    model_state_dict = {}
    for key, value in state_dict.items():
        if key in dropped_keys:
            continue

        # `str.replace` is a no-op when the fragment is absent, so no membership test is needed.
        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
            key = key.replace(key_to_modify, new_key)

        match = hypernetwork_layer_pattern.match(key)
        if match is not None:
            rename = hypernetwork_layer_renames.get(int(match.group(2)))
            # Layer indices other than 0, 1 and 2 are left untouched.
            if rename is not None:
                key = key.replace(*rename)

        model_state_dict[key] = value

    # The positional embedding is stored under a second key as well.
    model_state_dict["shared_image_embedding.positional_embedding"] = model_state_dict[
        "prompt_encoder.shared_embedding.positional_embedding"
    ]

    return model_state_dict
# Convert an original SAM checkpoint into a Hugging Face `SamModel` and sanity-check its predictions.
def convert_sam_checkpoint(model_name, pytorch_dump_folder, push_to_hub, model_hub_id="ybelkada/segment-anything"):
    """
    Download an original SAM checkpoint, load it into a `SamModel` and run reference predictions.

    Args:
        model_name: Name of the checkpoint; the file `checkpoints/{model_name}.pth` is downloaded from
            `model_hub_id`. It must contain `sam_vit_b`, `sam_vit_l` or `sam_vit_h`.
        pytorch_dump_folder: When not `None`, the processor and the converted model are saved to this folder
            once the checks have passed.
        push_to_hub: Accepted for the command line interface.
            NOTE(review): no upload is performed here because the target repository id is not known in this
            function - confirm the intended destination before wiring it up.
        model_hub_id: Hub repository that hosts the original checkpoints.

    Raises:
        ValueError: If `model_name` does not match a supported architecture.
        AssertionError: If a reference IoU score differs (only checked for `sam_vit_h_4b8939`).
    """
    # Pick the configuration first so an unsupported name fails before anything is downloaded.
    if "sam_vit_b" in model_name:
        config = SamConfig()
    elif "sam_vit_l" in model_name:
        vision_config = SamVisionConfig(
            hidden_size=1024,
            num_hidden_layers=24,
            num_attention_heads=16,
            global_attn_indexes=[5, 11, 17, 23],
        )

        config = SamConfig(
            vision_config=vision_config,
        )
    elif "sam_vit_h" in model_name:
        vision_config = SamVisionConfig(
            hidden_size=1280,
            num_hidden_layers=32,
            num_attention_heads=16,
            global_attn_indexes=[7, 15, 23, 31],
        )

        config = SamConfig(
            vision_config=vision_config,
        )
    else:
        raise ValueError(
            f"Unsupported model name {model_name!r}: expected it to contain 'sam_vit_b', 'sam_vit_l' or 'sam_vit_h'."
        )

    checkpoint_path = hf_hub_download(model_hub_id, f"checkpoints/{model_name}.pth")

    # Load the original weights on CPU and rename their keys to the Hugging Face layout.
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    state_dict = replace_keys(state_dict)

    image_processor = SamImageProcessor()
    processor = SamProcessor(image_processor=image_processor)
    hf_model = SamModel(config)

    # Use the GPU when there is one instead of failing on CPU-only machines.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    hf_model.load_state_dict(state_dict)
    hf_model = hf_model.to(device)

    img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")

    def predict_iou_scores(**prompts):
        # One forward pass on the example image with the given prompts; returns the squeezed IoU scores.
        inputs = processor(images=np.array(raw_image), return_tensors="pt", **prompts).to(device)
        with torch.no_grad():
            output = hf_model(**inputs)
        return output.iou_scores.squeeze()

    # Prompt-free forward pass; it runs for every model.
    scores = predict_iou_scores()

    # Reference values are only available for this checkpoint; they are compared exactly.
    if model_name == "sam_vit_h_4b8939":
        assert scores[-1].item() == 0.579890251159668

        # Single point prompt.
        scores = predict_iou_scores(input_points=[[[400, 650]]], input_labels=[[1]])
        assert scores[-1].item() == 0.9712603092193604

        # Bounding-box prompt.
        scores = predict_iou_scores(input_boxes=((75, 275, 1725, 850),))
        assert scores[-1].item() == 0.8686015605926514

        # Two points on one image.
        scores = predict_iou_scores(input_points=[[[400, 650], [800, 650]]], input_labels=[[1, 1]])
        assert scores[-1].item() == 0.9936047792434692

    # Persist the result; previously the output folder argument was accepted but never used.
    if pytorch_dump_folder is not None:
        processor.save_pretrained(pytorch_dump_folder)
        hf_model.save_pretrained(pytorch_dump_folder)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Checkpoint names this script can convert (file names without the ".pth" suffix).
    choices = ["sam_vit_b_01ec64", "sam_vit_h_4b8939", "sam_vit_l_0b3195"]
    parser.add_argument(
        "--model_name",
        default="sam_vit_h_4b8939",
        choices=choices,
        type=str,
        help="Name of the original SAM checkpoint to convert",
    )
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether to push the model and processor to the hub after converting",
    )
    # No `choices` here: this is a free-form hub repository id. Restricting it to the checkpoint
    # names above made every explicitly passed repository id invalid.
    parser.add_argument(
        "--model_hub_id",
        default="ybelkada/segment-anything",
        type=str,
        help="Hub repository id that hosts the original checkpoints",
    )

    args = parser.parse_args()

    convert_sam_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.model_hub_id)

`.\models\sam\convert_sam_to_hf.py`

# 设置文件编码为 UTF-8
# 版权声明和许可信息，指出代码归属和使用许可
# 根据 Apache License, Version 2.0 许可，除非符合许可要求，否则不得使用此文件
"""
从原始仓库中转换 SAM 模型的检查点。

URL: https://github.com/facebookresearch/segment-anything.

同时支持从 https://github.com/czg1225/SlimSAM/tree/master 转换 SlimSAM 检查点。
"""
import argparse  # 导入命令行参数解析模块
import re  # 导入正则表达式模块

import numpy as np  # 导入处理数组的库
import requests  # 导入处理 HTTP 请求的库
import torch  # 导入 PyTorch 深度学习库
from huggingface_hub import hf_hub_download  # 从 Hugging Face Hub 下载模型和数据
from PIL import Image  # 导入 Python Imaging Library 用于图像处理

from transformers import (  # 导入 Transformers 库的相关模块
    SamConfig,  # SAM 模型的配置类
    SamImageProcessor,  # 处理图像输入的 SAM 图像处理器
    SamModel,  # SAM 模型
    SamProcessor,  # SAM 数据处理器
    SamVisionConfig,  # SAM 视觉部分的配置类
)


def get_config(model_name):
    """Build the `SamConfig` for an original SAM / SlimSAM checkpoint name.

    The vision-encoder hyper-parameters are chosen by substring match on
    ``model_name``. Markers are tried in the order listed below and the first
    one contained in ``model_name`` wins.

    Args:
        model_name: Checkpoint identifier, e.g. ``"sam_vit_h_4b8939"`` or
            ``"slimsam-50-uniform"``.

    Returns:
        A `SamConfig` wrapping the selected `SamVisionConfig`.

    Raises:
        ValueError: If ``model_name`` contains none of the known markers
            (previously this surfaced as an unhelpful ``UnboundLocalError``).
    """
    # (marker, SamVisionConfig keyword arguments); built per call so the
    # list values are never shared between configs.
    vision_kwargs_by_marker = (
        (
            "slimsam-50",
            {
                "hidden_size": 384,
                "mlp_dim": 1536,
                "num_hidden_layers": 12,
                "num_attention_heads": 12,
                "global_attn_indexes": [2, 5, 8, 11],
            },
        ),
        (
            "slimsam-77",
            {
                "hidden_size": 168,
                "mlp_dim": 696,
                "num_hidden_layers": 12,
                "num_attention_heads": 12,
                "global_attn_indexes": [2, 5, 8, 11],
            },
        ),
        # ViT-B relies entirely on the SamVisionConfig defaults.
        ("sam_vit_b", {}),
        (
            "sam_vit_l",
            {
                "hidden_size": 1024,
                "num_hidden_layers": 24,
                "num_attention_heads": 16,
                "global_attn_indexes": [5, 11, 17, 23],
            },
        ),
        (
            "sam_vit_h",
            {
                "hidden_size": 1280,
                "num_hidden_layers": 32,
                "num_attention_heads": 16,
                "global_attn_indexes": [7, 15, 23, 31],
            },
        ),
    )

    for marker, vision_kwargs in vision_kwargs_by_marker:
        if marker in model_name:
            break
    else:
        known = ", ".join(marker for marker, _ in vision_kwargs_by_marker)
        raise ValueError(f"Unknown model name {model_name!r}; expected it to contain one of: {known}")

    vision_config = SamVisionConfig(**vision_kwargs)

    return SamConfig(vision_config=vision_config)


# Substring renames from the original SAM parameter names to the Hugging Face
# ones. `replace_keys` applies them to every key in insertion order, so the
# order of the entries below is significant (e.g. `layers.0` must be renamed
# before `layers.1` is turned into `layers.0`).
KEYS_TO_MODIFY_MAPPING = {
    # IoU prediction head
    "iou_prediction_head.layers.0": "iou_prediction_head.proj_in",
    "iou_prediction_head.layers.1": "iou_prediction_head.layers.0",
    "iou_prediction_head.layers.2": "iou_prediction_head.proj_out",
    # Mask decoder upscaling
    "mask_decoder.output_upscaling.0": "mask_decoder.upscale_conv1",
    "mask_decoder.output_upscaling.1": "mask_decoder.upscale_layer_norm",
    "mask_decoder.output_upscaling.3": "mask_decoder.upscale_conv2",
    # Mask downscaling -> mask embedding
    "mask_downscaling.0": "mask_embed.conv1",
    "mask_downscaling.1": "mask_embed.layer_norm1",
    "mask_downscaling.3": "mask_embed.conv2",
    "mask_downscaling.4": "mask_embed.layer_norm2",
    "mask_downscaling.6": "mask_embed.conv3",
    # Point and positional embeddings
    "point_embeddings": "point_embed",
    "pe_layer.positional_encoding_gaussian_matrix": "shared_embedding.positional_embedding",
    # Image encoder -> vision encoder
    "image_encoder": "vision_encoder",
    "neck.0": "neck.conv1",
    "neck.1": "neck.layer_norm1",
    "neck.2": "neck.conv2",
    "neck.3": "neck.layer_norm2",
    "patch_embed.proj": "patch_embed.projection",
    # Generic renames
    ".norm": ".layer_norm",
    "blocks": "layers",
}

# Rename the keys of an original SAM state dict to the Hugging Face naming scheme
def replace_keys(state_dict):
    """Return a new state dict whose keys follow the Hugging Face SAM layout.

    Args:
        state_dict: Mapping of original parameter names to values. The
            ``"pixel_mean"`` and ``"pixel_std"`` entries, if present, are
            removed from this mapping in place.

    Returns:
        A new dict with renamed keys. It additionally contains
        ``"shared_image_embedding.positional_embedding"``, which refers to the
        same value as ``"prompt_encoder.shared_embedding.positional_embedding"``.

    Raises:
        KeyError: If, after renaming, no
            ``"prompt_encoder.shared_embedding.positional_embedding"`` key exists.
    """
    for dropped in ("pixel_mean", "pixel_std"):
        state_dict.pop(dropped, None)

    # Hypernetwork MLP keys need a per-layer rename on top of the generic mapping
    hypernetwork_mlp = re.compile(r".*.output_hypernetworks_mlps.(\d+).layers.(\d+).*")
    layer_renames = {
        0: ("layers.0", "proj_in"),
        1: ("layers.1", "layers.0"),
        2: ("layers.2", "proj_out"),
    }

    renamed = {}
    for name, value in state_dict.items():
        # Apply every substring rename, in the mapping's insertion order
        for fragment, substitute in KEYS_TO_MODIFY_MAPPING.items():
            name = name.replace(fragment, substitute)

        found = hypernetwork_mlp.match(name)
        if found is not None:
            # The second captured number is the layer index inside the MLP
            rename = layer_renames.get(int(found.group(2)))
            if rename is not None:
                name = name.replace(*rename)

        renamed[name] = value

    # Expose the prompt encoder's positional embedding under a second key
    source_key = "prompt_encoder.shared_embedding.positional_embedding"
    renamed["shared_image_embedding.positional_embedding"] = renamed[source_key]

    return renamed


# Convert an original SAM / SlimSAM checkpoint to the Hugging Face format and sanity-check it
def convert_sam_checkpoint(model_name, checkpoint_path, pytorch_dump_folder, push_to_hub):
    """Load an original checkpoint into `SamModel`, run test inferences, then save/push it.

    Args:
        model_name: Name of the original model; selects the config and which
            checks are run after loading.
        checkpoint_path: Path of the original checkpoint file, read with `torch.load`.
        pytorch_dump_folder: Folder in which the processor and model are saved,
            or ``None`` to skip saving.
        push_to_hub: When truthy, push the processor and model to the Hub.

    Raises:
        AssertionError: For ``"sam_vit_h_4b8939"``, if a predicted IoU score
            differs from its hard-coded reference value by more than the tolerance.
    """
    config = get_config(model_name)

    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code; only convert checkpoints obtained from a trusted source.
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    state_dict = replace_keys(state_dict)

    image_processor = SamImageProcessor()
    processor = SamProcessor(image_processor=image_processor)
    hf_model = SamModel(config)
    hf_model.eval()

    # Weights are loaded first, then the model is moved to the GPU when one is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    hf_model.load_state_dict(state_dict)
    hf_model = hf_model.to(device)

    # Sample image used for every test inference below
    img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")

    def predict_iou_scores(**prompts):
        """Run the model on the sample image with the given prompts and return the squeezed IoU scores."""
        inputs = processor(images=np.array(raw_image), return_tensors="pt", **prompts).to(device)
        with torch.no_grad():
            output = hf_model(**inputs)
        return output.iou_scores.squeeze()

    def check_last_score(scores, expected, atol=1e-4):
        """Assert that the last IoU score matches the reference value within `atol`."""
        # Compare with a tolerance instead of `==`: the model runs on either
        # CUDA or CPU (see `device`), and bit-identical floats across
        # backends are not guaranteed.
        actual = scores[-1].item()
        assert abs(actual - expected) <= atol, (
            f"IoU score mismatch for {model_name}: got {actual}, expected {expected} (atol={atol})"
        )

    input_points = [[[500, 375]]]
    input_labels = [[1]]

    # Smoke test: a forward pass without any prompt
    predict_iou_scores()

    if model_name == "sam_vit_b_01ec64":
        # Only checks that a single-point prompt runs; no reference value is compared
        predict_iou_scores(input_points=input_points, input_labels=input_labels)
    elif model_name == "sam_vit_h_4b8939":
        # Single point prompt
        scores = predict_iou_scores(input_points=input_points, input_labels=input_labels)
        check_last_score(scores, 0.9712603092193604)

        # Bounding-box prompt
        input_boxes = ((75, 275, 1725, 850),)
        scores = predict_iou_scores(input_boxes=input_boxes)
        check_last_score(scores, 0.8686015605926514)

        # Two points on one image
        input_points = [[[400, 650], [800, 650]]]
        input_labels = [[1, 1]]
        scores = predict_iou_scores(input_points=input_points, input_labels=input_labels)
        check_last_score(scores, 0.9936047792434692)

    # Optionally save the processor and the converted model locally
    if pytorch_dump_folder is not None:
        processor.save_pretrained(pytorch_dump_folder)
        hf_model.save_pretrained(pytorch_dump_folder)

    # Optionally push both to the Hub; SlimSAM models go to a different namespace
    if push_to_hub:
        repo_id = f"nielsr/{model_name}" if "slimsam" in model_name else f"meta/{model_name}"
        processor.push_to_hub(repo_id)
        hf_model.push_to_hub(repo_id)
if __name__ == "__main__":
    # Command-line interface of the conversion script
    parser = argparse.ArgumentParser()

    # Model names accepted by --model_name
    choices = ["sam_vit_b_01ec64", "sam_vit_h_4b8939", "sam_vit_l_0b3195", "slimsam-50-uniform", "slimsam-77-uniform"]
    parser.add_argument(
        "--model_name",
        type=str,
        choices=choices,
        default="sam_vit_h_4b8939",
        help="Name of the original model to convert",
    )
    # Optional on the command line, but checked below for SlimSAM models
    parser.add_argument("--checkpoint_path", type=str, required=False, help="Path to the original checkpoint")
    parser.add_argument(
        "--pytorch_dump_folder_path",
        type=str,
        default=None,
        help="Path to the output PyTorch model.",
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether to push the model and processor to the hub after converting",
    )

    args = parser.parse_args()

    if "slimsam" not in args.model_name:
        # Original SAM checkpoints are downloaded from the Hub
        checkpoint_path = hf_hub_download("ybelkada/segment-anything", f"checkpoints/{args.model_name}.pth")
    else:
        # SlimSAM checkpoints must be supplied by the user
        checkpoint_path = args.checkpoint_path
        if checkpoint_path is None:
            raise ValueError("You need to provide a checkpoint path for SlimSAM models.")

    # Run the conversion with the resolved checkpoint
    convert_sam_checkpoint(
        args.model_name,
        checkpoint_path,
        args.pytorch_dump_folder_path,
        args.push_to_hub,
    )

Transformers-源码解析-九十八-

Transformers 源码解析（九十八）

.\models\roformer\modeling_tf_roformer.py

.\models\roformer\tokenization_roformer.py

.\models\roformer\tokenization_roformer_fast.py

.\models\roformer\tokenization_utils.py

.\models\roformer\__init__.py

.\models\rwkv\configuration_rwkv.py

.\models\rwkv\convert_rwkv_checkpoint_to_hf.py

.\models\rwkv\modeling_rwkv.py

.\models\rwkv\__init__.py

.\models\sam\configuration_sam.py

.\transformers\models\sam\convert_sam_original_to_hf_format.py

.\models\sam\convert_sam_to_hf.py

`.\models\roformer\modeling_tf_roformer.py`

`.\models\roformer\tokenization_roformer.py`

`.\models\roformer\tokenization_roformer_fast.py`

`.\models\roformer\tokenization_utils.py`

`.\models\roformer\__init__.py`

`.\models\rwkv\configuration_rwkv.py`

`.\models\rwkv\convert_rwkv_checkpoint_to_hf.py`

`.\models\rwkv\modeling_rwkv.py`

`.\models\rwkv\__init__.py`

`.\models\sam\configuration_sam.py`

`.\transformers\models\sam\convert_sam_original_to_hf_format.py`

`.\models\sam\convert_sam_to_hf.py`