Transformers Source Code Walkthrough (84)
.\models\openai\modeling_tf_openai.py
class TFAttention(keras.layers.Layer):
def __init__(self, nx, config, scale=False, **kwargs):
super().__init__(**kwargs)
n_state = nx
assert (
n_state % config.n_head == 0
), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.output_attentions = config.output_attentions
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = keras.layers.Dropout(config.resid_pdrop)
self.n_state = n_state
self.pruned_heads = set()
def prune_heads(self, heads):
pass
@staticmethod
def causal_attention_mask(nd, ns):
"""
1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]),
-1, ns-nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:, None]
j = tf.range(ns)
m = i >= j - ns + nd
return m
def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False):
w = tf.matmul(q, k, transpose_b=True)
if self.scale:
dk = tf.cast(shape_list(k)[-1], dtype=w.dtype)
w = w / tf.math.sqrt(dk)
_, _, nd, ns = shape_list(w)
b = tf.cast(self.causal_attention_mask(nd, ns), dtype=w.dtype)
b = tf.reshape(b, [1, 1, nd, ns])
w = w * b - 1e4 * (1 - b)
if attention_mask is not None:
attention_mask = tf.cast(attention_mask, dtype=w.dtype)
w = w + attention_mask
w = stable_softmax(w, axis=-1)
w = self.attn_dropout(w, training=training)
if head_mask is not None:
w = w * head_mask
outputs = [tf.matmul(w, v)]
if output_attentions:
outputs.append(w)
return outputs
def merge_heads(self, x):
x = tf.transpose(x, [0, 2, 1, 3])
x_shape = shape_list(x)
new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
return tf.reshape(x, new_x_shape)
def split_heads(self, x):
x_shape = shape_list(x)
new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3))
def call(self, x, attention_mask, head_mask, output_attentions, training=False):
x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2)
query = self.split_heads(query)
key = self.split_heads(key)
value = self.split_heads(value)
attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
a = attn_outputs[0]
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a, training=training)
outputs = [a] + attn_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_attn", None) is not None:
with tf.name_scope(self.c_attn.name):
self.c_attn.build([None, None, self.n_state * 3])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.n_state])
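As a quick sanity check, here is a minimal standalone sketch (assuming TensorFlow is installed) of what `causal_attention_mask` produces and how `_attn` applies it additively before the softmax; the `causal_mask_demo` helper below simply repeats the static method's logic on toy values.

```python
import tensorflow as tf

def causal_mask_demo(nd, ns):
    # Same logic as TFAttention.causal_attention_mask: 1s in the lower triangle,
    # counted from the lower-right corner.
    i = tf.range(nd)[:, None]
    j = tf.range(ns)
    return i >= j - ns + nd

nd = ns = 4
b = tf.cast(causal_mask_demo(nd, ns), tf.float32)
print(b.numpy())
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]

# _attn keeps the allowed scores and pushes masked ones to roughly -1e4 before the softmax
w = tf.random.normal((1, 1, nd, ns))            # toy attention scores
w = w * b - 1e4 * (1.0 - b)
print(tf.nn.softmax(w, axis=-1).numpy()[0, 0])  # masked (future) positions get ~0 probability
```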
class TFMLP(keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super().__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = get_tf_activation("gelu")
self.dropout = keras.layers.Dropout(config.resid_pdrop)
self.nx = nx
self.n_state = n_state
def call(self, x, training=False):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
h2 = self.dropout(h2, training=training)
return h2
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_fc", None) is not None:
with tf.name_scope(self.c_fc.name):
self.c_fc.build([None, None, self.n_state])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.nx])
class TFBlock(keras.layers.Layer):
def __init__(self, config, scale=False, **kwargs):
super().__init__(**kwargs)
nx = config.n_embd
self.attn = TFAttention(nx, config, scale, name="attn")
self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.mlp = TFMLP(4 * nx, config, name="mlp")
self.ln_2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
self.nx = nx
def call(self, x, attention_mask, head_mask, output_attentions, training=False):
output_attn = self.attn(x, attention_mask, head_mask, output_attentions, training=training)
a = output_attn[0]
n = self.ln_1(x + a)
m = self.mlp(n, training=training)
h = self.ln_2(n + m)
outputs = [h] + output_attn[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "ln_1", None) is not None:
with tf.name_scope(self.ln_1.name):
self.ln_1.build([None, None, self.nx])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "ln_2", None) is not None:
with tf.name_scope(self.ln_2.name):
self.ln_2.build([None, None, self.nx])
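A stripped-down sketch of the residual/LayerNorm ordering inside `TFBlock` (post-LN: normalization happens after each residual add). The identity `attn_fn`/`mlp_fn` lambdas are placeholders for the real sub-layers, so the values are meaningless; only the data flow mirrors `call` above.

```python
import tensorflow as tf
from tensorflow import keras

ln_1 = keras.layers.LayerNormalization(epsilon=1e-5)
ln_2 = keras.layers.LayerNormalization(epsilon=1e-5)
attn_fn = lambda t: t   # placeholder for the attention sub-layer
mlp_fn = lambda t: t    # placeholder for the feed-forward sub-layer

x = tf.random.normal((2, 5, 8))  # (batch, seq_len, n_embd)
a = attn_fn(x)
n = ln_1(x + a)                  # residual add, then LayerNorm
m = mlp_fn(n)
h = ln_2(n + m)                  # second residual add, then LayerNorm
print(h.shape)                   # (2, 5, 8)
```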
@keras_serializable
class TFOpenAIGPTMainLayer(keras.layers.Layer):
config_class = OpenAIGPTConfig
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
self.config = config
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer
self.n_embd = config.n_embd
self.n_positions = config.n_positions
self.initializer_range = config.initializer_range
self.tokens_embed = TFSharedEmbeddings(
config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
)
self.drop = keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
def build(self, input_shape=None):
with tf.name_scope("positions_embed"):
self.positions_embed = self.add_weight(
name="embeddings",
shape=[self.n_positions, self.n_embd],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "tokens_embed", None) is not None:
with tf.name_scope(self.tokens_embed.name):
self.tokens_embed.build(None)
if getattr(self, "h", None) is not None:
for layer in self.h:
with tf.name_scope(layer.name):
layer.build(None)
def get_input_embeddings(self):
return self.tokens_embed
def set_input_embeddings(self, value):
self.tokens_embed.weight = value
self.tokens_embed.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutput]:
...  # (method body omitted from this excerpt)
OPENAI_GPT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
"""
class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = OpenAIGPTConfig
base_model_prefix = "transformer"
@dataclass
class TFOpenAIGPTDoubleHeadsModelOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
logits (`tf.Tensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
mc_logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
"""
Defines a constant string providing an introductory documentation string for the OpenAI GPT model implementation.
This docstring outlines the inheritance structure, general usage, and compatibility with TensorFlow 2.0,
emphasizing the support for multiple input formats. It also offers a tip regarding the input format preference
in TensorFlow's `transformers` library, ensuring seamless integration with Keras methods like `model.fit()`.
"""
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`OpenAIGPTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
定义了一个文档字符串,用于描述 OpenAI GPT 相关的输入参数说明。
"""
@add_start_docstrings(
"The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING,
)
"""
使用装饰器添加了文档字符串,描述了一个裸的 OpenAI GPT 变压器模型,输出原始的隐藏状态,没有特定的输出头。
"""
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
"""
定义了 TFOpenAIGPTModel 类,继承自 TFOpenAIGPTPreTrainedModel。
"""
def __init__(self, config, *inputs, **kwargs):
"""
初始化方法,接受配置和其他参数,并调用父类初始化方法。
"""
super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
"""
创建了 TFOpenAIGPTMainLayer 对象,作为 transformer 属性。
"""
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
"""
使用装饰器添加了文档字符串,描述了模型的前向传播函数,扩展了输入参数的文档说明。
"""
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
"""
使用装饰器添加了代码示例的文档字符串,指定了用于文档化的检查点、输出类型和配置类。
"""
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutput]:
"""
模型的调用方法,接受多个输入参数,并返回模型输出。
"""
outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
"""
构建方法,用于构建模型的层次结构。
"""
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
"""
在 transformer 属性上建立命名作用域,并调用其 build 方法。
"""
@add_start_docstrings(
"""
OpenAI GPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
OPENAI_GPT_START_DOCSTRING,
)
"""
使用装饰器添加了文档字符串,描述了带有语言建模头部的 OpenAI GPT 模型变压器。
"""
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelingLoss):
"""
定义了 TFOpenAIGPTLMHeadModel 类,继承自 TFOpenAIGPTPreTrainedModel 和 TFCausalLanguageModelingLoss。
"""
def __init__(self, config, *inputs, **kwargs):
"""
初始化方法,接受配置和其他参数,并调用父类初始化方法。
"""
super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
"""
创建了 TFOpenAIGPTMainLayer 对象,作为 transformer 属性。
OpenAIGPT 模型不支持过去的缓存特性。
"""
self.supports_xla_generation = False
def get_output_embeddings(self):
"""
获取输出嵌入的方法,返回输入嵌入。
"""
return self.get_input_embeddings()
def set_output_embeddings(self, value):
"""
设置输出嵌入的方法,设置输入嵌入的值。
"""
self.set_input_embeddings(value)
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
"""
使用装饰器添加了文档字符串,扩展了模型的前向传播函数的输入参数文档说明。
"""
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
# The decorator above adds a code-sample docstring with the documentation checkpoint, output type, and config class.
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFCausalLMOutput]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
config.vocab_size - 1]`.
"""
# Forward pass: run the transformer, project the hidden states to vocabulary logits, and optionally compute the loss.
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# The first element of the transformer output is the final hidden states.
hidden_states = transformer_outputs[0]
# Project the hidden states to vocabulary logits via the tied token embedding (linear mode).
logits = self.transformer.tokens_embed(hidden_states, mode="linear")
# The loss defaults to None.
loss = None
# If labels are provided, compute the loss.
if labels is not None:
# Drop the last logit and the first label so that position t is scored against token t+1.
shifted_logits = logits[:, :-1]
labels = labels[:, 1:]
# Compute the loss from the shifted logits and labels.
loss = self.hf_compute_loss(labels, shifted_logits)
# If return_dict is False, return a plain tuple.
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFCausalLMOutput object.
return TFCausalLMOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
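A small numeric sketch of the label shifting above: position t's logits are scored against the token at position t+1, so the last logit and the first label are dropped. Plain Keras cross-entropy stands in for `hf_compute_loss` here; shapes and values are illustrative only.

```python
import tensorflow as tf

batch, seq_len, vocab = 1, 5, 10
logits = tf.random.normal((batch, seq_len, vocab))
labels = tf.constant([[3, 7, 1, 4, 9]])

shifted_logits = logits[:, :-1]  # predictions for positions 0..3
shifted_labels = labels[:, 1:]   # targets are the *next* tokens: 7, 1, 4, 9

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(
    shifted_labels, shifted_logits
)
print(float(loss))
```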
# Prepare the inputs expected by generate().
def prepare_inputs_for_generation(self, inputs, **kwargs):
return {"input_ids": inputs}
# Build the model's sublayers.
def build(self, input_shape=None):
# If the model has already been built, return immediately.
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
# Build the transformer inside its own name scope.
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
@add_start_docstrings(
"""
OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
input embeddings, the classification head takes as input the input of a specified classification token index in the
input sequence).
""",
OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="multiple_choice_head"
)
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFOpenAIGPTDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
mc_token_ids: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
"""
Perform the forward pass of the OpenAI GPT model with two heads.
Args:
input_ids: Optional[tf.TensorSpec((None, None, None), tf.int32, name="input_ids")],
The input token ids tensor of shape [batch_size, num_choices, sequence_length].
attention_mask: Optional[tf.TensorSpec((None, None, None), tf.int32, name="attention_mask")],
The attention mask tensor of shape [batch_size, sequence_length].
token_type_ids: Optional[tf.TensorSpec((None, None), tf.int32, name="token_type_ids")],
The token type ids tensor of shape [batch_size, sequence_length].
position_ids: Optional[tf.TensorSpec((None, None), tf.int32, name="position_ids")],
The position ids tensor of shape [batch_size, sequence_length].
head_mask: Optional[tf.TensorSpec((None, None), tf.float32, name="head_mask")],
The head mask tensor of shape [num_heads, sequence_length].
inputs_embeds: Optional[tf.TensorSpec((None, None, None), tf.float32, name="inputs_embeds")],
The input embeddings tensor of shape [batch_size, sequence_length, hidden_size].
mc_token_ids: Optional[tf.TensorSpec((None, None), tf.int32, name="mc_token_ids")],
The multiple choice token ids tensor of shape [batch_size, num_choices].
output_attentions: Optional[bool],
Whether to return attentions weights.
output_hidden_states: Optional[bool],
Whether to return hidden states.
return_dict: Optional[bool],
Whether to return a dictionary instead of a tuple.
training: Optional[bool],
Whether in training mode or not.
Returns:
TFOpenAIGPTDoubleHeadsModelOutput or tuple,
The model output, either as a dataclass (when `return_dict=True`) or as a plain tuple.
"""
pass
@property
def input_signature(self):
"""
Return the input signature for the TensorFlow model.
"""
return {
"input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"),
"attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"),
"mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
}
def build(self, input_shape=None):
"""
Build the OpenAI GPT model.
Args:
input_shape: Optional, The shape of the input tensor.
"""
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "multiple_choice_head", None) is not None:
with tf.name_scope(self.multiple_choice_head.name):
self.multiple_choice_head.build(None)
@add_start_docstrings(
"""
The OpenAI GPT Model transformer with a sequence classification head on top (linear layer).
[`TFOpenAIGPTForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
OPENAI_GPT_START_DOCSTRING,
)
# Sequence classification model: inherits from TFOpenAIGPTPreTrainedModel and TFSequenceClassificationLoss.
class TFOpenAIGPTForSequenceClassification(TFOpenAIGPTPreTrainedModel, TFSequenceClassificationLoss):
# The initializer accepts the config plus extra *inputs and **kwargs.
def __init__(self, config, *inputs, **kwargs):
# Call the parent initializer.
super().__init__(config, *inputs, **kwargs)
# Number of classes, taken from the config.
self.num_labels = config.num_labels
# Dense layer that produces the classification scores.
self.score = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="score",
use_bias=False,
)
# The OpenAI GPT main layer that processes the inputs.
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
# Keep the config around as an attribute.
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass: accepts the various inputs and returns the model output.
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFSequenceClassifierOutput]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
config.vocab_size - 1]`.
"""
# Run the transformer and collect its outputs.
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Take the hidden states from the transformer output.
hidden_states = transformer_outputs[0]
# Feed the hidden states through the score layer to get per-position logits.
logits = self.score(hidden_states)
# in_logits will hold the logits at the last non-padding position of each row.
in_logits = None
# Without a pad_token_id, fall back to the last position (-1).
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
# When input_ids are available, locate the last non-padding token in each row.
if input_ids is not None:
# Index of the first pad token minus one.
sequence_lengths = (
tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
- 1
)
# Rows without padding fall back to the final position instead of a negative index.
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
# Gather the logits at those positions, one vector per row.
in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
else:
sequence_lengths = -1
# Without input_ids, padding cannot be detected; warn the user.
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
# The loss defaults to None.
loss = None
# If labels are provided, compute the loss.
if labels is not None:
# Derive batch_size and sequence_length from input_ids or inputs_embeds.
if input_ids is not None:
batch_size, sequence_length = shape_list(input_ids)[:2]
else:
batch_size, sequence_length = shape_list(inputs_embeds)[:2]
# Either a pad_token_id must be defined or the batch size must be 1.
assert (
self.config.pad_token_id is not None or batch_size == 1
), "Cannot handle batch sizes > 1 if no padding token is defined."
# If sequence_lengths is not a tensor, index the logits with the plain Python value.
if not tf.is_tensor(sequence_lengths):
in_logits = logits[0:batch_size, sequence_lengths]
# Compute the classification loss.
loss = self.hf_compute_loss(tf.reshape(labels, [-1, 1]), tf.reshape(in_logits, [-1, self.num_labels]))
# Use in_logits as the pooled logits when available, otherwise fall back to the full logits.
pooled_logits = in_logits if in_logits is not None else logits
# If return_dict is False, return a plain tuple.
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFSequenceClassifierOutput object.
return TFSequenceClassifierOutput(
loss=loss,
logits=pooled_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
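A sketch of how the classification head above picks one logit vector per row: the index of the last non-padding token is derived from `pad_token_id`, and `tf.gather(..., batch_dims=1)` selects that position. The toy ids below assume `pad_token_id = 0`.

```python
import tensorflow as tf

pad_token_id = 0
input_ids = tf.constant([[5, 6, 7, 0, 0],   # last real token at index 2
                         [8, 9, 1, 2, 3]])  # no padding -> use the last index
logits = tf.random.normal((2, 5, 3))        # (batch, seq_len, num_labels)

# index of the first pad token minus one; rows without padding yield -1
sequence_lengths = (
    tf.argmax(tf.cast(tf.math.equal(input_ids, pad_token_id), input_ids.dtype), axis=-1) - 1
)
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
print(sequence_lengths.numpy())   # [2 4]

pooled_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
print(pooled_logits.shape)        # (2, 3)
```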
# Build the model's sublayers.
def build(self, input_shape=None):
# If the model has already been built, return immediately.
if self.built:
return
# Mark the model as built.
self.built = True
# If the `score` layer exists, build it.
if getattr(self, "score", None) is not None:
# Build `score` inside its own name scope.
with tf.name_scope(self.score.name):
# Its input shape is [None, None, self.config.n_embd].
self.score.build([None, None, self.config.n_embd])
# If the `transformer` layer exists, build it.
if getattr(self, "transformer", None) is not None:
# Build `transformer` inside its own name scope.
with tf.name_scope(self.transformer.name):
# Its input shape is left unspecified (None).
self.transformer.build(None)
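To close out this file, a hedged end-to-end usage sketch: it assumes `transformers` and TensorFlow are installed and downloads the `openai-community/openai-gpt` checkpoint on first use.

```python
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-community/openai-gpt")
model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")

inputs = tokenizer("the quick brown fox", return_tensors="tf")
outputs = model(**inputs)                        # TFCausalLMOutput with .logits
next_token_id = int(tf.argmax(outputs.logits[0, -1]))
print(tokenizer.decode([next_token_id]))         # greedy guess for the next token
```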
.\models\openai\tokenization_openai.py
import json
import os
import re
import unicodedata
from typing import Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/vocab.json"
},
"merges_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/merges.txt"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"openai-community/openai-gpt": 512,
}
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
"""
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
"""
Initialize the Tokenizer with specified parameters.
Args:
do_lower_case (bool, optional, default=True): Whether or not to convert tokens to lowercase.
never_split (Iterable, optional): Collection of tokens that should never be split during tokenization.
Defaults to an empty list.
tokenize_chinese_chars (bool, optional, default=True): Whether or not to tokenize Chinese characters.
strip_accents (bool, optional): Whether or not to remove accents. If None, determined by `lowercase`.
do_split_on_punc (bool, optional, default=True): Whether or not to split on punctuation marks.
"""
# Initialize with default values or provided arguments
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
# Merge the `never_split` argument with self.never_split so the full list of protected tokens is used.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
# Clean the text: drop invalid characters and normalize whitespace.
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
# If tokenize_chinese_chars is enabled, surround CJK characters with whitespace first.
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
# Normalize the text to Unicode NFC so different encodings of the same character compare equal.
unicode_normalized_text = unicodedata.normalize("NFC", text)
# Split on whitespace to get the initial tokens.
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
# Process each original token.
for token in orig_tokens:
# Tokens in never_split are passed through untouched.
if token not in never_split:
if self.do_lower_case:
# Lowercase the token.
token = token.lower()
# Unless accent stripping is explicitly disabled, strip accents.
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
# Strip accents when explicitly requested even without lowercasing.
token = self._run_strip_accents(token)
# Split the token on punctuation and collect the pieces.
split_tokens.extend(self._run_split_on_punc(token, never_split))
# Re-join and re-split on whitespace to produce the final token list.
output_tokens = whitespace_tokenize(" ".join(split_tokens))
# Return the final tokens.
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
# Normalize the text to NFD so accents become separate combining characters.
text = unicodedata.normalize("NFD", text)
output = []
# Walk over every character in the text.
for char in text:
# Look up the character's Unicode category.
cat = unicodedata.category(char)
# Skip combining marks (accents).
if cat == "Mn":
continue
# Keep everything else.
output.append(char)
# Join the remaining characters back into a string.
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
# If punctuation splitting is disabled, or the text is protected by never_split, return it unchanged.
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
# Work on the text as a list of characters.
chars = list(text)
i = 0
start_new_word = True
output = []
# Walk over the characters.
while i < len(chars):
char = chars[i]
# Each punctuation character becomes a token of its own.
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
# Non-punctuation characters are appended to the current word.
if start_new_word:
# Start a new (empty) word.
output.append([])
start_new_word = False
# Append the character to the current word.
output[-1].append(char)
i += 1
# Join each character list back into a string and return the list of pieces.
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
# Walk over every character in the text.
for char in text:
cp = ord(char)
# CJK characters get a space on each side.
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
# Other characters are copied through unchanged.
output.append(char)
# Join the characters back into a single string.
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# Check whether the codepoint falls in one of the CJK Unified Ideographs Unicode blocks.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
# Walk over every character in the text.
for char in text:
cp = ord(char)
# Skip invalid characters and control characters.
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
# Replace any whitespace character with a single space.
if _is_whitespace(char):
output.append(" ")
else:
# Keep all other characters.
output.append(char)
# Join the characters back into a single string.
return "".join(output)
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
# Initialize an empty set to store symbol pairs
pairs = set()
# Initialize the previous character as the first character in the word
prev_char = word[0]
# Iterate over each character in the word starting from the second character
for char in word[1:]:
# Add the pair of previous character and current character to the set
pairs.add((prev_char, char))
# Update the previous character to the current character for the next iteration
prev_char = char
# Return the set of symbol pairs
return pairs
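A quick illustration of `get_pairs`, using the function defined just above, on a word tuple shaped the way `bpe` builds it (the last symbol carries the `</w>` end-of-word marker):

```python
word = ("l", "o", "w</w>")
print(get_pairs(word))  # {('l', 'o'), ('o', 'w</w>')}
```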
def text_standardize(text):
"""
fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization
"""
# Replace em dashes, en dashes, horizontal bars, and ellipses with standard symbols
text = text.replace("—", "-")
text = text.replace("–", "-")
text = text.replace("―", "-")
text = text.replace("…", "...")
text = text.replace("´", "'")
# Use regex to standardize certain punctuation marks with surrounding spaces
text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
text = re.sub(r"\s*\n\s*", " \n ", text)
text = re.sub(r"[^\S\n]+", " ", text)
return text.strip()
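An example of what `text_standardize` (defined above) does to dashes, ellipses, and punctuation spacing; the expected output is shown in the comment:

```python
sample = "It was—allegedly—fine… right?"
print(text_standardize(sample))
# It was - allegedly - fine... right ?
```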
class OpenAIGPTTokenizer(PreTrainedTokenizer):
"""
Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:
- lowercases all inputs,
- uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
`BasicTokenizer` if not.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
try:
import ftfy
from spacy.lang.en import English
_nlp = English()
self.nlp = _nlp.tokenizer
self.fix_text = ftfy.fix_text
except ImportError:
logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
self.nlp = BasicTokenizer(do_lower_case=True)
self.fix_text = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(unk_token=unk_token, **kwargs)
@property
def do_lower_case(self):
return True
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (token[-1] + "</w>",)
pairs = get_pairs(word)
if not pairs:
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
def _tokenize(self, text):
split_tokens = []
if self.fix_text is None:
text = self.nlp.tokenize(text)
for token in text:
split_tokens.extend(list(self.bpe(token).split(" ")))
else:
text = self.nlp(text_standardize(self.fix_text(text)))
for token in text:
split_tokens.extend(list(self.bpe(token.text.lower()).split(" ")))
return split_tokens
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
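A hedged usage sketch for the slow tokenizer: it assumes `transformers` is installed and downloads the vocabulary/merges files for `openai-community/openai-gpt` on first use (without `ftfy`/`spacy` it falls back to the BasicTokenizer path with a warning).

```python
from transformers import OpenAIGPTTokenizer

tok = OpenAIGPTTokenizer.from_pretrained("openai-community/openai-gpt")
ids = tok("Hello world")["input_ids"]
print(ids)
print(tok.convert_ids_to_tokens(ids))  # BPE pieces; word-final pieces end with "</w>"
print(tok.decode(ids))                 # "hello world" -- the tokenizer lowercases input
```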
.\models\openai\tokenization_openai_fast.py
"""Fast Tokenization classes for OpenAI GPT."""
from typing import Optional, Tuple
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_openai import OpenAIGPTTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/vocab.json"
},
"merges_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/merges.txt"
},
"tokenizer_file": {
"openai-community/openai-gpt": "https://huggingface.co/openai-community/openai-gpt/resolve/main/tokenizer.json"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"openai-community/openai-gpt": 512,
}
class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
the following peculiarities:
- lower case all inputs
- uses BERT's BasicTokenizer for pre-BPE tokenization
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = OpenAIGPTTokenizer
def __init__(self, vocab_file=None, merges_file=None, tokenizer_file=None, unk_token="<unk>", **kwargs):
super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs)
@property
def do_lower_case(self):
return True
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\openai\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_openai": ["OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "OpenAIGPTConfig"],
"tokenization_openai": ["OpenAIGPTTokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_openai_fast"] = ["OpenAIGPTTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_openai"] = [
"OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"OpenAIGPTDoubleHeadsModel",
"OpenAIGPTForSequenceClassification",
"OpenAIGPTLMHeadModel",
"OpenAIGPTModel",
"OpenAIGPTPreTrainedModel",
"load_tf_weights_in_openai_gpt",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_openai"] = [
"TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFOpenAIGPTDoubleHeadsModel",
"TFOpenAIGPTForSequenceClassification",
"TFOpenAIGPTLMHeadModel",
"TFOpenAIGPTMainLayer",
"TFOpenAIGPTModel",
"TFOpenAIGPTPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
from .tokenization_openai import OpenAIGPTTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_openai_fast import OpenAIGPTTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_openai import (
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
OpenAIGPTDoubleHeadsModel,
OpenAIGPTForSequenceClassification,
OpenAIGPTLMHeadModel,
OpenAIGPTModel,
OpenAIGPTPreTrainedModel,
load_tf_weights_in_openai_gpt,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_openai import (
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFOpenAIGPTDoubleHeadsModel,
TFOpenAIGPTForSequenceClassification,
TFOpenAIGPTLMHeadModel,
TFOpenAIGPTMainLayer,
TFOpenAIGPTModel,
TFOpenAIGPTPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
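The guards above only register submodules whose backend is importable. A small sketch of the same pattern from user code (assuming `transformers` is installed; each framework-specific import only succeeds when that backend is available):

```python
from transformers.utils import is_tf_available, is_torch_available

if is_torch_available():
    from transformers import OpenAIGPTLMHeadModel      # PyTorch implementation
if is_tf_available():
    from transformers import TFOpenAIGPTLMHeadModel    # TensorFlow implementation
```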
.\models\opt\configuration_opt.py
""" OPT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
OPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/opt-125m": "https://huggingface.co/facebook/opt-125m/blob/main/config.json",
"facebook/opt-350m": "https://huggingface.co/facebook/opt-350m/blob/main/config.json",
"facebook/opt-1.3b": "https://huggingface.co/facebook/opt-1.3b/blob/main/config.json",
"facebook/opt-2.7b": "https://huggingface.co/facebook/opt-2.7b/blob/main/config.json",
"facebook/opt-6.7b": "https://huggingface.co/facebook/opt-6.7b/blob/main/config.json",
"facebook/opt-13b": "https://huggingface.co/facebook/opt-13b/blob/main/config.json",
}
class OPTConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`OPTModel`]. It is used to instantiate a OPT model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the OPT
[facebook/opt-350m](https://huggingface.co/facebook/opt-350m) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 50272):
Vocabulary size of the OPT model; defines the number of different tokens that `inputs_ids` can represent when calling [`OPTModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
ffn_dim (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often called feed-forward) layer in the decoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer decoder.
activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler; supported strings are `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"`.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length the model might ever be used with; typically set to something large (e.g. 512, 1024 or 2048).
do_layer_norm_before (`bool`, *optional*, defaults to `True`):
Whether to perform layer normalization before the attention block.
word_embed_proj_dim (`int`, *optional*):
Can be set to down-project the word embeddings, e.g. for `opt-350m`; defaults to `hidden_size`.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability; see the LayerDrop paper for more details (https://arxiv.org/abs/1909.11556).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer used for all weight matrices.
use_cache (`bool`, *optional*, defaults to `True`):
Whether the model should return the last key/value attentions (not used by all models).
enable_bias (`bool`, *optional*, defaults to `True`):
Whether the linear layers in the attention blocks should use a bias term.
layer_norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
Whether the layer norms should have learnable parameters.
Example:
```
>>> from transformers import OPTConfig, OPTModel
>>>
>>> configuration = OPTConfig()
>>> model = OPTModel(configuration)
>>> configuration = model.config
```
"""
.\models\opt\convert_opt_original_pytorch_checkpoint_to_pytorch.py
import argparse
from pathlib import Path
import torch
from transformers import OPTConfig, OPTModel
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def load_checkpoint(checkpoint_path):
"""Checkpoint path should end in model.pt"""
sd = torch.load(checkpoint_path, map_location="cpu")
if "model" in sd.keys():
sd = torch.load(checkpoint_path, map_location="cpu")["model"]
keys_to_delete = [
"decoder.version",
"decoder.output_projection.weight",
]
for key in keys_to_delete:
if key in sd:
sd.pop(key)
keys_to_rename = {
"decoder.project_in_dim.weight": "decoder.project_in.weight",
"decoder.project_out_dim.weight": "decoder.project_out.weight",
"decoder.layer_norm.weight": "decoder.final_layer_norm.weight",
"decoder.layer_norm.bias": "decoder.final_layer_norm.bias",
}
for old_key, new_key in keys_to_rename.items():
if old_key in sd:
sd[new_key] = sd.pop(old_key)
keys = list(sd.keys())
for key in keys:
if ".qkv_proj." in key:
value = sd[key]
q_name = key.replace(".qkv_proj.", ".q_proj.")
k_name = key.replace(".qkv_proj.", ".k_proj.")
v_name = key.replace(".qkv_proj.", ".v_proj.")
depth = value.shape[0]
assert depth % 3 == 0
k, v, q = torch.split(value, depth // 3, dim=0)
sd[q_name] = q
sd[k_name] = k
sd[v_name] = v
del sd[key]
return sd
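A toy illustration of the fused-projection split performed above: fairseq stores `qkv_proj` as one `(3 * hidden, hidden)` matrix, and the converter slices it into three equal blocks that it unpacks as `k, v, q`. The weight tensor here is fabricated purely for demonstration.

```python
import torch

hidden = 4
qkv_weight = torch.arange(3 * hidden * hidden, dtype=torch.float32).reshape(3 * hidden, hidden)

# same split as in load_checkpoint; note the (k, v, q) unpacking order
k, v, q = torch.split(qkv_weight, qkv_weight.shape[0] // 3, dim=0)
print(k.shape, v.shape, q.shape)  # torch.Size([4, 4]) each
```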
@torch.no_grad()
def convert_opt_checkpoint(checkpoint_path, pytorch_dump_folder_path, config=None):
"""
Copy/paste/tweak model's weights to our BERT structure.
"""
state_dict = load_checkpoint(checkpoint_path)
if config is not None:
config = OPTConfig.from_pretrained(config)
else:
config = OPTConfig()
model = OPTModel(config).half().eval()
model.load_state_dict(state_dict)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--fairseq_path",
type=str,
help=(
"path to fairseq checkpoint in correct format. You can find all checkpoints in the correct format here:"
" https://huggingface.co/models?other=opt_metasq"
),
)
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--hf_config", default=None, type=str, help="Define HF config.")
args = parser.parse_args()
convert_opt_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, config=args.hf_config)
.\models\opt\modeling_flax_opt.py
"""
Flax OPT model.
Flax OPT 模型
"""
from functools import partial
from typing import Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from jax.random import PRNGKey
from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxMaskedLMOutput
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
from ...utils import add_start_docstrings, logging
from .configuration_opt import OPTConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/opt-350m"
_CONFIG_FOR_DOC = "OPTConfig"
OPT_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
"""
Parameters:
config ([`OPTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
specified all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
[`~FlaxPreTrainedModel.to_bf16`].
"""
OPT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->OPT
class FlaxOPTAttention(nn.Module):
config: OPTConfig
embed_dim: int
num_heads: int
dropout: float = 0.0
causal: bool = False
bias: bool = True
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self) -> None:
# Dimension of each attention head.
self.head_dim = self.embed_dim // self.num_heads
# The embedding dimension must be divisible by the number of heads; otherwise raise an error.
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {self.num_heads})."
)
# Partially apply nn.Dense so the query/key/value/output projections share the same settings.
dense = partial(
nn.Dense,
self.embed_dim,
use_bias=self.bias,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
# Create the query, key, value, and output projection layers.
self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
self.out_proj = dense()
# Dropout applied to the attention weights.
self.dropout_layer = nn.Dropout(rate=self.dropout)
# For causal attention, precompute the causal mask.
if self.causal:
self.causal_mask = make_causal_mask(
jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
)
# Reshape hidden states to split out the attention heads.
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
# Reshape hidden states to merge the attention heads back together.
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
# 使用 Flax 框架的 @nn.compact 装饰器定义一个方法,用于将单个输入令牌的投影键、值状态与前几步骤的缓存状态连接起来
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slighly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# detect whether we are initializing by the absence of existing cache data
is_initialized = self.has_variable("cache", "cached_key")
# fetch or create the cached key and value variables, zero-initialized
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# fetch or create the cache index, initialized to zero
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# unpack the batch dims, max length, number of heads, and depth per head
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# update the key and value caches with the new 1d spatial slices
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# write the updated keys and values back into the cache
cached_key.value = key
cached_value.value = value
# advance the cache index by the number of cache vectors just written
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# causal mask for cached decoder self-attention: the single query position should only attend to key positions that have already been generated and cached, not to the remaining zero elements
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# combine this pad mask with the incoming attention mask
attention_mask = combine_masks(pad_mask, attention_mask)
# return the updated keys, values, and attention mask
return key, value, attention_mask
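The slice update above is the core of the autoregressive KV cache. Below is a minimal sketch, outside of Flax and with made-up shapes, of what `lax.dynamic_update_slice` and the accompanying `pad_mask` do at a single decoding step:

```python
import jax.numpy as jnp
from jax import lax

batch, max_length, num_heads, head_dim = 1, 8, 2, 4
cached_key = jnp.zeros((batch, max_length, num_heads, head_dim))
new_key = jnp.ones((batch, 1, num_heads, head_dim))  # projected key of the current token
cur_index = 3                                        # number of tokens already cached

# write the new key at position cur_index along the length axis
cached_key = lax.dynamic_update_slice(cached_key, new_key, (0, cur_index, 0, 0))

# the single query may only attend to positions < cur_index + 1
pad_mask = jnp.arange(max_length) < cur_index + 1
print(pad_mask)  # [ True  True  True  True False False False False]
```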
class FlaxOPTDecoderLayer(nn.Module):
config: OPTConfig # 定义一个类成员变量 config,类型为 OPTConfig
dtype: jnp.dtype = jnp.float32 # 定义一个类成员变量 dtype,默认为 jnp.float32
def setup(self) -> None:
self.embed_dim = self.config.hidden_size # 从 config 中获取 hidden_size 并赋给 embed_dim
self.self_attn = FlaxOPTAttention( # 初始化 self_attn,使用 FlaxOPTAttention 类
config=self.config, # 传入配置参数 config
embed_dim=self.embed_dim, # 传入 embed_dim 参数
num_heads=self.config.num_attention_heads, # 传入注意力头数
dropout=self.config.attention_dropout, # 传入注意力 dropout 率
causal=True, # 是否使用因果注意力
dtype=self.dtype, # 数据类型为类成员变量 dtype
)
self.do_layer_norm_before = self.config.do_layer_norm_before # 是否在前面进行层归一化
self.dropout_layer = nn.Dropout(rate=self.config.dropout) # 初始化 dropout 层
self.activation_fn = ACT2FN[self.config.activation_function] # 根据激活函数名称选择对应的激活函数
self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) # 初始化自注意力层的 LayerNorm
self.fc1 = nn.Dense( # 初始化全连接层 fc1
self.config.ffn_dim, # 全连接层的输出维度
dtype=self.dtype, # 数据类型为类成员变量 dtype
kernel_init=jax.nn.initializers.normal(self.config.init_std), # 使用正态分布初始化权重
)
self.fc2 = nn.Dense( # 初始化全连接层 fc2
self.embed_dim, # 全连接层的输出维度为 embed_dim
dtype=self.dtype, # 数据类型为类成员变量 dtype
kernel_init=jax.nn.initializers.normal(self.config.init_std) # 使用正态分布初始化权重
)
self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) # 初始化最终输出的 LayerNorm
def __call__(
self,
hidden_states: jnp.ndarray, # 输入的隐藏状态张量
attention_mask: jnp.ndarray, # 注意力掩码张量
init_cache: bool = False, # 是否初始化缓存
output_attentions: bool = True, # 是否输出注意力权重
deterministic: bool = True, # 是否使用确定性计算
) -> Tuple[jnp.ndarray]:
residual = hidden_states # 保存输入的隐藏状态作为残差连接的基础
# 根据 self.do_layer_norm_before 的值判断是否在注意力机制之前应用层归一化
if self.do_layer_norm_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
# 自注意力机制
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
init_cache=init_cache,
deterministic=deterministic,
)
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) # 应用 dropout
hidden_states = residual + hidden_states # 添加残差连接
# 根据 self.do_layer_norm_before 的值判断是否在注意力机制之后应用层归一化
if not self.do_layer_norm_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
# 全连接层
hidden_states_shape = hidden_states.shape
hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1]) # 将隐藏状态展平
residual = hidden_states # 更新残差连接基础
# 根据 self.do_layer_norm_before 的值判断是否在全连接层之前应用层归一化
if self.do_layer_norm_before:
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.fc1(hidden_states) # 应用第一个全连接层
hidden_states = self.activation_fn(hidden_states) # 应用激活函数
hidden_states = self.fc2(hidden_states) # 应用第二个全连接层
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) # 应用 dropout
hidden_states = (residual + hidden_states).reshape(hidden_states_shape) # 添加残差连接并恢复形状
# 根据 self.do_layer_norm_before 的值判断是否在全连接层之后应用层归一化
if not self.do_layer_norm_before:
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,) # 准备输出结果
if output_attentions:
outputs += (self_attn_weights,) # 如果需要输出注意力权重,则添加到输出中
return outputs # 返回模型的输出
class FlaxOPTDecoderLayerCollection(nn.Module):
config: OPTConfig
dtype: jnp.dtype = jnp.float32 # 计算时的数据类型
def setup(self):
# 创建多个解码器层,并按顺序存储在列表中
self.layers = [
FlaxOPTDecoderLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.num_hidden_layers)
]
# 从配置中获取层丢弃率
self.layerdrop = self.config.layerdrop
def __call__(
self,
hidden_states,
attention_mask,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
):
# 如果需要输出隐藏状态,则初始化一个空元组
all_hidden_states = () if output_hidden_states else None
# 如果需要输出注意力权重,则初始化一个空元组
all_self_attns = () if output_attentions else None
# 遍历每个解码器层
for decoder_layer in self.layers:
# 如果需要输出隐藏状态,则将当前隐藏状态加入到列表中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 调用当前解码器层,获取其输出
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
init_cache=init_cache,
output_attentions=output_attentions,
deterministic=deterministic,
)
# 更新隐藏状态为当前层的输出
hidden_states = layer_outputs[0]
# 如果需要输出注意力权重,则将当前层的注意力权重加入到列表中
if output_attentions:
all_self_attns += (layer_outputs[1],)
# 组装最终输出,包括最终隐藏状态、所有隐藏状态列表和所有注意力权重列表
outputs = [hidden_states, all_hidden_states, all_self_attns]
return outputs
class FlaxOPTLearnedPositionalEmbedding(nn.Embed):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def setup(self):
# 设置位置偏移量
self.offset = 2
# 初始化位置嵌入矩阵参数
self.embedding = self.param(
"embedding", self.embedding_init, (self.num_embeddings + self.offset, self.features), self.param_dtype
)
def __call__(self, positions):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
# 调用父类的 __call__ 方法,并在输入位置上加上偏移量
return super().__call__(positions + self.offset)
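The offset of 2 mirrors the original fairseq/OPT checkpoints: rows 0 and 1 of the table are effectively reserved (left-padding positions resolve to -1, hence row 1 after the offset), and real positions start at row 2. A tiny sanity check with made-up sizes:

```python
import jax.numpy as jnp

offset = 2
max_position_embeddings, features = 8, 4
# setup() creates the table with num_embeddings + offset rows
table = jnp.zeros((max_position_embeddings + offset, features))

positions = jnp.arange(5)             # positions 0..4 of a 5-token sequence
looked_up = table[positions + offset]
print(looked_up.shape)                # (5, 4) -- rows 2..6 of the table
```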
class FlaxOPTDecoder(nn.Module):
config: OPTConfig
dtype: jnp.dtype = jnp.float32 # 计算时的数据类型
offset: int = 2
# 设置方法用于初始化模型参数和各种配置
def setup(self):
# 初始化一个dropout层,用于随机失活以防止过拟合
self.dropout_layer = nn.Dropout(rate=self.config.dropout)
# 从配置中获取隐藏层大小作为嵌入维度
embed_dim = self.config.hidden_size
# 从配置中获取填充 token 的索引
self.padding_idx = self.config.pad_token_id
# 从配置中获取最大目标位置
self.max_target_positions = self.config.max_position_embeddings
# 初始化词嵌入层,使用正态分布初始化方法
self.embed_tokens = nn.Embed(
self.config.vocab_size,
self.config.word_embed_proj_dim,
embedding_init=jax.nn.initializers.normal(self.config.init_std),
dtype=self.dtype,
)
# 初始化学习位置嵌入层,使用正态分布初始化方法
self.embed_positions = FlaxOPTLearnedPositionalEmbedding(
self.config.max_position_embeddings,
embed_dim,
embedding_init=jax.nn.initializers.normal(self.config.init_std),
dtype=self.dtype,
)
# 如果词嵌入投影维度不等于隐藏层大小,则初始化投影层
if self.config.word_embed_proj_dim != self.config.hidden_size:
self.project_in = nn.Dense(self.config.hidden_size, use_bias=False)
self.project_out = nn.Dense(self.config.word_embed_proj_dim, use_bias=False)
else:
# 否则将投影层设置为 None
self.project_in = None
self.project_out = None
# 检查是否需要在最后一层使用 LayerNorm,主要是为了向后兼容
if self.config.do_layer_norm_before and not self.config._remove_final_layer_norm:
self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
else:
# 如果不需要 LayerNorm 则将其设置为 None
self.final_layer_norm = None
# 初始化解码器层集合
self.layers = FlaxOPTDecoderLayerCollection(self.config, self.dtype)
# 模型调用方法,用于执行模型的前向传播
def __call__(
self,
input_ids,
attention_mask,
position_ids,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
# 其他参数用于控制模型的行为,如是否输出注意力矩阵、隐藏状态等
):
# 获取输入的张量形状
input_shape = input_ids.shape
# 将输入张量展平为二维张量
input_ids = input_ids.reshape(-1, input_shape[-1])
# 使用嵌入标记方法对输入张量进行嵌入
inputs_embeds = self.embed_tokens(input_ids)
# 如果存在输入投影层,则将嵌入结果投影
if self.project_in is not None:
inputs_embeds = self.project_in(inputs_embeds)
# 使用嵌入位置方法生成位置嵌入张量
positions = self.embed_positions(position_ids)
# 将嵌入的输入张量和位置嵌入张量相加以得到隐藏状态张量
hidden_states = inputs_embeds + positions
# 调用多层模型的前向传播方法,获取隐藏状态、所有隐藏状态和注意力张量
hidden_state, all_hidden_states, attentions = self.layers(
hidden_states,
attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
# 如果存在最终层归一化,则对隐藏状态进行归一化
if self.final_layer_norm is not None:
hidden_state = self.final_layer_norm(hidden_state)
# 如果存在输出投影层,则对隐藏状态进行投影
if self.project_out is not None:
hidden_state = self.project_out(hidden_state)
# 如果要求输出所有隐藏状态,则将当前隐藏状态加入到所有隐藏状态列表中
if output_hidden_states:
all_hidden_states += (hidden_state,)
# 根据返回值是否为字典形式,决定返回元组还是命名元组形式的输出
outputs = [hidden_state, all_hidden_states, attentions]
if not return_dict:
return tuple(v for v in outputs if v is not None)
# 返回命名元组形式的输出
return FlaxBaseModelOutput(
last_hidden_state=hidden_state,
hidden_states=all_hidden_states,
attentions=attentions,
)
# 定义一个继承自FlaxPreTrainedModel的类,用于OPT模型的预训练。
class FlaxOPTPreTrainedModel(FlaxPreTrainedModel):
# 指定配置类为OPTConfig
config_class = OPTConfig
# 指定基础模型前缀为"model"
base_model_prefix: str = "model"
# 模块类初始化为None
module_class: nn.Module = None
# 初始化函数,接受配置config、输入形状input_shape、种子seed、数据类型dtype等参数
def __init__(
self,
config: OPTConfig,
input_shape: Tuple[int] = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# 使用module_class创建模块对象module,传入config和其他kwargs参数
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 调用父类初始化方法,传入config、module、input_shape、seed、dtype、_do_init等参数
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
# 初始化权重函数,接受随机数生成器rng、输入形状input_shape、参数params等参数,返回初始化后的参数params
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化input_ids为全零数组,数据类型为"i4"
input_ids = jnp.zeros(input_shape, dtype="i4")
# 初始化attention_mask为与input_ids形状相同的全1数组
attention_mask = jnp.ones_like(input_ids)
# 获取batch_size和sequence_length
batch_size, sequence_length = input_ids.shape
# 初始化position_ids为广播形式的序列长度数组
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 拆分rng生成params_rng和dropout_rng
params_rng, dropout_rng = jax.random.split(rng)
# 构建随机数字典rngs,包含params_rng和dropout_rng
rngs = {"params": params_rng, "dropout": dropout_rng}
# 使用module的init方法初始化模型参数
module_init_outputs = self.module.init(
rngs,
input_ids,
attention_mask,
position_ids,
return_dict=False,
)
# 获取随机初始化的模型参数random_params
random_params = module_init_outputs["params"]
# 如果params不为None,则将随机参数和给定参数params进行扁平化处理并填充缺失键
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
return random_params
# 初始化缓存函数,用于快速自回归解码
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
用于快速自回归解码的批量大小。定义了初始化缓存的批处理大小。
max_length (`int`):
自动回归解码的最大可能长度。定义了初始化缓存的序列长度。
"""
# 初始化input_ids为全1数组,形状为(batch_size, max_length),数据类型为"i4"
input_ids = jnp.ones((batch_size, max_length), dtype="i4")
# 初始化attention_mask为与input_ids形状相同的全1数组,数据类型为"i4"
attention_mask = jnp.ones_like(input_ids, dtype="i4")
# 初始化position_ids为广播形式的input_ids的序列长度数组
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# 使用module的init方法初始化模型变量,设置init_cache为True以初始化缓存
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
# 返回解除冻结后的缓存变量
return unfreeze(init_variables["cache"])
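A usage sketch of `init_cache` with a deliberately tiny, hypothetical config: the returned pytree holds zero-initialized `cached_key`/`cached_value` tensors of shape `(batch, max_length, num_heads, head_dim)` plus a scalar `cache_index` per attention layer.

```python
import jax
from transformers import OPTConfig, FlaxOPTForCausalLM

config = OPTConfig(
    vocab_size=100, hidden_size=16, word_embed_proj_dim=16, num_hidden_layers=2,
    num_attention_heads=2, ffn_dim=32, max_position_embeddings=32,
)
model = FlaxOPTForCausalLM(config)
cache = model.init_cache(batch_size=1, max_length=8)
# inspect the shapes without relying on the exact nesting of the pytree
print(jax.tree_util.tree_map(lambda x: x.shape, cache))
```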
def __call__(
self,
input_ids: jnp.ndarray,
attention_mask: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
params: dict = None,
past_key_values: dict = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
dropout_rng: PRNGKey = None,
deterministic: bool = True,
):
# 设置输出注意力机制的标志,如果未指定,则使用配置中的默认值
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 设置输出隐藏状态的标志,如果未指定,则使用配置中的默认值
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 设置返回字典的标志,如果未指定,则使用配置中的默认值
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 如果未提供注意力掩码,则创建一个全为1的掩码
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
# 如果未提供位置编码,则根据注意力掩码累积的结果生成位置编码
if position_ids is None:
position_ids = (attention_mask.cumsum(axis=1) * attention_mask) - 1
# 处理可能需要的任何伪随机数生成器
rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
# 准备模型输入字典
inputs = {"params": params or self.params}
# 如果提供了过去的键值对,则将其缓存放入输入中,并标记为可变
if past_key_values:
inputs["cache"] = past_key_values
mutable = ["cache"]
else:
mutable = False
# 应用模型的前向传播
outputs = self.module.apply(
inputs,
input_ids=jnp.array(input_ids, dtype="i4"),
attention_mask=jnp.array(attention_mask, dtype="i4"),
position_ids=jnp.array(position_ids, dtype="i4"),
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
rngs=rngs,
mutable=mutable,
)
# 如果同时传递了过去的键值对和return_dict为True,则将更新后的缓存添加到模型输出中
if past_key_values is not None and return_dict:
outputs, past_key_values = outputs
outputs["past_key_values"] = unfreeze(past_key_values["cache"])
return outputs
# 如果同时传递了过去的键值对和return_dict为False,则将更新后的缓存插入到模型输出的适当位置
elif past_key_values is not None and not return_dict:
outputs, past_key_values = outputs
outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
# 返回模型的输出结果
return outputs
class FlaxOPTModule(nn.Module):
config: OPTConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 初始化解码器对象,使用给定的配置和数据类型
self.decoder = FlaxOPTDecoder(self.config, dtype=self.dtype)
def _get_decoder_module(self):
return self.decoder
def __call__(
self,
input_ids,
attention_mask,
position_ids,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
init_cache=False,
):
# 调用解码器对象进行前向传播
decoder_outputs = self.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
init_cache=init_cache,
)
if not return_dict:
return decoder_outputs
# 返回经过模型输出的结果,作为 FlaxBaseModelOutput 对象
return FlaxBaseModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
hidden_states=decoder_outputs.hidden_states,
attentions=decoder_outputs.attentions,
)
# 从 transformers.models.bart.modeling_flax_bart.FlaxBartModel 复制而来,将 Bart 换成 OPT
class FlaxOPTModel(FlaxOPTPreTrainedModel):
config: OPTConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
module_class = FlaxOPTModule
# 添加函数签名的示例文档到 FlaxOPTModel 类中
append_call_sample_docstring(FlaxOPTModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
@add_start_docstrings(
"The bare OPT Model transformer outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
class FlaxOPTForCausalLMModule(nn.Module):
config: OPTConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
# 初始化 OPT 模型和语言模型头部
self.model = FlaxOPTModule(config=self.config, dtype=self.dtype)
self.lm_head = nn.Dense(
self.config.vocab_size,
use_bias=False,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
def __call__(
self,
input_ids,
attention_mask,
position_ids,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
):
# run the base model forward pass
outputs = self.model(
input_ids,
attention_mask,
position_ids,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
# take the last hidden states from the model output
hidden_states = outputs[0]
# if the config ties word embeddings, reuse the decoder's embedding matrix as the LM head kernel
if self.config.tie_word_embeddings:
shared_embedding = self.model.variables["params"]["decoder"]["embed_tokens"]["embedding"]
# apply the shared embedding to the hidden states to obtain the language-modeling logits
lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
# otherwise compute the logits with the dedicated LM head
lm_logits = self.lm_head(hidden_states)
# if a plain tuple is requested, return the logits together with the remaining outputs
if not return_dict:
return (lm_logits,) + outputs[1:]
# return a FlaxMaskedLMOutput holding the logits, hidden states, and attentions
return FlaxMaskedLMOutput(
logits=lm_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""
OPT Model with a language modeling head on top (linear layer with weights tied to the input embeddings) e.g for
autoregressive tasks.
"""
@add_start_docstrings(
"""
OPT Model with a language modeling head on top (linear layer with weights tied to the input embeddings) e.g for
autoregressive tasks.
""",
OPT_START_DOCSTRING,
)
class FlaxOPTForCausalLM(FlaxOPTPreTrainedModel):
# 使用 FlaxOPTForCausalLMModule 作为模块类
module_class = FlaxOPTForCausalLMModule
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# initializing the cache
batch_size, seq_length = input_ids.shape
# initialize the cache that will be used during generation
past_key_values = self.init_cache(batch_size, max_length)
# Since the decoder uses a causal mask, the attention_mask would usually only need 0's for positions beyond
# input_ids.shape[-1] and before cache_length, but those positions are already masked by the causal mask.
# We can therefore create a single static attention_mask here, which compiles more efficiently.
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if attention_mask is not None:
# derive the position ids from the attention mask
position_ids = attention_mask.cumsum(axis=1) - 1
# write the provided attention mask into the static extended mask with a dynamic slice
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
else:
# if no attention_mask was passed, simply broadcast the position ids
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# update the generation inputs: carry over past_key_values and advance position_ids
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
return model_kwargs
# Add a call example docstring to the class
append_call_sample_docstring(
FlaxOPTForCausalLM,
_CHECKPOINT_FOR_DOC,
FlaxBaseModelOutput,
_CONFIG_FOR_DOC,
)
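Putting the two hooks above together, `generate()` drives the cached decoding loop: it calls `prepare_inputs_for_generation` once and `update_inputs_for_generation` after every step. A usage sketch (checkpoint name taken from `_CHECKPOINT_FOR_DOC`, purely for illustration):

```python
from transformers import AutoTokenizer, FlaxOPTForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = FlaxOPTForCausalLM.from_pretrained("facebook/opt-350m")

inputs = tokenizer("Hey, are you conscious?", return_tensors="np")
# greedy decoding; the cache and position ids are handled by the hooks above
outputs = model.generate(**inputs, max_length=20, do_sample=False)
print(tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True))
```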
.\models\opt\modeling_opt.py
"""
PyTorch OPT model.
"""
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from .configuration_opt import OPTConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/opt-350m"
_CONFIG_FOR_DOC = "OPTConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ArthurZ/opt-350m-dummy-sc"
_SEQ_CLASS_EXPECTED_LOSS = 1.71
_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_0'"
OPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/opt-125m",
"facebook/opt-350m",
"facebook/opt-1.3b",
"facebook/opt-2.7b",
"facebook/opt-6.7b",
"facebook/opt-13b",
"facebook/opt-30b",
]
def _get_unpad_data(attention_mask):
"""
Get indices, cumulative sequence lengths, and maximum sequence length from attention mask.
Args:
attention_mask (torch.Tensor): Attention mask tensor.
Returns:
Tuple: Tuple containing:
- indices (torch.Tensor): Indices of attention mask where True.
- cu_seqlens (torch.Tensor): Cumulative sequence lengths.
- max_seqlen_in_batch (int): Maximum sequence length in the batch.
"""
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return indices, cu_seqlens, max_seqlen_in_batch
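A small worked example of `_get_unpad_data` for a padded batch holding sequences of lengths 2 and 3:

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 0, 0],
                               [1, 1, 1, 0]])
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)             # tensor([2, 3])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # tensor([0, 1, 4, 5, 6])
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
print(cu_seqlens)                                                             # tensor([0, 2, 5], dtype=torch.int32)
print(seqlens_in_batch.max().item())                                          # 3
```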
class OPTLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
attention_mask = attention_mask.long()
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
positions = positions[:, past_key_values_length:]
return super().forward(positions + self.offset)
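A quick check of how positions are derived from a left-padded attention mask (values made up): padding slots resolve to -1 and, after the offset of 2, to the reserved row 1 of the embedding table, while real tokens start at row 2.

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])   # two left-padding tokens
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask) - 1
print(positions)        # tensor([[-1, -1,  0,  1,  2]])
print(positions + 2)    # tensor([[1, 1, 2, 3, 4]]) -- rows actually looked up
```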
class OptFlashAttention2(OPTAttention):
"""
OPT flash attention module. This module inherits from `OPTAttention` as the weights of the module stays untouched.
The only required change would be on the forward pass where it needs to correctly call the public API of flash
attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
OPT_ATTENTION_CLASSES = {
"eager": OPTAttention,
"flash_attention_2": OptFlashAttention2,
}
class OPTDecoderLayer(nn.Module):
def __init__(self, config: OPTConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = OPT_ATTENTION_CLASSES[config._attn_implementation](config=config, is_decoder=True)
self.do_layer_norm_before = config.do_layer_norm_before
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.self_attn_layer_norm = nn.LayerNorm(
self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
)
self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias)
self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias)
self.final_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
):
pass
OPT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`OPTConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare OPT Model outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
class OPTPreTrainedModel(PreTrainedModel):
config_class = OPTConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["OPTDecoderLayer"]
_supports_flash_attn_2 = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
OPT_INPUTS_DOCSTRING = r"""
"""
class OPTDecoder(OPTPreTrainedModel):
"""
Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is an `OPTDecoderLayer`.
Args:
config: OPTConfig
"""
def __init__(self, config: OPTConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
if config.word_embed_proj_dim != config.hidden_size:
self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
else:
self.project_out = None
if config.word_embed_proj_dim != config.hidden_size:
self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
else:
self.project_in = None
if config.do_layer_norm_before and not config._remove_final_layer_norm:
self.final_layer_norm = nn.LayerNorm(
config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine
)
else:
self.final_layer_norm = None
self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.gradient_checkpointing = False
self.post_init()
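When `word_embed_proj_dim` differs from `hidden_size`, the two linear projections above bracket the decoder stack: embeddings are projected in before the layers and projected back out afterwards. A quick check with a made-up tiny config:

```python
from transformers import OPTConfig, OPTModel

config = OPTConfig(
    vocab_size=100, hidden_size=64, word_embed_proj_dim=32, num_hidden_layers=2,
    num_attention_heads=4, ffn_dim=128, max_position_embeddings=32,
)
model = OPTModel(config)
print(model.decoder.project_in)   # Linear(in_features=32, out_features=64, bias=False)
print(model.decoder.project_out)  # Linear(in_features=64, out_features=32, bias=False)
```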
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"The bare OPT Model outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
class OPTModel(OPTPreTrainedModel):
def __init__(self, config: OPTConfig):
super().__init__(config)
self.decoder = OPTDecoder(config)
self.post_init()
def get_input_embeddings(self):
return self.decoder.embed_tokens
def set_input_embeddings(self, value):
self.decoder.embed_tokens = value
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
decoder_outputs = self.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return decoder_outputs
return BaseModelOutputWithPast(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
hidden_states=decoder_outputs.hidden_states,
attentions=decoder_outputs.attentions,
)
class OPTForCausalLM(OPTPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = OPTModel(config)
self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
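A tiny illustration (made-up values) of the prefix trimming above: once the cache already covers `past_length` tokens, only the uncovered suffix is fed to the model.

```python
import torch

input_ids = torch.tensor([[10, 11, 12, 13]])
past_length = 3                          # e.g. past_key_values[0][0].shape[2]
input_ids = input_ids[:, past_length:]
print(input_ids)                         # tensor([[13]])
```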
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
"""
The OPT Model transformer with a sequence classification head on top (linear layer).
[`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(OPT_START_DOCSTRING)
class OPTForSequenceClassification(OPTPreTrainedModel):
def __init__(self, config: OPTConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.model = OPTModel(config)
self.score = nn.Linear(config.word_embed_proj_dim, self.num_labels, bias=False)
self.post_init()
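A sketch (not the exact library code) of the "last non-padding token" selection described in the docstring above:

```python
import torch

pad_token_id = 1
input_ids = torch.tensor([[5, 6, 7, 1, 1],
                          [8, 9, 1, 1, 1]])
# index of the last non-padding token in each row
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
print(sequence_lengths)                  # tensor([2, 1])
# the classification logits are then gathered at those positions, e.g.
# pooled_logits = logits[torch.arange(logits.shape[0]), sequence_lengths]
```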
@add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
output_type=SequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the OPTForSequenceClassification model.
"""
def get_input_embeddings(self):
"""
Retrieve the input embeddings from the model's decoder.
"""
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
"""
Set new input embeddings for the model's decoder.
"""
self.model.decoder.embed_tokens = value
"""
The OPT Model transformer with a span classification head on top for extractive question-answering tasks like SQuAD
(a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(OPT_START_DOCSTRING)
class OPTForQuestionAnswering(OPTPreTrainedModel):
def __init__(self, config: OPTConfig):
super().__init__(config)
self.model = OPTModel(config)
self.qa_outputs = nn.Linear(config.word_embed_proj_dim, 2)
self.post_init()
@add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
.\models\opt\modeling_tf_opt.py
""" TF 2.0 OPT model."""
from __future__ import annotations
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
TFSharedEmbeddings,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_opt import OPTConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/opt-350m"
_CONFIG_FOR_DOC = "OPTConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
_CAUSAL_LM_EXPECTED_OUTPUT = (
"Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
)
LARGE_NEGATIVE = -1e8
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
"""
Make causal mask used for bi-directional self-attention.
Args:
input_ids_shape (tf.TensorShape): Shape of input tensor representing input ids.
past_key_values_length (int): Length of past key values for attention mechanism.
Returns:
tf.Tensor: Causal mask tensor for bi-directional self-attention.
"""
bsz = input_ids_shape[0]
tgt_len = input_ids_shape[1]
mask = tf.fill((tgt_len, tgt_len), tf.cast(LARGE_NEGATIVE, tf.float32))
mask = tf.linalg.band_part(mask, 0, -1) - tf.linalg.band_part(mask, 0, 0)
if past_key_values_length > 0:
mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
Args:
mask (tf.Tensor): Tensor representing attention mask.
tgt_len (Optional[int]): Target sequence length (default: None).
Returns:
tf.Tensor: Expanded attention mask tensor.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
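A quick check of the two mask helpers for a 3-token target with one cached step (illustrative only; assumes the module-level helpers above are in scope):

```python
import tensorflow as tf

causal = _make_causal_mask((1, 3), past_key_values_length=1)
print(causal.shape)     # (1, 1, 3, 4)

padding = _expand_mask(tf.constant([[1.0, 1.0, 0.0, 0.0]]), tgt_len=3)
print(padding.shape)    # (1, 1, 3, 4)
# blocked positions hold LARGE_NEGATIVE (-1e8) and effectively vanish after the softmax
```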
class TFOPTLearnedPositionalEmbedding(keras.layers.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
def call(self, attention_mask, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
attention_mask = tf.cast(attention_mask, tf.int64)
positions = tf.math.cumsum(attention_mask, axis=1) * attention_mask - 1
positions = positions[:, past_key_values_length:]
return super().call(positions + self.offset)
class TFOPTAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training: Optional[bool] = False,
**kwargs
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFOPTDecoderLayer(keras.layers.Layer):
def __init__(self, config: OPTConfig, **kwargs):
super().__init__(**kwargs)
self.do_layer_norm_before = config.do_layer_norm_before
self.embed_dim = config.hidden_size
self.self_attn = TFOPTAttention(
embed_dim=self.embed_dim,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
name="self_attn",
is_decoder=True,
)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
attention_mask: np.ndarray | tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
past_key_value: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
training: Optional[bool] = False,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
):
"""
Args:
hidden_states (`tf.Tensor`): 输入到层的张量,形状为 `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`, *可选*): 注意力掩码,形状为 `(batch, 1, tgt_len, src_len)`,
其中填充元素由非常大的负值表示。
layer_head_mask (`tf.Tensor`, *可选*): 给定层中注意力头的掩码,形状为 `(decoder_attention_heads,)`
past_key_value (`Tuple(tf.Tensor)`, *可选*): 缓存的过去键和值投影状态
training (`bool`, *可选*, 默认为 `False`):
是否在训练模式下使用模型(某些模块如 dropout 在训练和评估中的行为不同)。
"""
residual = hidden_states
if self.do_layer_norm_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
if not self.do_layer_norm_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
if self.do_layer_norm_before:
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
if not self.do_layer_norm_before:
hidden_states = self.final_layer_norm(hidden_states)
return (hidden_states, self_attn_weights, present_key_value)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
OPT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`OPTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare OPT Model outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
class TFOPTPreTrainedModel(TFPreTrainedModel):
"""
TFOPT Pretrained Model that inherits from transformers.TFPreTrainedModel
Args:
config: OPTConfig
"""
config_class = OPTConfig
base_model_prefix = "model"
OPT_INPUTS_DOCSTRING = r"""
"""
Args:
input_ids (`tf.Tensor` of shape `({0})`):
输入序列中词汇表中的输入序列标记的索引。
可以使用 [`AutoTokenizer`] 获得这些索引。有关详细信息,请参见 [`PreTrainedTokenizer.encode`] 和 [`PreTrainedTokenizer.__call__`]。
[什么是输入 ID?](../glossary
attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
遮罩,用于在填充标记索引上避免执行注意力操作。遮罩值选择在 `[0, 1]`:
- 1 表示**未遮罩**的标记,
- 0 表示**遮罩**的标记。
[什么是注意力遮罩?](../glossary
head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
用于在编码器中将选定的注意力模块头部置零的遮罩。遮罩值选择在 `[0, 1]`:
- 1 表示**未遮罩**的头部,
- 0 表示**遮罩**的头部。
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
包含注意力块预计算的键和值隐藏状态。可用于加速解码过程。
如果使用 `past_key_values`,用户可以选择只输入最后的 `decoder_input_ids`(这些没有给出其过去键值状态的模型)的形状为 `(batch_size, 1)`,而不是所有 `decoder_input_ids` 的形状为 `(batch_size, sequence_length)`。
use_cache (`bool`, *optional*, defaults to `True`):
如果设置为 `True`,则返回 `past_key_values` 键值状态,可用于加速解码(参见 `past_key_values`)。在训练期间设置为 `False`,在生成期间设置为 `True`。
output_attentions (`bool`, *optional*):
是否返回所有注意力层的注意力张量。有关更多详细信息,请参见返回张量下的 `attentions`。此参数仅在即时模式下可用,在图模式下将使用配置中的值。
output_hidden_states (`bool`, *optional*):
是否返回所有层的隐藏状态。有关更多详细信息,请参见返回张量下的 `hidden_states`。此参数仅在即时模式下可用,在图模式下将使用配置中的值。
return_dict (`bool`, *optional*):
是否返回 [`~utils.ModelOutput`] 而不是普通元组。此参数可以在即时模式下使用,在图模式下将始终设置为 True。
training (`bool`, *optional*, defaults to `False`):
是否在训练模式下使用模型(某些模块如 dropout 模块在训练和评估中有不同的行为)。
"""
@keras_serializable
class TFOPTDecoder(keras.layers.Layer):
config_class = OPTConfig
def __init__(self, config: OPTConfig, **kwargs):
super().__init__(**kwargs)
self.config = config # 初始化配置对象,包含解码器的各种配置参数
self.padding_idx = config.pad_token_id # 设置填充标记的索引
self.layerdrop = config.layerdrop # 设置层跳跃的概率
num_embeddings = config.max_position_embeddings # 获取最大位置编码的数量
self.embed_tokens = TFSharedEmbeddings(
config.vocab_size, config.word_embed_proj_dim, config.pad_token_id, name="embed_tokens"
) # 初始化共享的词嵌入对象
self.embed_positions = TFOPTLearnedPositionalEmbedding(
num_embeddings,
config.hidden_size,
name="embed_positions",
) # 初始化位置编码对象
# 注意:`config._remove_final_layer_norm` 仅用于保持与旧版本的兼容性,
# 在 transformers v4.20.1 之前微调过的检查点需要使用,详见 https://github.com/facebookresearch/metaseq/pull/164
if config.do_layer_norm_before and not config._remove_final_layer_norm:
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
else:
self.final_layer_norm = None # 如果不需要最终的层归一化,则为 None
if config.word_embed_proj_dim != config.hidden_size:
self.project_out = keras.layers.Dense(config.word_embed_proj_dim, name="project_out", use_bias=False)
self.project_in = keras.layers.Dense(config.hidden_size, name="project_in", use_bias=False)
else:
self.project_in = None
self.project_out = None # 如果词嵌入投影维度与隐藏层维度相同,则为 None
self.layers = [TFOPTDecoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)]
self.dropout = keras.layers.Dropout(config.dropout) # 初始化 dropout 层
def get_embed_tokens(self):
return self.embed_tokens # 返回词嵌入对象
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens # 设置新的词嵌入对象
def set_input_embeddings(self, new_embeddings):
self.embed_tokens.vocab_size = new_embeddings.shape[0] # 更新词汇表大小
self.embed_tokens.weight = new_embeddings # 更新词嵌入权重矩阵
def get_input_embeddings(self):
return self.embed_tokens # 返回当前词嵌入对象
def build(self, input_shape=None):
# if the layer is already built, return early to avoid building twice
if self.built:
return
# mark the layer as built
self.built = True
# build the token embedding module if it exists
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
# build the positional embedding module if it exists
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
# build the final layer norm if it exists
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.config.hidden_size])
# build the output projection if it exists
if getattr(self, "project_out", None) is not None:
with tf.name_scope(self.project_out.name):
self.project_out.build([None, None, self.config.hidden_size])
# build the input projection if it exists
if getattr(self, "project_in", None) is not None:
with tf.name_scope(self.project_in.name):
self.project_in.build([None, None, self.config.word_embed_proj_dim])
# build every decoder layer in turn
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
# 使用 keras_serializable 装饰器将类声明为可序列化的 Keras 模型
@keras_serializable
class TFOPTMainLayer(keras.layers.Layer):
# 设置配置类为 OPTConfig
config_class = OPTConfig
# 初始化方法,接受配置对象 config 和其他关键字参数
def __init__(self, config: OPTConfig, **kwargs):
super().__init__(**kwargs)
# 将配置对象 config 存储在实例中
self.config = config
# 创建 TFOPTDecoder 对象,并命名为 "decoder"
self.decoder = TFOPTDecoder(config, name="decoder")
# 获取输入嵌入的方法,返回解码器的嵌入标记
def get_input_embeddings(self):
return self.decoder.embed_tokens
# 设置输入嵌入的方法,用新的嵌入替换解码器的嵌入标记
def set_input_embeddings(self, new_embeddings):
self.decoder.set_input_embeddings(new_embeddings)
# 使用 unpack_inputs 装饰器定义的调用方法,接受多个输入参数,返回 TFBaseModelOutputWithPast 或者 Tensor 元组
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]:
# 根据传入的参数或者配置对象设置输出注意力和隐藏状态
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用解码器对象进行处理,返回结果存储在 outputs 变量中
outputs = self.decoder(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 如果 return_dict 为 False,则直接返回 outputs
if not return_dict:
return outputs
# 否则,构造 TFBaseModelOutputWithPast 对象,返回其中的属性作为输出
return TFBaseModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 构建方法,用于构建模型结构,如果已经构建过则直接返回
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果存在解码器对象,则在解码器的名称空间内构建其结构
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
# 使用 add_start_docstrings 装饰器添加模型的文档字符串说明和 OPT_START_DOCSTRING
@add_start_docstrings(
"The bare TF OPT Model outputting raw hidden-states without any specific head on top.",
OPT_START_DOCSTRING,
)
# 使用 keras_serializable 装饰器将类声明为可序列化的 Keras 模型
@keras_serializable
class TFOPTModel(TFOPTPreTrainedModel):
# 设置配置类为 OPTConfig
config_class = OPTConfig
# 初始化方法,接受配置对象 config 和其他关键字参数
def __init__(self, config: OPTConfig, **kwargs):
super().__init__(config, **kwargs)
# 将配置对象 config 存储在实例中
self.config = config
# 创建 TFOPTMainLayer 对象,并命名为 "model"
self.model = TFOPTMainLayer(config, name="model")
# 获取输入嵌入层,即模型解码器的嵌入标记
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
# 设置输入嵌入层,用新的嵌入进行替换
def set_input_embeddings(self, new_embeddings):
self.model.set_input_embeddings(new_embeddings)
# 使用装饰器 unpack_inputs 解包输入参数,并为模型的 call 方法添加文档字符串
# 该方法用于模型调用,接收多个输入参数,返回模型输出或包含过去键值的对象
@unpack_inputs
@add_start_docstrings_to_model_forward(OPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPast,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]:
# Resolve the attention/hidden-state/cache flags from the arguments, falling back to the config
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Forward everything (input ids, attention mask, head mask, ...) to the main layer
outputs = self.model(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# If a dict output was not requested, return the raw tuple from the main layer
if not return_dict:
return outputs
# Repackage the outputs into a TFBaseModelOutputWithPast (last hidden state, cache, hidden states, attentions)
return TFBaseModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# Output post-processing for serving: include the cache, hidden states and attentions only if enabled in the config
def serving_output(self, output):
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
# Return a TFBaseModelOutputWithPast with the last hidden state plus the optional tensors above
return TFBaseModelOutputWithPast(
last_hidden_state=output.last_hidden_state,
past_key_values=pkv,
hidden_states=hs,
attentions=attns,
)
# Build the model's layer hierarchy
def build(self, input_shape=None):
# Skip if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# If the main layer exists, build it inside its own name scope
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
# Build with an unspecified input shape
self.model.build(None)
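To see the TFOPTModel wiring above end to end, it is enough to load a pretrained checkpoint and run one forward pass. The sketch below is not part of the source; the `facebook/opt-350m` checkpoint name is an assumption, and TF weights may have to be converted from the PyTorch ones on the fly.
```
# Minimal usage sketch (assumption: the facebook/opt-350m checkpoint is available;
# TF weights may be converted from PyTorch if no tf_model.h5 is shipped).
from transformers import AutoTokenizer, TFOPTModel

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = TFOPTModel.from_pretrained("facebook/opt-350m")

inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(**inputs)
# last_hidden_state has shape (batch_size, sequence_length, hidden_size)
print(outputs.last_hidden_state.shape)
```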
@add_start_docstrings(
"""
The OPT Model transformer with a language modeling head on top.
""",
OPT_START_DOCSTRING,
)
@keras_serializable
class TFOPTForCausalLM(TFOPTPreTrainedModel, TFCausalLanguageModelingLoss):
# Use OPTConfig as the configuration class
config_class = OPTConfig
def __init__(self, config: OPTConfig, **kwargs):
# Initialize the parent class with the config
super().__init__(config, **kwargs)
self.config = config
# Create the TFOPTMainLayer, named "model"
self.model = TFOPTMainLayer(config, name="model")
def get_output_embeddings(self):
# The output embeddings are tied to the input embeddings, so return those
return self.model.get_input_embeddings()
def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
# Pull the attention mask out of kwargs, if provided
attention_mask = kwargs.get("attention_mask", None)
# When past_key_values are cached, only the last input token needs to be fed
if past_key_values:
inputs = tf.expand_dims(inputs[:, -1], -1)
# Return the prepared inputs as a dict
return {
"input_ids": inputs,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@unpack_inputs
@replace_return_docstrings(output_type=TFCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithPast,
config_class=_CONFIG_FOR_DOC,
expected_output=_CAUSAL_LM_EXPECTED_OUTPUT,
)
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
labels: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
**kwargs,
):
# Forward-pass body omitted here; see the docstring decorators above for the full documentation
def serving_output(self, output):
# Include the cache only if use_cache is enabled in the config
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
# Include hidden states only if enabled in the config
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
# Include attention weights only if enabled in the config
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
# Return the causal LM output, including the past key values
return TFCausalLMOutputWithPast(
past_key_values=pkv,
hidden_states=hs,
attentions=attns,
loss=output.loss,
logits=output.logits,
)
def build(self, input_shape=None):
# Skip if already built
if self.built:
return
# Mark as built
self.built = True
# If the main layer exists, build it inside its own name scope
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
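During generation, `generate()` calls `prepare_inputs_for_generation` defined above at every decoding step, so once `past_key_values` are cached only the newest token id is fed back in. A hedged sketch follows; the `facebook/opt-350m` checkpoint name is an assumption, not taken from this file.
```
# Hedged generation sketch using the causal LM head defined above.
from transformers import AutoTokenizer, TFOPTForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = TFOPTForCausalLM.from_pretrained("facebook/opt-350m")

input_ids = tokenizer("The capital of France is", return_tensors="tf").input_ids
output_ids = model.generate(input_ids, max_new_tokens=10, use_cache=True)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```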
.\models\opt\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_opt": ["OPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "OPTConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_opt"] = [
"OPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"OPTForCausalLM",
"OPTModel",
"OPTPreTrainedModel",
"OPTForSequenceClassification",
"OPTForQuestionAnswering",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_opt"] = ["TFOPTForCausalLM", "TFOPTModel", "TFOPTPreTrainedModel"]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_opt"] = [
"FlaxOPTForCausalLM",
"FlaxOPTModel",
"FlaxOPTPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_opt import OPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OPTConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_opt import (
OPT_PRETRAINED_MODEL_ARCHIVE_LIST,
OPTForCausalLM,
OPTForQuestionAnswering,
OPTForSequenceClassification,
OPTModel,
OPTPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_opt import TFOPTForCausalLM, TFOPTModel, TFOPTPreTrainedModel
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_opt import FlaxOPTForCausalLM, FlaxOPTModel, FlaxOPTPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
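The lazy-module pattern above means that importing the package only loads the configuration eagerly; framework-specific classes are resolved on first attribute access and are simply absent when the corresponding framework is missing. A small illustration (assuming only a base `transformers` install):
```
# Illustration of the lazy import behaviour (assumes a base transformers install).
from transformers.models.opt import OPTConfig  # always importable, framework-free

config = OPTConfig()  # builds a default OPT configuration without touching torch/TF/flax

# Framework-specific classes are resolved lazily and fail with an import error
# if the corresponding framework is not installed:
# from transformers.models.opt import OPTModel      # needs torch
# from transformers.models.opt import TFOPTModel    # needs tensorflow
# from transformers.models.opt import FlaxOPTModel  # needs flax
```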
.\models\owlv2\configuration_owlv2.py
""" OWLv2 model configuration"""
import os
from typing import TYPE_CHECKING, Dict, Union
if TYPE_CHECKING:
pass
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
OWLV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/owlv2-base-patch16": "https://huggingface.co/google/owlv2-base-patch16/resolve/main/config.json",
}
class Owlv2TextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`Owlv2TextModel`]. It is used to instantiate an
Owlv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Owlv2
[google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
vocab_size (`int`, *optional*, defaults to 49408):
Vocabulary size of the OWLv2 text model. Defines the number of different tokens that can be represented
by the `inputs_ids` passed when calling [`Owlv2TextModel`].
hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
max_position_embeddings (`int`, *optional*, defaults to 16):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
pad_token_id (`int`, *optional*, defaults to 0):
The id of the padding token in the input sequences.
bos_token_id (`int`, *optional*, defaults to 49406):
The id of the beginning-of-sequence token in the input sequences.
eos_token_id (`int`, *optional*, defaults to 49407):
The id of the end-of-sequence token in the input sequences.
Example:
```
>>> from transformers import Owlv2TextConfig, Owlv2TextModel
>>> # Initializing a Owlv2TextConfig with google/owlv2-base-patch16 style configuration
>>> configuration = Owlv2TextConfig()
>>> # Initializing a Owlv2TextModel (with random weights) from the google/owlv2-base-patch16 style configuration
>>> model = Owlv2TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "owlv2_text_model"
def __init__(
self,
vocab_size=49408,
hidden_size=512,
intermediate_size=2048,
num_hidden_layers=12,
num_attention_heads=8,
max_position_embeddings=16,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
pad_token_id=0,
bos_token_id=49406,
eos_token_id=49407,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.hidden_act = hidden_act
self.layer_norm_eps = layer_norm_eps
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "owlv2":
config_dict = config_dict["text_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
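The `from_pretrained` override above lets the text sub-config be loaded directly from a full OWLv2 checkpoint: when the fetched config has `model_type == "owlv2"`, only its `text_config` dictionary is kept. A hedged sketch, assuming network access and that the checkpoint named in the archive map above is available:
```
# Sketch of the branch above: pulling the text sub-config out of a full "owlv2" checkpoint.
from transformers import Owlv2TextConfig

text_config = Owlv2TextConfig.from_pretrained("google/owlv2-base-patch16")
print(text_config.hidden_size)  # 512 for the base checkpoint
```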
class Owlv2VisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`Owlv2VisionModel`]. It is used to instantiate
an OWLv2 image encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OWLv2
[google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input images.
image_size (`int`, *optional*, defaults to 768):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
Example:
```
>>> from transformers import Owlv2VisionConfig, Owlv2VisionModel
>>> # Initializing a Owlv2VisionModel with google/owlv2-base-patch16 style configuration
>>> configuration = Owlv2VisionConfig()
>>> # Initializing a Owlv2VisionModel model from the google/owlv2-base-patch16 style configuration
>>> model = Owlv2VisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "owlv2_vision_model"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=768,
patch_size=16,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
**kwargs,
):
super().__init__(**kwargs)
# Store the model hyper-parameters
self.hidden_size = hidden_size  # hidden size of the encoder layers
self.intermediate_size = intermediate_size  # feed-forward (intermediate) layer size
self.num_hidden_layers = num_hidden_layers  # number of Transformer layers
self.num_attention_heads = num_attention_heads  # attention heads per layer
self.num_channels = num_channels  # number of input image channels
self.image_size = image_size  # input image resolution
self.patch_size = patch_size  # patch resolution
self.hidden_act = hidden_act  # activation function of the hidden layers
self.layer_norm_eps = layer_norm_eps  # layer-norm epsilon
self.attention_dropout = attention_dropout  # dropout probability on attention weights
self.initializer_range = initializer_range  # std of the truncated-normal initializer
self.initializer_factor = initializer_factor  # scaling factor for weight initialization
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
# Load the config dict and remaining keyword arguments from the pretrained model
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# 如果配置字典中的模型类型是 "owlv2",则使用视觉配置字典
if config_dict.get("model_type") == "owlv2":
config_dict = config_dict["vision_config"]
# Warn if the loaded model type does not match this class's model_type
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build the config instance from the dict
return cls.from_dict(config_dict, **kwargs)
# Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2
class Owlv2Config(PretrainedConfig):
r"""
[`Owlv2Config`] is the configuration class to store the configuration of an [`Owlv2Model`]. It is used to instantiate an OWLv2 model according to the specified arguments, defining the text model and vision model configurations.
Instantiating a configuration with the defaults will yield a configuration similar to that of the OWLv2 [google/owlv2-base-patch16](https://huggingface.co/google/owlv2-base-patch16) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`Owlv2TextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`Owlv2VisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of the text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. The default matches the original OWLv2 implementation.
return_dict (`bool`, *optional*, defaults to `True`):
Whether the model should return a dictionary. If `False`, a tuple is returned.
kwargs (*optional*):
Dictionary of keyword arguments.
"""
model_type = "owlv2"
def __init__(
self,
text_config=None,
vision_config=None,
projection_dim=512,
logit_scale_init_value=2.6592,
return_dict=True,
**kwargs,
):
super().__init__(**kwargs)
if text_config is None:
text_config = {}
logger.info("text_config is None. Initializing the Owlv2TextConfig with default values.")
if vision_config is None:
vision_config = {}
logger.info("vision_config is None. initializing the Owlv2VisionConfig with default values.")
# Initialize the Owlv2TextConfig and Owlv2VisionConfig objects from the given dicts
self.text_config = Owlv2TextConfig(**text_config)
self.vision_config = Owlv2VisionConfig(**vision_config)
# Store the projection dimension, the logit_scale initial value and the return_dict option
self.projection_dim = projection_dim
self.logit_scale_init_value = logit_scale_init_value
self.return_dict = return_dict
self.initializer_factor = 1.0
@classmethod
# Class method: load the configuration from a pretrained model name or path and return a PretrainedConfig
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
# Set the token in the keyword arguments
cls._set_token_in_kwargs(kwargs)
# Fetch the config dict and updated keyword arguments for the pretrained model
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# 如果配置字典中包含 "model_type" 键且类有 "model_type" 属性,并且它们不一致,发出警告
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
# Build and return the config object from the dict
return cls.from_dict(config_dict, **kwargs)
@classmethod
def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs):
r"""
Instantiate an [`Owlv2Config`] (or a derived class) from an owlv2 text model configuration and an owlv2 vision model configuration.
Returns:
[`Owlv2Config`]: An instance of a configuration object
"""
# Build a config dict holding the text and vision sub-configs
config_dict = {}
config_dict["text_config"] = text_config
config_dict["vision_config"] = vision_config
# Build and return the config object from the dict
return cls.from_dict(config_dict, **kwargs)
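`from_text_vision_configs` simply nests the two dictionaries under `text_config` and `vision_config` and defers to `from_dict`. A minimal sketch of composing a full config from explicit sub-configs (parameter values here are illustrative, not mandated by the source):
```
# Minimal sketch: composing an Owlv2Config from explicit sub-configs.
from transformers import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig

text_config = Owlv2TextConfig(hidden_size=512, num_attention_heads=8)
vision_config = Owlv2VisionConfig(hidden_size=768, image_size=960, patch_size=16)

config = Owlv2Config.from_text_vision_configs(text_config.to_dict(), vision_config.to_dict())
print(config.text_config.hidden_size, config.vision_config.image_size)  # 512 960
```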
.\models\owlv2\convert_owlv2_to_hf.py
import argparse
import collections
import os
import jax
import jax.numpy as jnp
import numpy as np
import torch
from flax.training import checkpoints
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
CLIPTokenizer,
Owlv2Config,
Owlv2ForObjectDetection,
Owlv2ImageProcessor,
Owlv2Processor,
Owlv2TextConfig,
Owlv2VisionConfig,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_owlv2_config(model_name):
if "large" in model_name:
image_size = 1008
patch_size = 14
vision_hidden_size = 1024
vision_intermediate_size = 4096
vision_num_hidden_layers = 24
vision_num_attention_heads = 16
projection_dim = 768
text_hidden_size = 768
text_intermediate_size = 3072
text_num_attention_heads = 12
text_num_hidden_layers = 12
else:
image_size = 960
patch_size = 16
vision_hidden_size = 768
vision_intermediate_size = 3072
vision_num_hidden_layers = 12
vision_num_attention_heads = 12
projection_dim = 512
text_hidden_size = 512
text_intermediate_size = 2048
text_num_attention_heads = 8
text_num_hidden_layers = 12
vision_config = Owlv2VisionConfig(
patch_size=patch_size,
image_size=image_size,
hidden_size=vision_hidden_size,
num_hidden_layers=vision_num_hidden_layers,
intermediate_size=vision_intermediate_size,
num_attention_heads=vision_num_attention_heads,
)
text_config = Owlv2TextConfig(
hidden_size=text_hidden_size,
intermediate_size=text_intermediate_size,
num_attention_heads=text_num_attention_heads,
num_hidden_layers=text_num_hidden_layers,
)
config = Owlv2Config(
text_config=text_config.to_dict(),
vision_config=vision_config.to_dict(),
projection_dim=projection_dim,
)
return config
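The branching in `get_owlv2_config` only keys off whether "large" appears in the model name; every other hyper-parameter is fixed per branch. A quick check of both branches, relying only on the function defined above:
```
# Quick sanity check of the two branches above (relies only on get_owlv2_config).
large = get_owlv2_config("owlv2-large-patch14")
assert large.vision_config.hidden_size == 1024 and large.projection_dim == 768

base = get_owlv2_config("owlv2-base-patch16")
assert base.text_config.hidden_size == 512 and base.vision_config.patch_size == 16
```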
def flatten_nested_dict(params, parent_key="", sep="/"):
items = []
for k, v in params.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, collections.abc.MutableMapping):  # collections.MutableMapping was removed in Python 3.10
items.extend(flatten_nested_dict(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
return dict(items)
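`flatten_nested_dict` turns the nested Flax parameter tree into a flat dict keyed by "/"-joined paths, which is exactly what `create_rename_keys` below matches against. A small self-contained example:
```
# Small example of the flattening behaviour used on the Flax parameter tree.
params = {"backbone": {"clip": {"logit_scale": 4.6}}, "class_head": {"Dense_0": {"bias": [0.0]}}}
flat = flatten_nested_dict(params)
print(flat)  # {'backbone/clip/logit_scale': 4.6, 'class_head/Dense_0/bias': [0.0]}
```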
def create_rename_keys(config, model_name):
rename_keys = []
rename_keys.append(("backbone/clip/visual/class_embedding", "owlv2.vision_model.embeddings.class_embedding"))
rename_keys.append(("backbone/clip/visual/conv1/kernel", "owlv2.vision_model.embeddings.patch_embedding.weight"))
rename_keys.append(("backbone/clip/visual/positional_embedding", "owlv2.vision_model.embeddings.position_embedding.weight"))
rename_keys.append(("backbone/clip/visual/ln_pre/scale", "owlv2.vision_model.pre_layernorm.weight"))
rename_keys.append(("backbone/clip/visual/ln_pre/bias", "owlv2.vision_model.pre_layernorm.bias"))
rename_keys.append(("backbone/clip/visual/ln_post/scale", "owlv2.vision_model.post_layernorm.weight"))
rename_keys.append(("backbone/clip/visual/ln_post/bias", "owlv2.vision_model.post_layernorm.bias"))
rename_keys.append(("backbone/clip/text/token_embedding/embedding", "owlv2.text_model.embeddings.token_embedding.weight"))
rename_keys.append(("backbone/clip/text/positional_embedding", "owlv2.text_model.embeddings.position_embedding.weight"))
rename_keys.append(("backbone/clip/text/ln_final/scale", "owlv2.text_model.final_layer_norm.weight"))
rename_keys.append(("backbone/clip/text/ln_final/bias", "owlv2.text_model.final_layer_norm.bias"))
rename_keys.append(("backbone/clip/logit_scale", "owlv2.logit_scale"))
rename_keys.append(("backbone/clip/text/text_projection/kernel", "owlv2.text_projection.weight"))
rename_keys.append(("backbone/merged_class_token/scale", "layer_norm.weight"))
rename_keys.append(("backbone/merged_class_token/bias", "layer_norm.bias"))
rename_keys.append(("class_head/Dense_0/kernel", "class_head.dense0.weight"))
rename_keys.append(("class_head/Dense_0/bias", "class_head.dense0.bias"))
rename_keys.append(("class_head/logit_shift/kernel", "class_head.logit_shift.weight"))
rename_keys.append(("class_head/logit_scale/kernel", "class_head.logit_scale.weight"))
rename_keys.append(("class_head/logit_scale/bias", "class_head.logit_scale.bias"))
rename_keys.append(("class_head/logit_shift/bias", "class_head.logit_shift.bias"))
rename_keys.append(("obj_box_head/Dense_0/kernel", "box_head.dense0.weight"))
rename_keys.append(("obj_box_head/Dense_0/bias", "box_head.dense0.bias"))
rename_keys.append(("obj_box_head/Dense_1/kernel", "box_head.dense1.weight"))
rename_keys.append(("obj_box_head/Dense_1/bias", "box_head.dense1.bias"))
rename_keys.append(("obj_box_head/Dense_2/kernel", "box_head.dense2.weight"))
rename_keys.append(("obj_box_head/Dense_2/bias", "box_head.dense2.bias"))
if "v2" in model_name:
rename_keys.append(("objectness_head/Dense_0/kernel", "objectness_head.dense0.weight"))
rename_keys.append(("objectness_head/Dense_0/bias", "objectness_head.dense0.bias"))
rename_keys.append(("objectness_head/Dense_1/kernel", "objectness_head.dense1.weight"))
rename_keys.append(("objectness_head/Dense_1/bias", "objectness_head.dense1.bias"))
rename_keys.append(("objectness_head/Dense_2/kernel", "objectness_head.dense2.weight"))
rename_keys.append(("objectness_head/Dense_2/bias", "objectness_head.dense2.bias"))
return rename_keys
val = dct.pop(old)
if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "vision" in new:
val = val.reshape(-1, config.vision_config.hidden_size)
if ("out_proj" in new or "v_proj" in new or "k_proj" in new or "q_proj" in new) and "text" in new:
val = val.reshape(-1, config.text_config.hidden_size)
if "patch_embedding" in new:
print("Reshaping patch embedding... for", new)
val = val.transpose(3, 2, 0, 1)
elif new.endswith("weight") and "position_embedding" not in new and "token_embedding" not in new:
val = val.T
if new.endswith("bias"):
val = val.reshape(-1)
dct[new] = torch.from_numpy(np.array(val))
inputs = processor(text=texts, images=image, return_tensors="pt")
if "large" not in model_name:
assert torch.allclose(inputs.pixel_values, original_pixel_values.float(), atol=1e-6)
assert torch.allclose(inputs.input_ids[:4, :], original_input_ids[:4, :], atol=1e-6)
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
pred_boxes = outputs.pred_boxes
objectness_logits = outputs.objectness_logits
else:
print("Model converted without verifying logits")
if pytorch_dump_folder_path is not None:
print("Saving model and processor locally...")
if not os.path.isdir(pytorch_dump_folder_path):
os.mkdir(pytorch_dump_folder_path)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing {model_name} to the hub...")
model.push_to_hub(f"google/{model_name}")
processor.push_to_hub(f"google/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="owlv2-base-patch16",
choices=[
"owlv2-base-patch16",
"owlv2-base-patch16-finetuned",
"owlv2-base-patch16-ensemble",
"owlv2-large-patch14",
"owlv2-large-patch14-finetuned",
"owlv2-large-patch14-ensemble",
],
type=str,
help="Name of the Owlv2 model you'd like to convert from FLAX to PyTorch."
)
parser.add_argument(
"--checkpoint_path",
default=None,
type=str,
required=True,
help="Path to the original Flax checkpoint."
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
required=False,
help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--verify_logits",
action="store_false",
required=False,
help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Push model and image preprocessor to the hub"
)
args = parser.parse_args()
convert_owlv2_checkpoint(
args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits
)