Transformers 源码解析（六十八）

`.\models\lxmert\modeling_tf_lxmert.py`

# coding=utf-8
# 定义文件编码为 UTF-8

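# Copyright: The Google AI Language Team Authors, The HuggingFace Inc. team, and the Lxmert Authors.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.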

""" TF 2.0 LXMERT model."""

from __future__ import annotations

import warnings
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

# 从内部库中导入相关模块和函数
from ...activations_tf import get_tf_activation
from ...modeling_tf_utils import (
    TFModelInputType,
    TFPreTrainedModel,
    get_initializer,
    keras,
    keras_serializable,
    shape_list,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, stable_softmax
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_lxmert import LxmertConfig

# 获取全局日志记录器对象
logger = logging.get_logger(__name__)

# 以下是文档示例中使用的模型检查点和配置信息
_CHECKPOINT_FOR_DOC = "unc-nlp/lxmert-base-uncased"
_CONFIG_FOR_DOC = "LxmertConfig"

# 定义 TF LXMERT 预训练模型的存档列表
TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "unc-nlp/lxmert-base-uncased",
]

# Output container for the bare LXMERT model: last hidden states of the language and visual streams,
# the pooled output, and (optionally) per-layer hidden states and attention probabilities.
@dataclass
class TFLxmertModelOutput(ModelOutput):
    """
    Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
    visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
    encoder)

    Args:
        language_output (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the language encoder.
        vision_output (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the visual encoder.
        pooled_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (the CLS token), further processed by a
            Linear layer and a Tanh activation function.
        language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        vision_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        language_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attention
            weights after the attention softmax, used to compute the weighted average in the self-attention heads.
        vision_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attention
            weights after the attention softmax, used to compute the weighted average in the self-attention heads.
        cross_encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attention
            weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    """

    # One field per documented output; every field is optional and defaults to None.
    language_output: tf.Tensor | None = None
    vision_output: tf.Tensor | None = None
    pooled_output: tf.Tensor | None = None
    language_hidden_states: Tuple[tf.Tensor] | None = None
    vision_hidden_states: Tuple[tf.Tensor] | None = None
    language_attentions: Tuple[tf.Tensor] | None = None
    vision_attentions: Tuple[tf.Tensor] | None = None
    cross_encoder_attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFLxmertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`LxmertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        cross_relationship_score (`tf.Tensor` of shape `(batch_size, 2)`):
            Prediction scores of the textual matching objective (classification) head (scores of True/False
            continuation before SoftMax).
        question_answering_score (`tf.Tensor` of shape `(batch_size, n_qa_answers)`):
            Prediction scores of question answering objective (classification).
        language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        vision_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        language_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        vision_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    # Each field is declared exactly once, in the order the docstring lists them; all default to None.

    # Total pre-training loss.
    loss: tf.Tensor | None = None
    # Language modeling head scores before softmax.
    prediction_logits: tf.Tensor | None = None
    # Textual matching head scores before softmax.
    cross_relationship_score: tf.Tensor | None = None
    # Question answering head scores.
    question_answering_score: tf.Tensor | None = None
    # Per-layer hidden states of the language stream.
    language_hidden_states: Tuple[tf.Tensor] | None = None
    # Per-layer hidden states of the vision stream.
    vision_hidden_states: Tuple[tf.Tensor] | None = None
    # Attention probabilities of the language stream.
    language_attentions: Tuple[tf.Tensor] | None = None
    # Attention probabilities of the vision stream.
    vision_attentions: Tuple[tf.Tensor] | None = None
    # Attention probabilities of the cross-modality encoder.
    cross_encoder_attentions: Tuple[tf.Tensor] | None = None
class TFLxmertVisualFeatureEncoder(keras.layers.Layer):
    """Embed the visual input by combining projected object features with projected box positions.

    Object features and box positions each go through their own dense projection to `config.hidden_size`
    followed by layer normalization. The two results are averaged and passed through dropout.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # Object feature branch: linear projection + layer norm.
        self.visn_fc = keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="visn_fc",
        )
        self.visn_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="visn_layer_norm")

        # Box position branch: linear projection + layer norm.
        self.box_fc = keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="box_fc",
        )
        self.box_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="box_layer_norm")

        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)

        # Input widths are kept so that build() can create the projections without seeing real inputs.
        self.feat_dim = config.visual_feat_dim
        self.pos_dim = config.visual_pos_dim
        self.config = config

    def call(self, visn_input, training=False):
        """Encode a `(features, boxes)` pair into a single tensor.

        Args:
            visn_input: two-element sequence `(feats, boxes)`.
            training: forwarded to the dropout layer.

        Returns:
            The average of the two normalized projections, after dropout.
        """
        object_feats, box_coords = visn_input

        feat_embedding = self.visn_layer_norm(self.visn_fc(object_feats))
        box_embedding = self.box_layer_norm(self.box_fc(box_coords))

        # The two branches are averaged, not concatenated or summed.
        averaged = (feat_embedding + box_embedding) / 2
        return self.dropout(averaged, training=training)

    def build(self, input_shape=None):
        """Create the sub-layer weights once, each under a name scope matching the sub-layer's name."""
        if self.built:
            return
        self.built = True

        feat_proj = getattr(self, "visn_fc", None)
        if feat_proj is not None:
            with tf.name_scope(feat_proj.name):
                feat_proj.build([None, None, self.feat_dim])

        feat_norm = getattr(self, "visn_layer_norm", None)
        if feat_norm is not None:
            with tf.name_scope(feat_norm.name):
                feat_norm.build([None, None, self.config.hidden_size])

        box_proj = getattr(self, "box_fc", None)
        if box_proj is not None:
            with tf.name_scope(box_proj.name):
                box_proj.build([None, None, self.pos_dim])

        box_norm = getattr(self, "box_layer_norm", None)
        if box_norm is not None:
            with tf.name_scope(box_norm.name):
                box_norm.build([None, None, self.config.hidden_size])


class TFLxmertEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.hidden_size = config.hidden_size
        self.max_position_embeddings = config.max_position_embeddings
        self.initializer_range = config.initializer_range

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def build(self, input_shape=None):
        """Create the three embedding tables and build the layer norm.

        Each table is created under its own name scope. Two of them share the variable name
        "embeddings", so the scope is what distinguishes them. The guard at the top makes a repeated
        call a no-op instead of creating a second set of weights.
        """
        if self.built:
            return
        self.built = True

        with tf.name_scope("word_embeddings"):
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.hidden_size],
                initializer=get_initializer(initializer_range=self.initializer_range),
            )

        with tf.name_scope("token_type_embeddings"):
            self.token_type_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.config.type_vocab_size, self.hidden_size],
                initializer=get_initializer(initializer_range=self.initializer_range),
            )

        with tf.name_scope("position_embeddings"):
            self.position_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.max_position_embeddings, self.hidden_size],
                initializer=get_initializer(initializer_range=self.initializer_range),
            )

        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])

    def call(self, input_ids=None, token_type_ids=None, inputs_embeds=None, training=False):
        """
        Applies embedding based on inputs tensor.

        Args:
            input_ids: token indices looked up in the word embedding table. When given, they take
                precedence over `inputs_embeds`.
            token_type_ids: segment indices. Defaults to all zeros with the shape of the input.
            inputs_embeds: pre-computed word embeddings, used when `input_ids` is None.
            training: forwarded to the dropout layer.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        # At least one of the two input forms is required.
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        # Every dimension except the trailing embedding dimension.
        input_shape = shape_list(inputs_embeds)[:-1]

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        # Positions are 0..seq_len-1 with a leading axis of size 1, so they broadcast in the sum below.
        position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)

        final_embeddings = inputs_embeds + position_embeds + token_type_embeds
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings


class TFLxmertAttention(keras.layers.Layer):
    """Multi-head scaled dot-product attention of `hidden_states` over a `context` tensor.

    Queries come from `hidden_states`; keys and values come from `context`. Passing the same tensor
    for both gives self-attention, and passing a different one gives cross-attention.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        # Divisibility was checked above, so integer division is exact.
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = keras.layers.Dense(
            self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="query",
        )
        self.key = keras.layers.Dense(
            self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="key",
        )
        self.value = keras.layers.Dense(
            self.all_head_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="value",
        )

        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
        # Last-axis size used by build() for the key/value projections.
        self.ctx_dim = config.hidden_size
        self.config = config

    def transpose_for_scores(self, x, batch_size):
        """Split the last axis into heads and move the head axis in front of the sequence axis.

        Reshapes `x` to `(batch_size, -1, num_attention_heads, attention_head_size)` and returns it
        transposed to `(batch_size, num_attention_heads, seq_len, attention_head_size)`.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, hidden_states, context, attention_mask, output_attentions, training=False):
        """Attend from `hidden_states` to `context`.

        Args:
            hidden_states: tensor projected into queries.
            context: tensor projected into keys and values.
            attention_mask: optional mask that is cast to the score dtype and added to the raw
                scores before the softmax.
            output_attentions: when truthy, the attention probabilities are returned as well.
            training: forwarded to the dropout layer.

        Returns:
            `(context_layer,)`, or `(context_layer, attention_probs)` when `output_attentions` is truthy.
        """
        batch_size = shape_list(hidden_states)[0]
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(context)
        mixed_value_layer = self.value(context)

        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

        # Raw scores: (batch size, num_heads, seq_len_q, seq_len_k)
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        # Scale by sqrt(head size).
        dk = tf.cast(shape_list(key_layer)[-1], dtype=attention_scores.dtype)
        attention_scores = attention_scores / tf.math.sqrt(dk)

        if attention_mask is not None:
            attention_mask = tf.cast(attention_mask, dtype=attention_scores.dtype)
            attention_scores = attention_scores + attention_mask

        # Normalize the scores to probabilities.
        attention_probs = stable_softmax(attention_scores, axis=-1)

        # Dropout is applied to the probabilities themselves.
        attention_probs = self.dropout(attention_probs, training=training)
        context_layer = tf.matmul(attention_probs, value_layer)

        # Merge the heads back: (batch_size, seq_len_q, all_head_size)
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size))

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs

    def build(self, input_shape=None):
        """Create the query/key/value projection weights once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.hidden_size])
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.ctx_dim])
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                self.value.build([None, None, self.ctx_dim])
# Feed-forward "intermediate" block: dense projection followed by the configured activation.
class TFLxmertIntermediate(keras.layers.Layer):
    """Project hidden states to `config.intermediate_size` and apply `config.hidden_act`."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        # `hidden_act` may be an activation name (resolved here) or something already callable.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
        self.config = config

    def call(self, hidden_states):
        """Return `activation(dense(hidden_states))`."""
        return self.intermediate_act_fn(self.dense(hidden_states))

    def build(self, input_shape=None):
        """Create the dense weights once, under a name scope matching the dense layer's name."""
        if self.built:
            return
        self.built = True

        projection = getattr(self, "dense", None)
        if projection is not None:
            with tf.name_scope(projection.name):
                # The projection consumes hidden-size inputs.
                projection.build([None, None, self.config.hidden_size])


# Feed-forward output block: project back to the hidden size, then dropout, residual add and layer norm.
class TFLxmertOutput(keras.layers.Layer):
    """Map intermediate activations back to `config.hidden_size` and merge them with a residual input."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states, input_tensor, training=False):
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`.

        Args:
            hidden_states: tensor fed to the dense projection.
            input_tensor: residual added before layer normalization.
            training: forwarded to the dropout layer.
        """
        hidden_states = self.dense(hidden_states)
        # Passed by keyword, matching TFLxmertAttentionOutput.
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

    def build(self, input_shape=None):
        """Create the sub-layer weights once, each under a name scope matching the sub-layer's name."""
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                # The dense layer consumes intermediate-size inputs.
                self.dense.build([None, None, self.config.intermediate_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])


# Attention output block: dense projection, dropout, residual add and layer norm.
class TFLxmertAttentionOutput(keras.layers.Layer):
    """Project an attention result with a dense layer and merge it with a residual input."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states, input_tensor, training=False):
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`.

        Args:
            hidden_states: tensor fed to the dense projection.
            input_tensor: residual added before layer normalization.
            training: forwarded to the dropout layer.
        """
        projected = self.dropout(self.dense(hidden_states), training=training)
        return self.LayerNorm(projected + input_tensor)

    def build(self, input_shape=None):
        """Create the sub-layer weights once, each under a name scope matching the sub-layer's name."""
        if self.built:
            return
        self.built = True

        projection = getattr(self, "dense", None)
        if projection is not None:
            with tf.name_scope(projection.name):
                projection.build([None, None, self.config.hidden_size])

        norm = getattr(self, "LayerNorm", None)
        if norm is not None:
            with tf.name_scope(norm.name):
                norm.build([None, None, self.config.hidden_size])
# Self-attention layer: attention of a tensor over itself, followed by the attention output block.
class TFLxmertSelfAttentionLayer(keras.layers.Layer):
    """Apply `TFLxmertAttention` with the input as both query source and context, then `TFLxmertAttentionOutput`."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.self = TFLxmertAttention(config, name="self")
        self.attention_output = TFLxmertAttentionOutput(config, name="output")

    def call(self, input_tensor, attention_mask, output_attentions, training=False):
        """Run self-attention over `input_tensor`.

        Args:
            input_tensor: tensor used as both the query source and the context.
            attention_mask: mask forwarded to the attention layer.
            output_attentions: when truthy, the attention probabilities are returned as well.
            training: forwarded explicitly to both sub-layers, as TFLxmertCrossAttentionLayer does.

        Returns:
            `(attention_output,)`, or `(attention_output, attention_probs)` when `output_attentions` is truthy.
        """
        # Self attention attends to itself, so the context is the input tensor.
        self_output = self.self(input_tensor, input_tensor, attention_mask, output_attentions, training=training)
        attention_output = self.attention_output(self_output[0], input_tensor, training=training)
        if output_attentions:
            return (attention_output, self_output[1])
        return (attention_output,)

    def build(self, input_shape=None):
        """Build both sub-layers once, each under a name scope matching the sub-layer's name."""
        if self.built:
            return
        self.built = True
        if getattr(self, "self", None) is not None:
            with tf.name_scope(self.self.name):
                self.self.build(None)
        if getattr(self, "attention_output", None) is not None:
            with tf.name_scope(self.attention_output.name):
                self.attention_output.build(None)


# Cross-attention layer: attention of one tensor over another, followed by the attention output block.
class TFLxmertCrossAttentionLayer(keras.layers.Layer):
    """Apply `TFLxmertAttention` from `input_tensor` to `ctx_tensor`, then `TFLxmertAttentionOutput`."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.att = TFLxmertAttention(config, name="att")
        self.attention_output = TFLxmertAttentionOutput(config, name="output")

    def call(
        self,
        input_tensor,
        ctx_tensor,
        ctx_att_mask,
        output_attentions=False,
        training=False,
    ):
        """Attend from `input_tensor` to `ctx_tensor`.

        Args:
            input_tensor: query source; also the residual added in the output block.
            ctx_tensor: context tensor passed to the attention layer.
            ctx_att_mask: mask forwarded to the attention layer.
            output_attentions: when truthy, the attention probabilities are returned as well.
            training: forwarded to both sub-layers.

        Returns:
            `(attention_output,)`, or `(attention_output, attention_probs)` when `output_attentions` is truthy.
        """
        att_results = self.att(input_tensor, ctx_tensor, ctx_att_mask, output_attentions, training=training)
        attention_output = self.attention_output(att_results[0], input_tensor, training=training)
        if not output_attentions:
            return (attention_output,)
        return (attention_output, att_results[1])

    def build(self, input_shape=None):
        """Build both sub-layers once, each under a name scope matching the sub-layer's name."""
        if self.built:
            return
        self.built = True

        cross_attention = getattr(self, "att", None)
        if cross_attention is not None:
            with tf.name_scope(cross_attention.name):
                cross_attention.build(None)

        output_block = getattr(self, "attention_output", None)
        if output_block is not None:
            with tf.name_scope(output_block.name):
                output_block.build(None)


# Single-stream transformer layer: self-attention followed by the intermediate and output blocks.
class TFLxmertLayer(keras.layers.Layer):
    """Chain `TFLxmertSelfAttentionLayer`, `TFLxmertIntermediate` and `TFLxmertOutput`."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.attention = TFLxmertSelfAttentionLayer(config, name="attention")
        self.intermediate = TFLxmertIntermediate(config, name="intermediate")
        self.transformer_output = TFLxmertOutput(config, name="output")

    def call(self, hidden_states, attention_mask, output_attentions, training=False):
        """Run one layer over `hidden_states`.

        Args:
            hidden_states: input tensor.
            attention_mask: mask forwarded to the self-attention layer.
            output_attentions: forwarded to the self-attention layer.
            training: forwarded to the self-attention layer and the output block.

        Returns:
            A tuple whose first element is the layer output, followed by whatever extra elements
            the self-attention layer returned.
        """
        attention_outputs = self.attention(hidden_states, attention_mask, output_attentions, training=training)
        attended = attention_outputs[0]
        layer_output = self.transformer_output(self.intermediate(attended), attended, training=training)
        # Anything after the first element of the attention result is passed through unchanged.
        return (layer_output,) + attention_outputs[1:]

    def build(self, input_shape=None):
        """Build the three sub-layers once, each under a name scope matching the sub-layer's name."""
        if self.built:
            return
        self.built = True

        self_attention = getattr(self, "attention", None)
        if self_attention is not None:
            with tf.name_scope(self_attention.name):
                self_attention.build(None)

        intermediate_block = getattr(self, "intermediate", None)
        if intermediate_block is not None:
            with tf.name_scope(intermediate_block.name):
                intermediate_block.build(None)

        output_block = getattr(self, "transformer_output", None)
        if output_block is not None:
            with tf.name_scope(output_block.name):
                output_block.build(None)
# 定义一个自定义的 Keras 层，用于 TFLxmert 模型中的一个层
class TFLxmertXLayer(keras.layers.Layer):
    """Cross-modality layer: cross-attention, then per-modality self-attention and feed-forward blocks.

    A single cross-attention sub-layer (`visual_attention`) is used for both directions, while the
    self-attention, intermediate and output sub-layers exist separately for the language stream and the
    vision stream.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        # Cross-attention sub-layer, used for both directions in `cross_att`.
        self.visual_attention = TFLxmertCrossAttentionLayer(config, name="visual_attention")

        # Self-attention sub-layers, one per modality.
        self.lang_self_att = TFLxmertSelfAttentionLayer(config, name="lang_self_att")
        self.visn_self_att = TFLxmertSelfAttentionLayer(config, name="visn_self_att")

        # Intermediate and output sub-layers (feed-forward), one pair per modality.
        self.lang_inter = TFLxmertIntermediate(config, name="lang_inter")
        self.lang_output = TFLxmertOutput(config, name="lang_output")
        self.visn_inter = TFLxmertIntermediate(config, name="visn_inter")
        self.visn_output = TFLxmertOutput(config, name="visn_output")

    def cross_att(
        self,
        lang_input,
        lang_attention_mask,
        visn_input,
        visn_attention_mask,
        output_attentions,
        training=False,
    ):
        """Run the shared cross-attention sub-layer in both directions.

        Returns:
            `(lang_att_output, visn_att_output)`: the raw return values of the two `visual_attention` calls
            (language attending to vision, and vision attending to language).
        """
        # Copy the inputs: feeding the same tensor to the layer twice caused problems when saving and
        # loading the Keras model (per the original comment).
        lang_attention_lang_input = tf.identity(lang_input)
        visn_attention_lang_input = tf.identity(lang_input)
        lang_attention_visn_input = tf.identity(visn_input)
        visn_attention_visn_input = tf.identity(visn_input)

        # Language stream attends over the visual stream, masked with the visual mask.
        lang_att_output = self.visual_attention(
            lang_attention_lang_input,
            lang_attention_visn_input,
            visn_attention_mask,
            output_attentions=output_attentions,
            training=training,
        )
        # Visual stream attends over the language stream, masked with the language mask.
        visn_att_output = self.visual_attention(
            visn_attention_visn_input,
            visn_attention_lang_input,
            lang_attention_mask,
            output_attentions=output_attentions,
            training=training,
        )
        return lang_att_output, visn_att_output

    def self_att(
        self,
        lang_input,
        lang_attention_mask,
        visn_input,
        visn_attention_mask,
        training=False,
    ):
        """Apply the per-modality self-attention sub-layers.

        Attention outputs are never requested here (`output_attentions` is fixed to `False`).

        Returns:
            `(lang_output, visn_output)`: the first element of each self-attention result.
        """
        output_attentions = False
        lang_att_output = self.lang_self_att(lang_input, lang_attention_mask, output_attentions, training=training)
        visn_att_output = self.visn_self_att(visn_input, visn_attention_mask, output_attentions, training=training)
        return lang_att_output[0], visn_att_output[0]

    def output_fc(self, lang_input, visn_input, training=False):
        """Apply the intermediate and output sub-layers of each modality.

        Returns:
            `(lang_output, visn_output)`.
        """
        lang_inter_output = self.lang_inter(lang_input)
        visn_inter_output = self.visn_inter(visn_input)

        # `training` is passed by keyword, matching how the output sub-layer is called elsewhere in this file.
        lang_output = self.lang_output(lang_inter_output, lang_input, training=training)
        visn_output = self.visn_output(visn_inter_output, visn_input, training=training)
        return lang_output, visn_output

    def call(
        self,
        lang_feats,
        lang_attention_mask,
        visn_feats,
        visn_attention_mask,
        output_attentions,
        training=False,
    ):
        """Run cross-attention, self-attention and the feed-forward blocks on both streams.

        Args:
            lang_feats: Language-stream input.
            lang_attention_mask: Mask for the language stream.
            visn_feats: Vision-stream input.
            visn_attention_mask: Mask for the vision stream.
            output_attentions: When truthy, the first extra output of the language-side cross-attention
                call is appended to the result.
            training: Forwarded to the sub-layers.

        Returns:
            `(lang_output, visn_output)`, or `(lang_output, visn_output, attention_probs[0])` when
            `output_attentions` is truthy.
        """
        lang_att_output = lang_feats
        visn_att_output = visn_feats

        # Cross attention between the two streams.
        lang_att_output, visn_att_output = self.cross_att(
            lang_att_output,
            lang_attention_mask,
            visn_att_output,
            visn_attention_mask,
            output_attentions,
            training=training,
        )

        # Everything after the first element of the language-side cross-attention result.
        attention_probs = lang_att_output[1:]

        # Self attention within each stream.
        lang_att_output, visn_att_output = self.self_att(
            lang_att_output[0],
            lang_attention_mask,
            visn_att_output[0],
            visn_attention_mask,
            training=training,
        )

        # Feed-forward blocks.
        lang_output, visn_output = self.output_fc(lang_att_output, visn_att_output, training=training)

        return (lang_output, visn_output, attention_probs[0]) if output_attentions else (lang_output, visn_output)

    def build(self, input_shape=None):
        """Build every sub-layer once, each under a name scope equal to its own name."""
        if self.built:
            return
        self.built = True

        # Same build order as the attribute order used by the original code.
        sublayer_names = (
            "visual_attention",
            "lang_self_att",
            "visn_self_att",
            "lang_inter",
            "lang_output",
            "visn_inter",
            "visn_output",
        )
        for attr_name in sublayer_names:
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)
# 定义一个自定义的 TensorFlow 层 TFLxmertEncoder，用于实现 LXMERT 模型的编码器功能
class TFLxmertEncoder(keras.layers.Layer):
    """LXMERT encoder: language layers, visual ("relation") layers and cross-modality layers.

    The visual features are first passed through `visn_fc`; the language layers (`layer`) and the relation
    layers (`r_layers`) then run on their own stream, and finally the cross-modality layers (`x_layers`)
    update both streams together.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # Encoder applied to the visual features and positions before any layer runs.
        self.visn_fc = TFLxmertVisualFeatureEncoder(config, name="visn_fc")

        # Number of layers of each kind, read from the config.
        self.num_l_layers = config.l_layers
        self.num_x_layers = config.x_layers
        self.num_r_layers = config.r_layers

        # Layers.
        # `self.layer` (rather than `self.l_layer`) is used to support loading BERT weights.
        self.layer = [TFLxmertLayer(config, name=f"layer_._{i}") for i in range(self.num_l_layers)]
        self.x_layers = [TFLxmertXLayer(config, name=f"x_layers_._{i}") for i in range(self.num_x_layers)]
        self.r_layers = [TFLxmertLayer(config, name=f"r_layers_._{i}") for i in range(self.num_r_layers)]
        self.config = config

    def call(
        self,
        lang_feats=None,
        lang_attention_mask=None,
        visual_feats=None,
        visual_pos=None,
        visual_attention_mask=None,
        output_attentions=None,
        training=False,
        **kwargs,
    ):
        """Run the language, relation and cross-modality layers.

        Args:
            lang_feats: Language-stream input.
            lang_attention_mask: Mask for the language stream.
            visual_feats: Visual features, passed to `visn_fc` together with `visual_pos`.
            visual_pos: Visual positions, passed to `visn_fc` together with `visual_feats`.
            visual_attention_mask: Mask for the vision stream.
            output_attentions: When truthy, per-layer attention outputs are collected and returned.
            training: Forwarded to every sub-layer.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A 3-tuple `(visual_encoder_outputs, lang_encoder_outputs, cross_encoder_attentions)` where the
            first two are `(hidden_states, attentions)` pairs. `hidden_states` is a tuple with one entry per
            layer that updated the stream; every attentions entry is `None` unless `output_attentions` is
            truthy.
        """
        vision_hidden_states = ()
        language_hidden_states = ()
        # Collect attentions only when the layers are actually asked to return them. The layers receive
        # `output_attentions` itself, and the return below is gated on it as well, so also enabling the
        # accumulators through `config.output_attentions` could only lead to indexing an attention entry
        # that a layer never produced (the cross-modality layer returns just two items in that case).
        vision_attentions = () if output_attentions else None
        language_attentions = () if output_attentions else None
        cross_encoder_attentions = () if output_attentions else None

        visual_feats = self.visn_fc([visual_feats, visual_pos], training=training)

        # Run language layers.
        for layer_module in self.layer:
            l_outputs = layer_module(lang_feats, lang_attention_mask, output_attentions, training=training)
            lang_feats = l_outputs[0]
            language_hidden_states = language_hidden_states + (lang_feats,)
            if language_attentions is not None:
                language_attentions = language_attentions + (l_outputs[1],)

        # Run relational (visual) layers.
        for layer_module in self.r_layers:
            v_outputs = layer_module(
                visual_feats,
                visual_attention_mask,
                output_attentions,
                training=training,
            )
            visual_feats = v_outputs[0]
            vision_hidden_states = vision_hidden_states + (visual_feats,)
            if vision_attentions is not None:
                vision_attentions = vision_attentions + (v_outputs[1],)

        # Run cross-modality layers; each one updates both streams.
        for layer_module in self.x_layers:
            x_outputs = layer_module(
                lang_feats,
                lang_attention_mask,
                visual_feats,
                visual_attention_mask,
                output_attentions,
                training=training,
            )
            lang_feats, visual_feats = x_outputs[:2]
            vision_hidden_states = vision_hidden_states + (visual_feats,)
            language_hidden_states = language_hidden_states + (lang_feats,)
            if cross_encoder_attentions is not None:
                cross_encoder_attentions = cross_encoder_attentions + (x_outputs[2],)

        visual_encoder_outputs = (
            vision_hidden_states,
            vision_attentions if output_attentions else None,
        )
        lang_encoder_outputs = (
            language_hidden_states,
            language_attentions if output_attentions else None,
        )

        return (
            visual_encoder_outputs,
            lang_encoder_outputs,
            cross_encoder_attentions if output_attentions else None,
        )

    def build(self, input_shape=None):
        """Build `visn_fc` and every layer once, each under a name scope equal to its own name."""
        if self.built:
            return
        self.built = True

        if getattr(self, "visn_fc", None) is not None:
            with tf.name_scope(self.visn_fc.name):
                self.visn_fc.build(None)

        # Same order as the original: language layers, cross-modality layers, relation layers.
        for group_name in ("layer", "x_layers", "r_layers"):
            group = getattr(self, group_name, None)
            if group is None:
                continue
            for layer in group:
                with tf.name_scope(layer.name):
                    layer.build(None)
@keras_serializable
class TFLxmertMainLayer(keras.layers.Layer):
    # Main LXMERT layer: owns the embeddings, the encoder and the pooler. The class is decorated with
    # `keras_serializable`.
    config_class = LxmertConfig

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # Keep the configuration and copy the settings read from it onto the instance.
        self.config = config
        self.num_l_layers = config.l_layers
        self.num_x_layers = config.x_layers
        self.num_r_layers = config.r_layers
        self.initializer_range = config.initializer_range
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.return_dict = config.use_return_dict
        # Sub-layers.
        self.embeddings = TFLxmertEmbeddings(config, name="embeddings")
        self.encoder = TFLxmertEncoder(config, name="encoder")
        self.pooler = TFLxmertPooler(config, name="pooler")
        # NOTE(review): redundant, `self.config` was already assigned at the top of this method.
        self.config = config

    def get_input_embeddings(self):
        # Return the embeddings sub-layer itself.
        return self.embeddings

    def set_input_embeddings(self, value):
        # Replace the embeddings weight and set the vocabulary size to the first dimension of `value`.
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        # Head pruning is not implemented for this layer.
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        visual_feats=None,
        visual_pos=None,
        attention_mask=None,
        visual_attention_mask=None,
        token_type_ids=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        # Forward pass of the main layer; the `unpack_inputs` decorator is applied to it.
        # NOTE(review): the body of this method is not present in this excerpt, so its behaviour cannot be
        # documented from here. Restore it from the original file before using this code.

    def build(self, input_shape=None):
        # Return immediately when the layer has already been built.
        if self.built:
            return
        self.built = True

        # Build the embeddings, encoder and pooler sub-layers, each under a name scope equal to its own name.
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "pooler", None) is not None:
            with tf.name_scope(self.pooler.name):
                self.pooler.build(None)


class TFLxmertPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = LxmertConfig
    base_model_prefix = "lxmert"

    @property
    def dummy_inputs(self):
        """
        Dummy inputs to build the network.

        Returns:
            A dict mapping `"input_ids"`, `"visual_feats"` and `"visual_pos"` to dummy tensors.
        """
        batch_size = 2
        num_visual_features = 10
        # Two token sequences of length three, matching `batch_size`.
        input_ids = tf.constant([[3, 5, 6], [2, 3, 4]], dtype=tf.int32)
        visual_feats = tf.random.uniform((batch_size, num_visual_features, self.config.visual_feat_dim))
        visual_pos = tf.random.uniform((batch_size, num_visual_features, 4))

        return {
            "input_ids": input_ids,
            "visual_feats": visual_feats,
            "visual_pos": visual_pos,
        }

    # Tutorial note: the code up to this point defines the custom Keras layer `TFLxmertMainLayer` and the
    # abstract class `TFLxmertPreTrainedModel`, which handle the main layer of the LXMERT model and the
    # initialization / dummy inputs of the pretrained models respectively.
    @property
    def input_signature(self):
        """
        Tensor specifications of the model inputs.

        Returns:
            A dict mapping each input name to a `tf.TensorSpec`. Batch and sequence dimensions are left as
            `None`; only the last dimension of `visual_feats` (`config.visual_feat_dim`) and of
            `visual_pos` (4) is fixed.
        """
        return {
            # Integer inputs, rank 2.
            "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
            "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
            # Float inputs, rank 3.
            "visual_feats": tf.TensorSpec((None, None, self.config.visual_feat_dim), tf.float32, name="visual_feats"),
            "visual_pos": tf.TensorSpec((None, None, 4), tf.float32, name="visual_pos"),
            # Integer inputs, rank 2.
            "visual_attention_mask": tf.TensorSpec((None, None), tf.int32, name="visual_attention_mask"),
            "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"),
        }
# Shared docstring for the LXMERT model classes: describes where the model comes from, what it was
# pre-trained on, the accepted input formats and the `config` parameter.
LXMERT_START_DOCSTRING = r"""

    The LXMERT model was proposed in [LXMERT: Learning Cross-Modality Encoder Representations from
    Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. It's a vision and language transformer
    model, pre-trained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual
    genome, using a combination of masked language modeling, region of interest feature regression, cross entropy loss
    for question answering attribute prediction, and object tag prediction.

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`LxmertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring for the model inputs. It is empty in this copy of the file.
LXMERT_INPUTS_DOCSTRING = r"""
"""


@add_start_docstrings(
    "The bare Lxmert Model transformer outputting raw hidden-states without any specific head on top.",
    LXMERT_START_DOCSTRING,
)
# Bare LXMERT model: a thin wrapper that delegates everything to a `TFLxmertMainLayer` named "lxmert".
# (Documented with comments rather than a docstring because `add_start_docstrings` generates the docstring.)
class TFLxmertModel(TFLxmertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        # Forward the config and any extra arguments to the parent class, then create the main layer.
        super().__init__(config, *inputs, **kwargs)
        self.lxmert = TFLxmertMainLayer(config, name="lxmert")

    # The three decorators below are the ones the original comments on this method describe: input
    # unpacking, the model-input docstring, and the code-sample docstring (checkpoint, output type, config
    # class). No docstring is written here because the decorators generate it.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFLxmertModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        visual_feats: tf.Tensor | None = None,
        visual_pos: tf.Tensor | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        visual_attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple, TFLxmertModelOutput]:
        # Delegate to the main layer; arguments are passed positionally in the main layer's own order.
        outputs = self.lxmert(
            input_ids,
            visual_feats,
            visual_pos,
            attention_mask,
            visual_attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict,
            training,
        )

        return outputs

    def build(self, input_shape=None):
        # Build the main layer once, under a name scope equal to its own name.
        if self.built:
            return
        self.built = True
        if getattr(self, "lxmert", None) is not None:
            with tf.name_scope(self.lxmert.name):
                self.lxmert.build(None)
# 定义一个自定义层 TFLxmertPooler，继承自 keras 的 Layer 类
class TFLxmertPooler(keras.layers.Layer):
    """Pool a sequence by passing its first position through a tanh-activated dense layer."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        kernel_init = get_initializer(config.initializer_range)
        # Dense projection with `config.hidden_size` units and tanh activation.
        self.dense = keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=kernel_init,
            activation="tanh",
            name="dense",
        )
        self.config = config

    def call(self, hidden_states):
        """Return the dense projection of `hidden_states[:, 0]`."""
        # "Pooling" here simply means taking the entry at index 0 along the second axis.
        return self.dense(hidden_states[:, 0])

    def build(self, input_shape=None):
        """Build the dense layer once, under a name scope equal to its own name."""
        if self.built:
            return
        self.built = True
        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            dense.build([None, None, self.config.hidden_size])


# 从 transformers 库中复制的类，用于 Lxmert 模型的预测头转换
# 基于 TFBertPredictionHeadTransform 修改
class TFLxmertPredictionHeadTransform(keras.layers.Layer):
    """Dense projection, activation and layer normalization applied before a prediction head."""

    def __init__(self, config: LxmertConfig, **kwargs):
        super().__init__(**kwargs)

        # Dense layer with `config.hidden_size` units.
        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # `config.hidden_act` may be an activation name (resolved here) or already a callable.
        hidden_act = config.hidden_act
        self.transform_act_fn = get_tf_activation(hidden_act) if isinstance(hidden_act, str) else hidden_act

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply dense -> activation -> layer normalization and return the result."""
        projected = self.dense(inputs=hidden_states)
        activated = self.transform_act_fn(projected)
        return self.LayerNorm(inputs=activated)

    def build(self, input_shape=None):
        """Build the dense and layer-norm sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        # Both sub-layers are built with the same input shape.
        for attr_name in ("dense", "LayerNorm"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, self.config.hidden_size])


# 从 transformers 库中复制的类，用于 Lxmert 模型的语言模型预测头
# 基于 TFBertLMPredictionHead 修改
class TFLxmertLMPredictionHead(keras.layers.Layer):
    """Language-modeling head: transform, projection onto the input embedding matrix, plus a bias.

    The projection reuses `input_embeddings.weight` (transposed); only the bias of size
    `config.vocab_size` is created by this layer itself.
    """

    def __init__(self, config: LxmertConfig, input_embeddings: keras.layers.Layer, **kwargs):
        """
        Args:
            config: Model configuration; `hidden_size` and `vocab_size` are read from it.
            input_embeddings: Layer whose `weight` attribute is used as the output projection.
            **kwargs: Forwarded to the parent class constructor.
        """
        super().__init__(**kwargs)

        self.config = config
        self.hidden_size = config.hidden_size

        self.transform = TFLxmertPredictionHeadTransform(config, name="transform")

        # The output weights are the same as the input embeddings, but there is an output-only bias for
        # each token.
        self.input_embeddings = input_embeddings

    def build(self, input_shape=None):
        """Create the bias weight and build the transform sub-layer, exactly once."""
        # The guard comes first so that a repeated `build` call cannot add a second "bias" weight and
        # rebind `self.bias` to it.
        if self.built:
            return
        self.built = True

        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

        if getattr(self, "transform", None) is not None:
            with tf.name_scope(self.transform.name):
                self.transform.build(None)

    def get_output_embeddings(self) -> keras.layers.Layer:
        """Return the layer that provides the output projection weights."""
        return self.input_embeddings

    def set_output_embeddings(self, value: tf.Variable):
        """Replace the shared embedding weight and set its vocabulary size to the first dimension of `value`."""
        self.input_embeddings.weight = value
        self.input_embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self) -> Dict[str, tf.Variable]:
        """Return the bias as a one-entry dict keyed by `"bias"`."""
        return {"bias": self.bias}

    def set_bias(self, value: tf.Variable):
        """Replace the bias with `value["bias"]` and update `config.vocab_size` to its length."""
        self.bias = value["bias"]
        self.config.vocab_size = shape_list(value["bias"])[0]

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return prediction scores with last dimension `config.vocab_size`."""
        hidden_states = self.transform(hidden_states=hidden_states)
        seq_length = shape_list(hidden_states)[1]
        # Flatten to 2D for the matmul against the transposed embedding matrix, then restore the
        # sequence dimension.
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)

        return hidden_states
# 从 transformers.models.bert.modeling_tf_bert.TFBertMLMHead 复制而来，将 Bert 替换为 Lxmert
class TFLxmertMLMHead(keras.layers.Layer):
    """Masked-language-modeling head: a thin wrapper around `TFLxmertLMPredictionHead`."""

    def __init__(self, config: LxmertConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)

        # The wrapped prediction head, named "predictions".
        self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions")

    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
        """Return the prediction scores computed by the wrapped head for `sequence_output`."""
        return self.predictions(hidden_states=sequence_output)

    def build(self, input_shape=None):
        """Build the wrapped head once, under a name scope equal to its own name."""
        if self.built:
            return
        self.built = True
        head = getattr(self, "predictions", None)
        if head is None:
            return
        with tf.name_scope(head.name):
            head.build(None)


# Lxmert 的预训练头部，包含语言模型预测和序列关系预测
class TFLxmertPreTrainingHeads(keras.layers.Layer):
    """Pre-training heads: a language-modeling head plus a two-unit dense layer on the pooled output."""

    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)

        # Language-modeling prediction head.
        self.predictions = TFLxmertLMPredictionHead(config, input_embeddings, name="predictions")

        # Dense layer with 2 output units, applied to the pooled output.
        kernel_init = get_initializer(config.initializer_range)
        self.seq_relationship = keras.layers.Dense(2, kernel_initializer=kernel_init, name="seq_relationship")
        self.config = config

    def call(self, sequence_output, pooled_output):
        """Return `(prediction_scores, seq_relationship_score)`."""
        lm_scores = self.predictions(sequence_output)
        relationship_scores = self.seq_relationship(pooled_output)
        return lm_scores, relationship_scores

    def build(self, input_shape=None):
        """Build both heads once, each under a name scope equal to its own name."""
        if self.built:
            return
        self.built = True

        lm_head = getattr(self, "predictions", None)
        if lm_head is not None:
            with tf.name_scope(lm_head.name):
                lm_head.build(None)

        relationship_head = getattr(self, "seq_relationship", None)
        if relationship_head is not None:
            with tf.name_scope(relationship_head.name):
                relationship_head.build([None, None, self.config.hidden_size])


# Lxmert 的视觉回答头部，用于分类问题的预测
class TFLxmertVisualAnswerHead(keras.layers.Layer):
    """Answer-classification head: dense -> GELU -> layer norm -> dense.

    The first dense layer has `2 * config.hidden_size` units and the last one has `num_labels` units.
    """

    def __init__(self, config, num_labels, **kwargs):
        super().__init__(**kwargs)
        hid_dim = config.hidden_size

        # First projection: `hid_dim * 2` units.
        self.dense = keras.layers.Dense(
            hid_dim * 2,
            kernel_initializer=get_initializer(config.initializer_range),
            name="logit_fc_._0",
        )
        self.activation = get_tf_activation("gelu")
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="logit_fc_._2")

        # Final projection: `num_labels` units.
        self.dense_1 = keras.layers.Dense(
            num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="logit_fc_._3",
        )
        self.hid_dim = hid_dim

    def call(self, hidden_states):
        """Apply the four stages in order and return the output of the final dense layer."""
        expanded = self.activation(self.dense(hidden_states))
        normalized = self.layer_norm(expanded)
        return self.dense_1(normalized)

    def build(self, input_shape=None):
        """Build the three weighted sub-layers once, each under a name scope equal to its own name."""
        if self.built:
            return
        self.built = True

        # (attribute name, input shape) pairs, in the original build order. Note that the layer norm is
        # built with a rank-2 shape while the dense layers use rank-3 shapes, as in the original code.
        build_plan = (
            ("dense", [None, None, self.hid_dim]),
            ("layer_norm", [None, self.hid_dim * 2]),
            ("dense_1", [None, None, self.hid_dim * 2]),
        )
        for attr_name, shape in build_plan:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(shape)
# 定义一个自定义的 Keras 层，用于处理 Lxmert 模型的视觉对象预测任务
class TFLxmertVisualObjHead(keras.layers.Layer):
    """Visual pre-training head with one dense decoder per visual loss enabled in the config.

    `call` applies a `TFLxmertPredictionHeadTransform` to the hidden states and feeds the result
    to every decoder, returning a dict keyed by loss name ("obj", "attr" and/or "feat").
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        # Transform applied to the hidden states before any decoder.
        self.transform = TFLxmertPredictionHeadTransform(config, name="transform")

        # Register the visual losses enabled in the config. "num" is the output width of the
        # matching decoder. "shape" is not read inside this class (presumably the reshape target
        # for the corresponding labels -- confirm against the loss computation).
        visual_losses = {}
        if config.visual_obj_loss:
            visual_losses["obj"] = {"shape": (-1,), "num": config.num_object_labels}
        if config.visual_attr_loss:
            visual_losses["attr"] = {"shape": (-1,), "num": config.num_attr_labels}
        if config.visual_feat_loss:
            # Use the configured feature width instead of a hard-coded 2048 so that "shape" always
            # agrees with "num" and with the entry built in TFLxmertForPreTraining.__init__.
            visual_losses["feat"] = {"shape": (-1, config.visual_feat_dim), "num": config.visual_feat_dim}
        self.visual_losses = visual_losses

        # One dense decoder per enabled loss, each with its own kernel.
        self.decoder_dict = {
            key: keras.layers.Dense(
                self.visual_losses[key]["num"],
                kernel_initializer=get_initializer(config.initializer_range),
                name=f"decoder_dict.{key}",
            )
            for key in self.visual_losses
        }
        self.config = config

    def call(self, hidden_states):
        """Return `{loss_name: decoder(transform(hidden_states))}` for every enabled visual loss."""
        hidden_states = self.transform(hidden_states)
        return {key: self.decoder_dict[key](hidden_states) for key in self.visual_losses}

    def build(self, input_shape=None):
        """Build the transform and every decoder inside their own name scopes.

        Does nothing when the layer is already built. `input_shape` is not used.
        """
        if self.built:
            return
        self.built = True
        # Build the transform layer if the attribute exists.
        if getattr(self, "transform", None) is not None:
            with tf.name_scope(self.transform.name):
                self.transform.build(None)
        # Build every decoder if the dictionary exists.
        if getattr(self, "decoder_dict", None) is not None:
            for layer in self.decoder_dict.values():
                with tf.name_scope(layer.name):
                    # Each decoder is built for an input whose last dimension is config.hidden_size.
                    layer.build([None, None, self.config.hidden_size])


@add_start_docstrings("""Lxmert Model with a `language modeling` head on top.""", LXMERT_START_DOCSTRING)
class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
    # Placeholder statement only: the constructor, the forward-pass signature and the `build` method follow below.
    pass
    # 初始化方法，用于创建一个新的实例
    def __init__(self, config, *inputs, **kwargs):
        """Create the LXMERT backbone, the pre-training heads and the loss bookkeeping.

        Args:
            config: Model configuration; the task flags, label counts and visual-loss switches are read from it.
            *inputs, **kwargs: Forwarded unchanged to the parent constructor.
        """
        # Let the parent class initialise itself first.
        super().__init__(config, *inputs, **kwargs)

        # Keep the configuration on the instance.
        self.config = config
        # Number of question-answering labels.
        self.num_qa_labels = config.num_qa_labels
        # Normaliser for the visual losses, copied from the config.
        self.visual_loss_normalizer = config.visual_loss_normalizer

        # Flags selecting which pre-training tasks are enabled.
        self.task_mask_lm = config.task_mask_lm
        self.task_obj_predict = config.task_obj_predict
        self.task_matched = config.task_matched
        self.task_qa = config.task_qa

        # LXMERT backbone.
        self.lxmert = TFLxmertMainLayer(config, name="lxmert")

        # Pre-training heads; they receive the backbone's embeddings object.
        self.cls = TFLxmertPreTrainingHeads(config, self.lxmert.embeddings, name="cls")
        # The object-prediction head exists only when that task is enabled.
        if self.task_obj_predict:
            self.obj_predict_head = TFLxmertVisualObjHead(config, name="obj_predict_head")
        # The answer head exists only when the question-answering task is enabled.
        if self.task_qa:
            self.answer_head = TFLxmertVisualAnswerHead(config, self.num_qa_labels, name="answer_head")

        # Loss functions, looked up by the key stored in each visual-loss entry below.
        self.loss_fcts = {
            "l2": keras.losses.Huber(delta=1.0, name="huber_loss"),  # Huber loss, registered under the key "l2"
            "visn_ce": keras.losses.SparseCategoricalCrossentropy(from_logits=True),  # sparse categorical cross-entropy on logits
            "ce": keras.losses.SparseCategoricalCrossentropy(from_logits=True),  # sparse categorical cross-entropy on logits
        }

        # Description of every visual loss enabled in the config.
        visual_losses = {}
        # Object-label loss.
        if config.visual_obj_loss:
            visual_losses["obj"] = {
                "shape": (-1,),  # one-dimensional
                "num": config.num_object_labels,  # number of object labels
                "loss": "visn_ce",  # key into self.loss_fcts
            }
        # Attribute-label loss.
        if config.visual_attr_loss:
            visual_losses["attr"] = {
                "shape": (-1,),  # one-dimensional
                "num": config.num_attr_labels,  # number of attribute labels
                "loss": "visn_ce",  # key into self.loss_fcts
            }
        # Visual-feature loss.
        if config.visual_feat_loss:
            visual_losses["feat"] = {
                "shape": (-1, config.visual_feat_dim),  # two-dimensional, last dim is the feature width
                "num": config.visual_feat_dim,  # feature width
                "loss": "l2",  # key into self.loss_fcts (the Huber loss)
            }
        # Keep the visual-loss description on the instance.
        self.visual_losses = visual_losses
    @unpack_inputs
    @add_start_docstrings_to_model_forward(LXMERT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFLxmertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    # 使用装饰器对 call 方法进行功能增强和文档替换
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        visual_feats: tf.Tensor | None = None,
        visual_pos: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        visual_attention_mask: tf.Tensor | None = None,
        token_type_ids: tf.Tensor | None = None,
        inputs_embeds: tf.Tensor | None = None,
        masked_lm_labels: tf.Tensor | None = None,
        obj_labels: Dict[str, Tuple[tf.Tensor, tf.Tensor]] | None = None,
        matched_label: tf.Tensor | None = None,
        ans: tf.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        training: bool = False,
        # 定义 call 方法的参数，包括输入的张量和布尔值控制标志


这段代码定义了一个 `call` 方法，用于执行模型的前向传播操作。方法中使用了装饰器来增强其功能和修改返回文档。
    # 定义模型构建方法，如果已经构建过则直接返回
    def build(self, input_shape=None):
        """Build every sub-module inside a name scope named after that sub-module.

        Returns immediately when `self.built` is already true; otherwise sets the flag and
        builds `lxmert`, `cls`, `obj_predict_head` and `answer_head` in that order, skipping
        any attribute that is missing or None. `input_shape` is not used.
        """
        if self.built:
            return
        self.built = True

        for attr_name in ("lxmert", "cls", "obj_predict_head", "answer_head"):
            submodule = getattr(self, attr_name, None)
            if submodule is None:
                continue
            with tf.name_scope(submodule.name):
                submodule.build(None)

`.\models\lxmert\tokenization_lxmert.py`

# coding=utf-8
# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections  # 引入 collections 模块，用于 OrderedDict 的创建
import os  # 引入 os 模块，用于操作系统相关功能
import unicodedata  # 引入 unicodedata 模块，用于 Unicode 数据库中的字符属性查询
from typing import List, Optional, Tuple  # 引入类型提示相关的工具

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace  # 引入 LxmertTokenizer 所需的模块
from ...utils import logging  # 引入 logging 模块，用于日志记录

logger = logging.get_logger(__name__)  # module-level logger

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}  # file name used for the vocabulary file

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt",
    }
}  # checkpoint name -> URL of its vocabulary file

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "unc-nlp/lxmert-base-uncased": 512,
}  # checkpoint name -> size; exposed on the tokenizer class as `max_model_input_sizes`

PRETRAINED_INIT_CONFIGURATION = {
    "unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
}  # checkpoint name -> default constructor arguments


# Copied from transformers.models.bert.tokenization_bert.load_vocab
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary.

    Each line of the UTF-8 file is one token (only the trailing newline is removed) and is
    mapped to its zero-based line number. Returns an `OrderedDict` of token -> index.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for line_number, line in enumerate(reader):
            vocab[line.rstrip("\n")] = line_number
    return vocab


# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text.

    Returns the whitespace-separated pieces of `text`, or an empty list when nothing is
    left after stripping.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []


# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with bert-base-cased->unc-nlp/lxmert-base-uncased, BERT->Lxmert, BertTokenizer->LxmertTokenizer
class LxmertTokenizer(PreTrainedTokenizer):
    r"""
    Construct a Lxmert tokenizer. Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file: one token per line, a token's id being its zero-based line number.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Passed to [`BasicTokenizer`]: whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to run [`BasicTokenizer`] before WordPiece.
        never_split (`Iterable`, *optional*):
            Passed to [`BasicTokenizer`]: tokens that are never split. Only has an effect when
            `do_basic_tokenize=True`.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            Token used for anything that is not in the vocabulary.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            Separator token; closes each sequence in `build_inputs_with_special_tokens`.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            Padding token; forwarded to the parent constructor.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            Token placed first by `build_inputs_with_special_tokens`.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            Mask token; forwarded to the parent constructor.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Passed to [`BasicTokenizer`]: whether or not to tokenize Chinese characters.
        strip_accents (`bool`, *optional*):
            Passed to [`BasicTokenizer`]: whether or not to strip accents.
    """

    # Class-level tables taken from the module constants.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # Fail early with a helpful message when the vocabulary file does not exist.
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = LxmertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # token -> id, and the reverse id -> token mapping.
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        # `basic_tokenizer` only exists when basic tokenization is requested.
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        # NOTE(review): the vocabulary and sub-tokenizers are created before the parent constructor runs,
        # presumably because the parent may already use them -- confirm before reordering.
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    def do_lower_case(self):
        """Lowercasing flag of the basic tokenizer (which is only created when `do_basic_tokenize` is true)."""
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the base vocabulary merged with `added_tokens_encoder` as a plain dict."""
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text, split_special_tokens=False):
        """Split `text` into wordpieces, optionally running the basic tokenizer first."""
        split_tokens = []
        if self.do_basic_tokenize:
            # Special tokens are protected from splitting unless the caller asks to split them.
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                # Tokens the basic tokenizer never splits are kept whole; everything else goes through WordPiece.
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Unknown tokens map to the id of the unknown token.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Unknown ids map to the unknown token.
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Joining with spaces and deleting " ##" glues continuation pieces back onto their word.
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A Lxmert sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        # Inputs that already contain special tokens are handled by the parent implementation.
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Otherwise mirror the layout produced by `build_inputs_with_special_tokens`.
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Lxmert sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Single sequence: zeros covering `[CLS] X [SEP]`.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # Pair: zeros for `[CLS] A [SEP]`, ones for `B [SEP]`.
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary to a file, one token per line, in increasing id order.

        Args:
            save_directory (`str`):
                An existing directory, in which case the file is named after `VOCAB_FILES_NAMES["vocab_file"]`;
                any other value is used as the file path itself.
            filename_prefix (`str`, *optional*):
                When given, `filename_prefix + "-"` is prepended to the file name (or to the path string when
                `save_directory` is not a directory).

        Returns:
            `Tuple[str]`: One-element tuple holding the path of the written file.
        """
        index = 0

        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory

        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # Warn whenever the ids skip a value, then resynchronise the expected id.
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return (vocab_file,)
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        """Store the options; `never_split` is kept as a set (empty when not given)."""
        self.never_split = set() if never_split is None else set(never_split)
        self.do_lower_case = do_lower_case
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # Tokens given for this call are added to the instance-level set; without any, the instance set is used.
        protected = self.never_split.union(set(never_split)) if never_split else self.never_split
        cleaned = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        if self.tokenize_chinese_chars:
            cleaned = self._tokenize_chinese_chars(cleaned)
        # NFC normalisation so that equivalent code point sequences compare equal.
        normalized = unicodedata.normalize("NFC", cleaned)

        lowercase = self.do_lower_case
        # With lowercasing on, accents are stripped unless `strip_accents` is exactly False;
        # with lowercasing off, they are stripped only when `strip_accents` is truthy.
        drop_accents = (self.strip_accents is not False) if lowercase else bool(self.strip_accents)

        pieces = []
        for word in whitespace_tokenize(normalized):
            if word not in protected:
                if lowercase:
                    word = word.lower()
                if drop_accents:
                    word = self._run_strip_accents(word)
            pieces.extend(self._run_split_on_punc(word, protected))

        # Re-split on whitespace after joining the processed pieces.
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # After NFD decomposition, accents are separate characters of category "Mn" and can be dropped.
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        # Nothing to do when splitting is disabled or the text is protected.
        if not self.do_split_on_punc:
            return [text]
        if never_split is not None and text in never_split:
            return [text]

        groups = []
        in_word = False
        for ch in text:
            if _is_punctuation(ch):
                # Every punctuation character becomes its own piece and ends the current word.
                groups.append([ch])
                in_word = False
            else:
                if not in_word:
                    groups.append([])
                in_word = True
                groups[-1].append(ch)

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        return "".join(f" {ch} " if self._is_chinese_char(ord(ch)) else ch for ch in text)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # Inclusive code point ranges treated as CJK by this tokenizer.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        kept = []
        for ch in text:
            cp = ord(ch)
            # Drop NUL, the replacement character and control characters.
            if cp == 0 or cp == 0xFFFD or _is_control(ch):
                continue
            # Any whitespace character becomes a single plain space.
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        """Store the vocabulary, the unknown token and the per-word length limit."""
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        pieces = []
        for word in whitespace_tokenize(text):
            length = len(word)
            # Words longer than the limit are replaced by the unknown token.
            if length > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue

            word_pieces = []
            start = 0
            while start < length:
                # Try the longest remaining substring first and shrink it from the right.
                for end in range(length, start, -1):
                    candidate = word[start:end]
                    if start > 0:
                        # Pieces that do not start the word carry the "##" prefix.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        break
                else:
                    # No prefix of the remainder is in the vocabulary: the whole word is unknown.
                    word_pieces = None
                    break
                word_pieces.append(candidate)
                start = end

            if word_pieces is None:
                pieces.append(self.unk_token)
            else:
                pieces.extend(word_pieces)
        return pieces

`.\models\lxmert\tokenization_lxmert_fast.py`

# coding=utf-8
# 引入必要的库和模块
import json  # 导入 json 库，用于处理 JSON 数据
from typing import List, Optional, Tuple  # 导入类型提示模块，用于类型标注

from tokenizers import normalizers  # 从 tokenizers 库中导入 normalizers 模块

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 导入预训练的 tokenizer 类
from .tokenization_lxmert import LxmertTokenizer  # 从当前目录导入 LxmertTokenizer 类

# File names used for the vocabulary file and the serialized tokenizer file.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# For each file kind: checkpoint name -> URL of that file.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt",
    },
    "tokenizer_file": {
        "unc-nlp/lxmert-base-uncased": (
            "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/tokenizer.json"
        ),
    },
}

# Checkpoint name -> size; exposed on the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "unc-nlp/lxmert-base-uncased": 512,
}

# Checkpoint name -> default constructor arguments.
PRETRAINED_INIT_CONFIGURATION = {
    "unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
}

# 从 transformers.models.bert.tokenization_bert_fast.BertTokenizerFast 复制而来，修改为 Lxmert 相关的类和文件名
class LxmertTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" Lxmert tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """

    # Class-level tables, all taken from the module-level constants defined above.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Slow (pure Python) counterpart of this tokenizer.
    slow_tokenizer_class = LxmertTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        """
        Forward every argument to the parent constructor, then make sure the backend normalizer agrees with the
        `do_lower_case`, `strip_accents` and `tokenize_chinese_chars` values passed here.

        Args:
            vocab_file: Vocabulary file path, passed positionally to the parent constructor.
            tokenizer_file: Serialized tokenizer file path, passed through unchanged.
            do_lower_case: Checked against / written to the normalizer's `lowercase` field and stored on the
                instance as `self.do_lower_case`.
            unk_token, sep_token, pad_token, cls_token, mask_token: Special token strings, passed through unchanged.
            tokenize_chinese_chars: Checked against / written to the normalizer's `handle_chinese_chars` field.
            strip_accents: Checked against / written to the normalizer's `strip_accents` field.
            **kwargs: Any further keyword arguments, passed through to the parent constructor.
        """
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # The loaded backend normalizer may carry settings that differ from the arguments given here. When any
        # of the three fields disagrees, rebuild a normalizer of the same type with the requested values.
        # Fields missing from the serialized state count as "already matching".
        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        if (
            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
        ):
            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
            normalizer_state["lowercase"] = do_lower_case
            normalizer_state["strip_accents"] = strip_accents
            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)

        self.do_lower_case = do_lower_case

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A Lxmert sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        if token_ids_1 is not None:
            output += token_ids_1 + [self.sep_token_id]

        return output

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Lxmert sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Single sequence: `[CLS] X [SEP]` is all zeros.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # Pair: zeros over `[CLS] A [SEP]`, ones over `B [SEP]`.
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the backend tokenizer model into `save_directory`.

        Args:
            save_directory (`str`): Directory handed to the backend model's `save`.
            filename_prefix (`str`, *optional*): Forwarded to the backend as `name`.

        Returns:
            `Tuple[str]`: The file paths reported by the backend `save` call.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

`.\models\lxmert\__init__.py`

# 引入必要的类型检查模块
from typing import TYPE_CHECKING

# 从相对路径导入必要的实用工具和模块
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Import table: submodule name -> public names it provides. The two base entries are registered
# unconditionally; the optional ones below are added only when their backend is available.
_import_structure = {
    "configuration_lxmert": ["LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LxmertConfig"],
    "tokenization_lxmert": ["LxmertTokenizer"],
}

# Fast tokenizer: registered only when is_tokenizers_available() is true. Otherwise the
# OptionalDependencyNotAvailable raised here is caught immediately and the entry is left out.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_lxmert_fast"] = ["LxmertTokenizerFast"]

# PyTorch models: registered only when is_torch_available() is true.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_lxmert"] = [
        "LxmertEncoder",
        "LxmertForPreTraining",
        "LxmertForQuestionAnswering",
        "LxmertModel",
        "LxmertPreTrainedModel",
        "LxmertVisualFeatureEncoder",
        "LxmertXLayer",
    ]

# TensorFlow models: registered only when is_tf_available() is true.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_lxmert"] = [
        "TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFLxmertForPreTraining",
        "TFLxmertMainLayer",
        "TFLxmertModel",
        "TFLxmertPreTrainedModel",
        "TFLxmertVisualFeatureEncoder",
    ]

# Under static type checking, import the real names eagerly so type checkers can resolve them.
# The same availability guards as for `_import_structure` are applied.
if TYPE_CHECKING:
    from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
    from .tokenization_lxmert import LxmertTokenizer

    # Fast tokenizer: imported only when the `tokenizers` backend is available.
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_lxmert_fast import LxmertTokenizerFast

    # PyTorch models: imported only when torch is available.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_lxmert import (
            LxmertEncoder,
            LxmertForPreTraining,
            LxmertForQuestionAnswering,
            LxmertModel,
            LxmertPreTrainedModel,
            LxmertVisualFeatureEncoder,
            LxmertXLayer,
        )

    # TensorFlow models: imported only when TensorFlow is available.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_lxmert import (
            TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFLxmertForPreTraining,
            TFLxmertMainLayer,
            TFLxmertModel,
            TFLxmertPreTrainedModel,
            TFLxmertVisualFeatureEncoder,
        )
else:
    # Runtime path (TYPE_CHECKING is false): nothing is imported eagerly here.
    import sys

    # Replace this module's entry in sys.modules with a _LazyModule built from `_import_structure`.
    # Presumably the listed submodules are then imported on first attribute access; see _LazyModule
    # for the exact behaviour.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\m2m_100\configuration_m2m_100.py`

# coding=utf-8
# 定义了文件的编码格式为 UTF-8

# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
# 版权声明，保留所有权利

# Licensed under the Apache License, Version 2.0 (the "License");
# 依据 Apache License, Version 2.0 授权许可，详细条款可在此获取：http://www.apache.org/licenses/LICENSE-2.0

# You may obtain a copy of the License at
# 可在上述网址获取许可证副本

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 除非适用法律要求或书面同意，本软件按"原样"分发，不附带任何明示或暗示的担保或条件。
# 详细信息请参阅许可证

""" M2M100 model configuration"""
# M2M100 模型配置

from collections import OrderedDict
# 导入 OrderedDict 数据结构

from typing import Any, Mapping, Optional
# 导入类型提示

from ... import PreTrainedTokenizer
# 导入预训练的 Tokenizer

from ...configuration_utils import PretrainedConfig
# 导入配置工具中的预训练配置

from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
# 导入 ONNX 相关配置

from ...onnx.utils import compute_effective_axis_dimension
# 导入计算有效轴维度的工具函数

from ...utils import TensorType, is_torch_available, logging
# 导入工具函数：张量类型、是否可用 Torch、日志记录

logger = logging.get_logger(__name__)
# Module-level logger.

# Checkpoint name -> URL of its config.json.
M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/config.json",
    # See all M2M100 models at https://huggingface.co/models?filter=m2m_100
}


class M2M100Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`M2M100Model`]. It is used to instantiate an
    M2M100 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the M2M100
    [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```
    >>> from transformers import M2M100Config, M2M100Model

    >>> # Initializing a M2M100 facebook/m2m100_418M style configuration
    >>> configuration = M2M100Config()

    >>> # Initializing a model (with random weights) from the facebook/m2m100_418M style configuration
    >>> model = M2M100Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # Identifier string of this configuration family.
    model_type = "m2m_100"

    # Output keys listed for exclusion at inference time (consumed outside this file).
    keys_to_ignore_at_inference = ["past_key_values"]

    # Generic attribute name -> attribute of this class that holds the value. The aliasing itself is
    # presumably implemented by PretrainedConfig; it is not visible in this file.
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    # Store every hyper-parameter on the instance, then hand the token ids, the encoder-decoder flag
    # and all remaining keyword arguments to PretrainedConfig.__init__.
    def __init__(
        self,
        vocab_size=128112,  # vocabulary size
        max_position_embeddings=1024,  # maximum number of positions
        encoder_layers=12,  # number of encoder layers
        encoder_ffn_dim=4096,  # encoder feed-forward dimension
        encoder_attention_heads=16,  # attention heads per encoder layer
        decoder_layers=12,  # number of decoder layers
        decoder_ffn_dim=4096,  # decoder feed-forward dimension
        decoder_attention_heads=16,  # attention heads per decoder layer
        encoder_layerdrop=0.05,  # encoder layerdrop value
        decoder_layerdrop=0.05,  # decoder layerdrop value
        use_cache=True,  # cache flag
        is_encoder_decoder=True,  # forwarded to the parent constructor
        activation_function="relu",  # activation function name
        d_model=1024,  # model dimension (aliased as `hidden_size` in attribute_map)
        dropout=0.1,  # general dropout value
        attention_dropout=0.1,  # attention dropout value
        activation_dropout=0.0,  # activation dropout value
        init_std=0.02,  # initialization standard deviation
        decoder_start_token_id=2,  # forwarded to the parent constructor
        scale_embedding=True,  # embedding scaling flag
        pad_token_id=1,  # forwarded to the parent constructor
        bos_token_id=0,  # forwarded to the parent constructor
        eos_token_id=2,  # forwarded to the parent constructor
        **kwargs,  # any further options, forwarded to the parent constructor
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers  # mirrors the encoder layer count
        self.scale_embedding = scale_embedding

        # Everything not stored above is handled by the parent class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )
class M2M100OnnxConfig(OnnxSeq2SeqConfigWithPast):
    # ONNX export configuration for M2M100: declares the dynamic axes of the model inputs and
    # builds dummy inputs (optionally including past key/values) for the export.

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # Returns an ordered mapping: input name -> {axis index: symbolic axis name}.
        common_inputs = OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "encoder_sequence"}),
                ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
            ]
        )

        # With `use_past`, decoder_input_ids only gets a dynamic batch axis and the decoder mask
        # axis is named after past + current length; otherwise both use "decoder_sequence".
        if self.use_past:
            common_inputs["decoder_input_ids"] = {0: "batch"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
        else:
            common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}

        # With `use_past`, let the base class add the past key/value entries in place.
        if self.use_past:
            self.fill_with_past_key_values_(common_inputs, direction="inputs")
        return common_inputs

    # Copied from BartOnnxConfig._generate_dummy_inputs_for_sequence_classification_and_question_answering
    # A better name would be _generate_dummy_inputs_for_encoder_and_decoder because sequence classification and
    # question answering are not supported for M2M100, but the name is preserved so the copy can be checked
    # against BART's version and updated when needed.
    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        # Tokenizes `batch_size` dummy strings built from the tokenizer's unk token and returns
        # the tokenizer output as a plain dict.
        #
        # Copied from OnnxConfig.generate_dummy_inputs
        # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
        # For a dynamic axis (-1), use the fixed batch size OnnxConfig.default_fixed_batch to avoid
        # optimizations made by ONNX.
        batch_size = compute_effective_axis_dimension(
            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
        )

        # For a dynamic axis (-1), use the fixed length OnnxConfig.default_fixed_sequence, taking
        # into account the special tokens the tokenizer adds.
        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
        seq_length = compute_effective_axis_dimension(
            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
        )

        # Each dummy sample is the unk token repeated `seq_length` times. The join is over a
        # one-element list, so the repetitions are concatenated without separators.
        dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
        common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
        return common_inputs

    # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._generate_dummy_inputs_for_default_and_seq2seq_lm
    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
        ) -> Mapping[str, Any]:
        # Builds encoder and decoder dummy inputs and, with `use_past`, zero-filled past key/values.
        # Raises ValueError when `use_past` is set but PyTorch is not installed.
        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, seq_length, is_pair, framework
        )

        # With `use_past` the decoder is fed a single token; otherwise it uses the encoder length.
        decoder_seq_length = seq_length if not self.use_past else 1
        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, decoder_seq_length, is_pair, framework
        )
        # Prefix every decoder entry with "decoder_" and merge both dicts.
        decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
        common_inputs = dict(**encoder_inputs, **decoder_inputs)

        if self.use_past:
            # The past key/values are built as torch tensors, so PyTorch is required here.
            if not is_torch_available():
                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
            else:
                import torch

            batch, encoder_seq_length = common_inputs["input_ids"].shape
            decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
            num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
            # Shapes are (batch, heads, sequence length, hidden_size // heads).
            encoder_shape = (
                batch,
                num_encoder_attention_heads,
                encoder_seq_length,
                self._config.hidden_size // num_encoder_attention_heads,
            )
            # The decoder past is made 3 positions longer than the decoder input.
            decoder_past_length = decoder_seq_length + 3
            decoder_shape = (
                batch,
                num_decoder_attention_heads,
                decoder_past_length,
                self._config.hidden_size // num_decoder_attention_heads,
            )

            # Append `decoder_past_length` columns of ones to the decoder attention mask.
            # NOTE(review): torch.ones is called without a dtype here; confirm the resulting mask
            # dtype is what the export expects.
            common_inputs["decoder_attention_mask"] = torch.cat(
                [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
            )

            common_inputs["past_key_values"] = []

            # If the number of encoder and decoder layers are present in the model configuration, both are considered
            num_encoder_layers, num_decoder_layers = self.num_layers
            min_num_layers = min(num_encoder_layers, num_decoder_layers)
            max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
            remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"

            # For the first `min_num_layers` layers append a 4-tuple of zero tensors:
            # two with the decoder shape followed by two with the encoder shape.
            for _ in range(min_num_layers):
                common_inputs["past_key_values"].append(
                    (
                        torch.zeros(decoder_shape),
                        torch.zeros(decoder_shape),
                        torch.zeros(encoder_shape),
                        torch.zeros(encoder_shape),
                    )
                )

            # Remaining layers of the deeper side get a 2-tuple with that side's shape.
            # NOTE(review): `max_num_layers` already holds the layer-count difference, yet it is
            # used as the END of a range that starts at `min_num_layers`. When the difference is
            # not larger than `min_num_layers` this loop adds nothing. TODO confirm this matches
            # what fill_with_past_key_values_ declares before changing it.
            shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
            for _ in range(min_num_layers, max_num_layers):
                common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))

        return common_inputs
    # Expose the seq2seq generator under the name `generate_dummy_inputs`.
    generate_dummy_inputs = _generate_dummy_inputs_for_default_and_seq2seq_lm

`.\models\m2m_100\convert_m2m100_original_checkpoint_to_pytorch.py`

# 导入命令行参数解析库
import argparse

# 导入PyTorch库
import torch
from torch import nn

# 导入transformers库中的M2M100Config和M2M100ForConditionalGeneration类
from transformers import M2M100Config, M2M100ForConditionalGeneration


def remove_ignore_keys_(state_dict):
    """Delete a fixed set of keys from ``state_dict`` in place.

    Keys that are not present are skipped silently. Nothing is returned.
    """
    unwanted_keys = (
        "encoder.version",
        "decoder.version",
        "model.encoder.version",
        "model.decoder.version",
        "decoder.output_projection.weight",
        "_float_tensor",
        "encoder.embed_positions._float_tensor",
        "decoder.embed_positions._float_tensor",
    )
    for key in unwanted_keys:
        if key in state_dict:
            del state_dict[key]


def make_linear_from_emb(emb):
    """Build a bias-free linear layer whose weight is the given embedding's weight.

    Args:
        emb: Module with a 2-D ``weight`` of shape ``(vocab_size, emb_size)``.

    Returns:
        ``nn.Linear`` with ``in_features == emb_size`` and ``out_features == vocab_size``
        whose ``weight.data`` is the same tensor data as ``emb.weight.data``.
    """
    vocab_size, emb_size = emb.weight.shape
    # nn.Linear keeps its weight as (out_features, in_features), i.e. exactly the embedding's
    # (vocab_size, emb_size) layout. Declaring the layer as emb_size -> vocab_size keeps the
    # module's in_features/out_features consistent with the weight installed below.
    lin_layer = nn.Linear(emb_size, vocab_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer


def convert_fairseq_m2m100_checkpoint_from_disk(checkpoint_path):
    """Load a fairseq M2M100 checkpoint and return it as ``M2M100ForConditionalGeneration``.

    Args:
        checkpoint_path: Path of the fairseq checkpoint file on the local filesystem.

    Returns:
        A ``M2M100ForConditionalGeneration`` built from the checkpoint's hyper-parameters and
        loaded with its weights.

    Raises:
        KeyError: If the checkpoint has neither usable ``"args"`` nor a ``"cfg"`` / ``"model"`` entry.
    """
    # NOTE(security): torch.load unpickles the file, so only convert checkpoints from a trusted source.
    m2m_100 = torch.load(checkpoint_path, map_location="cpu")
    # Hyper-parameters are read from "args" when it is present and truthy, otherwise from
    # cfg["model"]. `.get` keeps the fallback reachable when the "args" key is missing altogether.
    args = m2m_100.get("args") or m2m_100["cfg"]["model"]
    state_dict = m2m_100["model"]
    remove_ignore_keys_(state_dict)
    # The vocabulary size is taken from the first dimension of the encoder token embedding.
    vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]

    config = M2M100Config(
        vocab_size=vocab_size,
        max_position_embeddings=1024,
        encoder_layers=args.encoder_layers,
        decoder_layers=args.decoder_layers,
        encoder_attention_heads=args.encoder_attention_heads,
        decoder_attention_heads=args.decoder_attention_heads,
        encoder_ffn_dim=args.encoder_ffn_embed_dim,
        decoder_ffn_dim=args.decoder_ffn_embed_dim,
        d_model=args.encoder_embed_dim,
        encoder_layerdrop=args.encoder_layerdrop,
        decoder_layerdrop=args.decoder_layerdrop,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        activation_function="relu",
    )

    # The converted model looks up its shared embedding under "shared.weight"; reuse the decoder
    # token embedding for it.
    state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
    model = M2M100ForConditionalGeneration(config)
    # strict=False: keys that are missing or unexpected do not raise.
    model.model.load_state_dict(state_dict, strict=False)
    # The LM head is rebuilt from the shared embedding.
    model.lm_head = make_linear_from_emb(model.model.shared)

    return model


# Script entry point: convert one fairseq checkpoint and write the result to a folder.
if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    # Both arguments are positional, so both must be given on the command line.
    cli.add_argument("fairseq_path", type=str, help="path to a model.pt on local filesystem.")
    cli.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    options = cli.parse_args()

    # Convert the checkpoint, then save the converted model into the requested folder.
    converted = convert_fairseq_m2m100_checkpoint_from_disk(options.fairseq_path)
    converted.save_pretrained(options.pytorch_dump_folder_path)

`.\models\m2m_100\modeling_m2m_100.py`

# coding=utf-8
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch M2M100 model."""

import math  # 导入数学库
from typing import List, Optional, Tuple, Union  # 导入类型提示相关的模块

import torch  # 导入PyTorch库
from torch import nn  # 导入神经网络模块
from torch.nn import CrossEntropyLoss  # 导入交叉熵损失函数

from ...activations import ACT2FN  # 导入激活函数相关的模块
from ...integrations.deepspeed import is_deepspeed_zero3_enabled  # 导入DeepSpeed相关的模块
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask  # 导入处理注意力掩码相关的函数
from ...modeling_outputs import (  # 导入模型输出相关的类
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型相关的工具函数
from ...utils import (  # 导入工具函数
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_m2m_100 import M2M100Config  # 导入M2M100模型的配置

logger = logging.get_logger(__name__)  # module-level logger

_CONFIG_FOR_DOC = "M2M100Config"  # configuration class name used in the docs
_CHECKPOINT_FOR_DOC = "facebook/m2m100_418M"  # checkpoint name used in the docs

M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST = [  # names of the pretrained M2M100 checkpoints
    "facebook/m2m100_418M",
    # See all M2M100 models at https://huggingface.co/models?filter=m2m_100
]


# Derived from transformers.models.bart.modeling_bart.shift_tokens_right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.

    Column 0 of the result is `decoder_start_token_id`; the remaining columns are `input_ids` without its last
    column. Any -100 left in the result is replaced by `pad_token_id`. The input tensor is not modified.

    Raises:
        ValueError: If `pad_token_id` is `None`.
    """
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1].clone()

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")

    # -100 is the value being replaced by the pad id here.
    ignored_positions = shifted.eq(-100)
    shifted.masked_fill_(ignored_positions, pad_token_id)
    return shifted


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1 (plus
    `past_key_values_length`); padding symbols keep the value `padding_idx`.
    """
    # 1 where the token differs from padding_idx, 0 elsewhere.
    not_pad = input_ids.ne(padding_idx).int()
    # Running count of non-padding tokens along dim 1, in the mask's dtype.
    running_count = torch.cumsum(not_pad, dim=1).type_as(not_pad)
    # Offset by the past length, then zero the padding positions again.
    positions = (running_count + past_key_values_length) * not_pad
    return positions.long() + padding_idx
class M2M100SinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        # Extra rows added on top of the requested number of positions whenever the table is (re)built
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """(Re)build the embedding table and store it in the non-persistent buffer `weights`."""
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # When the table is rebuilt (e.g. from `forward`), keep the dtype and device of the existing buffer
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".

        Args:
            num_embeddings: number of rows (positions) in the table.
            embedding_dim: width of each row. The first half of the columns holds sines, the second half cosines;
                an odd width gets one trailing zero column.
            padding_idx: if given, that row is set to zero.

        Returns:
            A tensor of shape `(num_embeddings, embedding_dim)` in the default torch dtype.
        """
        half_dim = embedding_dim // 2
        # With a single frequency (embedding_dim of 2 or 3) `half_dim - 1` is zero; clamp the divisor to 1 as
        # tensor2tensor does. The increment is only ever multiplied by index 0 in that case, so the result is
        # unaffected, and larger dimensions are computed exactly as before.
        emb = math.log(10000) / max(half_dim - 1, 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(
        self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
    ):
        """
        Look up the positional embeddings for a batch.

        Position ids are derived from `input_ids` when given (padding tokens keep the padding position),
        otherwise sequential ids are generated from the shape of `inputs_embeds`. The table is rebuilt on the
        fly when the requested positions exceed its current size. `self.padding_idx` is used arithmetically
        here, so it must not be `None` when this method is called.

        Returns:
            A detached tensor of shape `(batch_size, seq_len, embedding_dim)`.
        """
        if input_ids is not None:
            bsz, seq_len = input_ids.size()
            # Create the position ids from the input token ids. Any padded tokens remain padded.
            position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
                input_ids.device
            )
        else:
            bsz, seq_len = inputs_embeds.size()[:-1]
            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)

        # Expand the embedding table if needed
        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()

    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor whose last dimension is dropped to obtain the target shape; the second
                dimension is taken as the sequence length.
            past_key_values_length: constant added to every generated position id.

        Returns: torch.Tensor of dtype long with the shape of `inputs_embeds` minus its last dimension.
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        # Sequential ids starting right after the padding index
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->M2M100
class M2M100Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[M2M100Config] = None,
    ):
        super().__init__()
        # Store the attention hyper-parameters
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        # `embed_dim` must be an exact multiple of `num_heads`, otherwise the per-head split is lossy
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # 1 / sqrt(head_dim); stored here, its use is not visible in this excerpt
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        # Linear projections for keys, values, queries and the output
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # View as (bsz, seq_len, num_heads, head_dim), then swap to (bsz, num_heads, seq_len, head_dim)
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Attention forward pass.

        NOTE(review): no statements follow this signature in the visible source, so the computation itself
        cannot be documented from here. `M2M100EncoderLayer.forward` unpacks three values from this call —
        TODO confirm the return contract against the full implementation.

        Args:
            hidden_states: input hidden states.
            key_value_states: optional key/value states.
            past_key_value: optional past key/value tensors.
            attention_mask: optional attention mask.
            layer_head_mask: optional per-layer head mask.
            output_attentions: whether to output the attention tensors.
        """


# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->M2M100, MBART->M2M100
class M2M100EncoderLayer(nn.Module):
    """Encoder layer: layer-normed self-attention and a layer-normed feed-forward block, each with a residual."""

    def __init__(self, config: M2M100Config):
        super().__init__()
        self.embed_dim = config.d_model

        # Self-attention; the concrete class is chosen by `config._attn_implementation`
        self.self_attn = M2M100_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        # The two linear layers of the feed-forward block
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)

        # Layer norm applied before the feed-forward block
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_head_mask: torch.Tensor,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, ...]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            A tuple `(hidden_states,)`, extended with the attention weights when `output_attentions` is truthy.
        """
        # Self-attention sub-block: norm -> attention -> dropout -> residual
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Feed-forward sub-block: norm -> fc1 + activation -> dropout -> fc2 -> dropout -> residual
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # In float16, clamp to just below the dtype maximum when inf/NaN values are detected
        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
# Maps an attention-implementation name (looked up via `config._attn_implementation`) to the attention class
M2M100_ATTENTION_CLASSES = {"eager": M2M100Attention}

# 从transformers.models.mbart.modeling_mbart.MBartDecoderLayer复制并修改为使用M2M100，替换MBart为M2M100
class M2M100DecoderLayer(nn.Module):
    """Decoder layer holding a causal self-attention, an encoder (cross) attention and a feed-forward block."""

    def __init__(self, config: M2M100Config):
        super().__init__()
        self.embed_dim = config.d_model

        # Self-attention; the concrete class is chosen by `config._attn_implementation`
        self.self_attn = M2M100_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            is_causal=True,
            config=config,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]  # activation selected by the config
        self.activation_dropout = config.activation_dropout

        # Layer norm paired with the self-attention
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        # Attention over the encoder output, built from the same implementation table
        self.encoder_attn = M2M100_ATTENTION_CLASSES[config._attn_implementation](
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            config=config,
        )
        # Layer norm paired with the encoder attention
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        # Feed-forward block: embed_dim -> decoder_ffn_dim -> embed_dim
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)

        # Layer norm paired with the feed-forward block
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        # NOTE(review): the layer computation is not present in the visible source — only a placeholder
        # `pass`, which was indented at method level and therefore did not parse. The placeholder is kept
        # (now correctly indented); the real body still has to be supplied.
        pass

# Common base class of the M2M100 models: sets the class-level configuration and the weight initializer
class M2M100PreTrainedModel(PreTrainedModel):
    config_class = M2M100Config  # configuration class used by these models
    base_model_prefix = "model"  # attribute name under which the base model is stored
    supports_gradient_checkpointing = True

    # Module class names listed as non-splittable (consumed by the base class)
    _no_split_modules = ["M2M100Attention"]

    def _init_weights(self, module):
        # Draw Linear / Embedding weights from N(0, init_std); zero Linear biases and the padding row
        init_std = self.config.init_std

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            bias = module.bias
            if bias is not None:
                bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            pad_row = module.padding_idx
            if pad_row is not None:
                module.weight.data[pad_row].zero_()

# Docstring fragment shared by the M2M100 model classes (passed to `add_start_docstrings`).
# It must be a single raw string: terminating it early leaves the "Parameters" section as stray text.
M2M_100_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`M2M100Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Translation usage example, written as interpreter-style (`>>>`) lines inside a Markdown code fence
M2M_100_GENERATION_EXAMPLE = r"""
    Translation example:

    ```
    >>> from transformers import AutoTokenizer, M2M100ForConditionalGeneration

    >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")

    >>> text_to_translate = "Life is like a box of chocolates"
    >>> model_inputs = tokenizer(text_to_translate, return_tensors="pt")

    >>> # translate to French
    >>> gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
    >>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
    ```
"""

# Inputs docstring passed to `add_start_docstrings_to_model_forward`.
# NOTE(review): the string is empty in this source; presumably the argument documentation was elided — confirm.
M2M_100_INPUTS_DOCSTRING = r"""
"""


class M2M100Encoder(M2M100PreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`M2M100EncoderLayer`].

    Args:
        config: M2M100Config
        embed_tokens (nn.Embedding): optional embedding whose weight tensor is reused by this encoder's token
            embedding
    """

    def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        # Embedding scale factor: sqrt(d_model) when `config.scale_embedding` is set, otherwise 1.0
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        # Embedding layer for tokens
        self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)

        # Share the weight tensor of the embedding handed in by the caller, if any
        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight

        # Positional embedding for token positions
        self.embed_positions = M2M100SinusoidalPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
            self.padding_idx,
        )

        # List of encoder layers
        self.layers = nn.ModuleList([M2M100EncoderLayer(config) for _ in range(config.encoder_layers)])

        # Layer normalization
        self.layer_norm = nn.LayerNorm(config.d_model)

        # Gradient checkpointing disabled by default
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Forward pass of the encoder.
        # NOTE(review): only a placeholder `pass` is present in this source, so the method returns None as
        # written; `M2M100Model.forward` indexes the result (`encoder_outputs[0]`) — the real body is needed.
        pass


class M2M100Decoder(M2M100PreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`M2M100DecoderLayer`]

    Args:
        config: M2M100Config
        embed_tokens (nn.Embedding): optional embedding whose weight tensor is reused by this decoder's token
            embedding
    """

    def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        # Hyper-parameters read from the configuration
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        # Embedding scale factor: sqrt(d_model) when `config.scale_embedding` is set, otherwise 1.0
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        # Token embedding of size (vocab_size, d_model) with `padding_idx` as the padding row
        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)

        # Share the weight tensor of the embedding handed in by the caller, if any
        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight

        # Sinusoidal positional embedding
        self.embed_positions = M2M100SinusoidalPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            self.padding_idx,
        )

        # Stack of decoder layers, all built from the same configuration
        self.layers = nn.ModuleList([M2M100DecoderLayer(config) for _ in range(config.decoder_layers)])

        # Layer normalization
        self.layer_norm = nn.LayerNorm(config.d_model)

        # Gradient checkpointing disabled by default
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Forward pass of the decoder.
        # NOTE(review): the source ended inside the parameter list (no closing parenthesis, no body), which
        # did not parse. The signature is closed here with the same placeholder used by
        # `M2M100Encoder.forward`; the real body still has to be supplied.
        pass
# Bare encoder-decoder model; the decorator prepends the shared start docstring to the class docstring
@add_start_docstrings(
    "The bare M2M100 Model outputting raw hidden-states without any specific head on top.",
    M2M_100_START_DOCSTRING,
)
class M2M100Model(M2M100PreTrainedModel):
    # Parameter names declared as tied to the shared embedding
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: M2M100Config):
        super().__init__(config)

        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        # One embedding handed to both the encoder and the decoder
        self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)

        self.encoder = M2M100Encoder(config, self.shared)
        self.decoder = M2M100Decoder(config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, value):
        self.shared = value
        # Keep the encoder and decoder pointing at the new shared embedding
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def _tie_weights(self):
        # Only tie when the configuration asks for tied word embeddings
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_model_forward(M2M_100_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Seq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
        # Every flag left as None falls back to the value stored in the configuration
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the encoder only when the caller did not supply its outputs
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Tuple mode: decoder outputs followed by encoder outputs
        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
# M2M100 model with a language-modeling head; the decorator prepends the shared start docstring
@add_start_docstrings(
    "The M2M100 Model with a language modeling head. Can be used for summarization.", M2M_100_START_DOCSTRING
)
class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
    # Attribute name under which the base model is stored
    base_model_prefix = "model"
    # Parameter names declared as tied together
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: M2M100Config):
        super().__init__(config)
        self.model = M2M100Model(config)
        # Bias-free projection from d_model to the size of the shared embedding table
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # The three decorators below prepend the inputs docstring, expand the "Returns:" line for
    # `Seq2SeqLMOutput`, and append the generation example to the docstring of `forward`.
    @add_start_docstrings_to_model_forward(M2M_100_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(M2M_100_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        """
        # Fall back to the configuration when the caller did not choose an output format
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            # Without explicit decoder inputs, derive them by shifting the labels one position to the right
            if decoder_input_ids is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project the decoder hidden states onto the vocabulary
        lm_logits = self.lm_head(outputs[0])

        masked_lm_loss = None
        if labels is not None:
            # Move labels to the device of the logits before computing the loss
            labels = labels.to(lm_logits.device)
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Tuple mode: (loss?, logits, *remaining model outputs)
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """
        Assemble the keyword arguments for one generation step.

        When `past_key_values` is given, `decoder_input_ids` is trimmed so that only the ids not yet covered by
        the cache are kept. Extra `kwargs` are accepted and ignored.

        Returns:
            dict with the keys `input_ids` (always `None`), `encoder_outputs`, `past_key_values`,
            `decoder_input_ids`, `attention_mask`, `head_mask`, `decoder_head_mask`, `cross_attn_head_mask`
            and `use_cache`.
        """
        # cut decoder_input_ids if past_key_values is used
        if past_key_values is not None:
            # Cached length, read from dimension 2 of the first cached tensor of the first layer
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if decoder_input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = decoder_input_ids.shape[1] - 1

            decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Return a new cache tuple whose tensors are re-indexed along dimension 0 by `beam_idx`."""
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past

`.\models\m2m_100\tokenization_m2m_100.py`

# 版权声明和许可声明，说明代码的版权和使用条款
# 请注意，这部分代码不会执行，仅作为声明性文本存在

"""Tokenization classes for M2M100."""
# 引入所需的模块和库，包括json、os、Path、copyfile和typing等
import json
import os
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union

# 引入sentencepiece库，用于处理分词
import sentencepiece

# 引入日志记录模块
from ...tokenization_utils import BatchEncoding, PreTrainedTokenizer
from ...utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# The "▁" character; presumably the SentencePiece word-boundary marker (not referenced elsewhere in this excerpt)
SPIECE_UNDERLINE = "▁"

# File names under which each tokenizer resource is stored
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "spm_file": "sentencepiece.bpe.model",
    "tokenizer_config_file": "tokenizer_config.json",
}

# Download URL of each resource file, keyed by resource and then by pretrained checkpoint name
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/vocab.json",
        "facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/vocab.json",
    },
    "spm_file": {
        "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentencepiece.bpe.model",
        "facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/sentencepiece.bpe.model",
    },
    "tokenizer_config_file": {
        "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/tokenizer_config.json",
        "facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/tokenizer_config.json",
    },
}

# Per-checkpoint size values; exposed by the tokenizer class as `max_model_input_sizes`
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/m2m100_418M": 1024,
}

# Language codes available for each `language_codes` setting ("m2m100" or "wmt21")
# fmt: off
FAIRSEQ_LANGUAGE_CODES = {
    "m2m100": ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"],
    "wmt21": ['en', 'ha', 'is', 'ja', 'cs', 'ru', 'zh', 'de']
}
# fmt: on


class M2M100Tokenizer(PreTrainedTokenizer):
    """
    Construct an M2M100 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        spm_file (`str`):
            Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary.
        src_lang (`str`, *optional*):
            A string representing the source language. Falls back to `"en"` when not given.
        tgt_lang (`str`, *optional*):
            A string representing the target language.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token; forwarded to the base class.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        language_codes (`str`, *optional*, defaults to `"m2m100"`):
            What language codes to use. Should be one of `"m2m100"` or `"wmt21"`.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        num_madeup_words (`int`, *optional*, defaults to 8):
            Stored on the instance and forwarded to the base class; not otherwise used by this class.

    Examples:

    ```python
    >>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

    >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
    >>> outputs = model(**model_inputs)  # should work
    ```
    """

    vocab_files_names = VOCAB_FILES_NAMES
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]

    # Replaced per instance by set_src_lang_special_tokens / set_tgt_lang_special_tokens
    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        spm_file,
        src_lang=None,
        tgt_lang=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        language_codes="m2m100",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        num_madeup_words=8,
        **kwargs,
    ) -> None:
        # Use an empty dict when no SentencePiece keyword arguments were supplied
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # Language-code -> "__code__" token table for the selected code set
        self.language_codes = language_codes
        fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes]
        self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in fairseq_language_code}

        additional_special_tokens = kwargs.pop("additional_special_tokens", [])
        for lang_code in fairseq_language_code:
            token = self.get_lang_token(lang_code)
            # NOTE(review): the second operand is a chained comparison, i.e.
            # `lang_code not in str(token) and str(token) not in self.added_tokens_encoder`.
            # Tokens have the form "__{lang_code}__", so its first half is always False and nothing is
            # appended here. Confirm the intended condition before changing it: evaluating
            # `self.added_tokens_encoder` at this point would happen before the base initializer has run.
            if token not in additional_special_tokens and lang_code not in str(token) not in self.added_tokens_encoder:
                additional_special_tokens.append(token)

        # Token <-> id tables loaded from the JSON vocabulary
        self.vocab_file = vocab_file
        self.encoder = load_json(vocab_file)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.spm_file = spm_file
        # SentencePiece model used for splitting text and joining pieces
        self.sp_model = load_spm(spm_file, self.sp_model_kwargs)

        self.encoder_size = len(self.encoder)

        # Language tokens get the ids immediately after the base vocabulary, in code-list order
        self.lang_token_to_id = {
            self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)
        }
        self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)}
        self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()}

        # The source language defaults to English
        self._src_lang = src_lang if src_lang is not None else "en"
        self.tgt_lang = tgt_lang
        self.cur_lang_id = self.get_lang_id(self._src_lang)

        self.num_madeup_words = num_madeup_words

        super().__init__(
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            language_codes=language_codes,
            sp_model_kwargs=self.sp_model_kwargs,
            additional_special_tokens=additional_special_tokens,
            num_madeup_words=num_madeup_words,
            **kwargs,
        )
        # Start in source-language mode
        self.set_src_lang_special_tokens(self._src_lang)

    @property
    def vocab_size(self) -> int:
        """Number of entries in the JSON vocabulary (`self.encoder`)."""
        return len(self.encoder)

    def get_vocab(self) -> Dict:
        """Return a token -> id dictionary covering ids `0..vocab_size-1` plus the added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    @property
    def src_lang(self) -> str:
        """The current source language code."""
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        # Changing the source language also refreshes the prefix/suffix special tokens
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def _tokenize(self, text: str) -> List[str]:
        """Split `text` into string pieces with the SentencePiece model."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Convert a token (str) to an id; language tokens are looked up first, unknown tokens map to the unk id."""
        if token in self.lang_token_to_id:
            return self.lang_token_to_id[token]
        return self.encoder.get(token, self.encoder[self.unk_token])

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the decoder."""
        if index in self.id_to_lang_token:
            return self.id_to_lang_token[index]
        # Ids missing from the decoder map to the unknown token
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # Special tokens are emitted verbatim rather than decoded by the SentencePiece model
            if token in self.all_special_tokens:
                out_string += self.sp_model.decode(current_sub_tokens) + token
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            # The base class handles sequences that already contain special tokens
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)

        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones

        # Pairs are concatenated without a separator, so only the prefix and suffix are marked
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating and adding special tokens. The
        result has the form `prefix_tokens + X + suffix_tokens`, where `X` is the sequence (or both sequences, joined
        without a separator) and the prefix/suffix are whatever `set_src_lang_special_tokens` /
        `set_tgt_lang_special_tokens` last installed (`[lang_code_id]` and `[eos_id]` respectively).

        Pairs of sequences are not the expected use case, but they will be handled without a separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens

        # We maintain pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def __getstate__(self) -> Dict:
        # The SentencePiece processor is left out of the serialized state and rebuilt in __setstate__
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d: Dict) -> None:
        self.__dict__ = d

        # Backward compatibility: older states may lack sp_model_kwargs
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        # Reload the processor from the stored model path
        self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the JSON vocabulary and the SentencePiece model into `save_directory`.

        Args:
            save_directory (`str`): Existing directory to write into.
            filename_prefix (`str`, *optional*): When given, prepended (followed by "-") to both file names.

        Returns:
            The paths of the vocabulary file and the SentencePiece model file, as strings.

        Raises:
            OSError: If `save_directory` is not a directory.
        """
        save_dir = Path(save_directory)
        if not save_dir.is_dir():
            raise OSError(f"{save_directory} should be a directory")

        vocab_save_path = save_dir / (
            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
        )
        spm_save_path = save_dir / (
            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"]
        )

        save_json(self.encoder, vocab_save_path)

        # Copy the original model file when it exists and is not already the destination;
        # when it no longer exists, dump the in-memory model instead.
        if os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path) and os.path.isfile(self.spm_file):
            copyfile(self.spm_file, spm_save_path)
        elif not os.path.isfile(self.spm_file):
            with open(spm_save_path, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (str(vocab_save_path), str(spm_save_path))

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "ro",
        **kwargs,
    ) -> BatchEncoding:
        """Set the source/target languages, then delegate batch preparation to the base class."""
        # Assigning src_lang goes through the property setter, which also installs the
        # source-language special tokens, so no separate call is needed.
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _build_translation_inputs(self, raw_inputs, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, **extra_kwargs)
        # The target language id is passed along as the forced first generated token
        tgt_lang_id = self.get_lang_id(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def _switch_to_input_mode(self):
        # Install the special tokens of the source language
        self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        # Install the special tokens of the target language
        self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang: str) -> None:
        """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
        lang_token = self.get_lang_token(src_lang)
        self.cur_lang_id = self.lang_token_to_id[lang_token]
        self.prefix_tokens = [self.cur_lang_id]
        self.suffix_tokens = [self.eos_token_id]

    def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
        """Reset the special tokens to the target lang setting. prefix=[tgt_lang_code] and suffix=[eos]."""
        lang_token = self.get_lang_token(tgt_lang)
        self.cur_lang_id = self.lang_token_to_id[lang_token]
        self.prefix_tokens = [self.cur_lang_id]
        self.suffix_tokens = [self.eos_token_id]

    def get_lang_token(self, lang: str) -> str:
        """Return the special token for language code `lang` (raises `KeyError` for an unknown code)."""
        return self.lang_code_to_token[lang]

    def get_lang_id(self, lang: str) -> int:
        """Return the id of the special token for language code `lang`."""
        lang_token = self.get_lang_token(lang)
        return self.lang_token_to_id[lang_token]
def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    """Create a `SentencePieceProcessor` from `sp_model_kwargs` and load the model file at `path` into it."""
    processor = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    # `path` may be a path-like object, so it is converted to `str` before loading
    processor.Load(str(path))
    return processor


def load_json(path: str) -> Union[Dict, List]:
    """
    Read the JSON document stored at `path` and return the parsed object.

    The file is decoded as UTF-8 explicitly rather than with the platform default encoding, so non-ASCII entries
    are read the same way on every system.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(data, path: str) -> None:
    """Serialize `data` as JSON with a two-space indent into the file at `path`, overwriting it."""
    with open(path, "w") as out_file:
        json.dump(data, out_file, indent=2)

`.\models\m2m_100\__init__.py`

# 导入必要的类型检查模块
from typing import TYPE_CHECKING

# 导入可选的依赖未安装异常和延迟加载模块
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available

# Map each submodule to the public names it provides; consumed by _LazyModule below
_import_structure = {
    "configuration_m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config", "M2M100OnnxConfig"],
    "tokenization_m2m_100": ["M2M100Tokenizer"],
}

# The modeling submodule is registered only when torch is available;
# otherwise the raised OptionalDependencyNotAvailable is swallowed and nothing is added.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the model classes
    _import_structure["modeling_m2m_100"] = [
        "M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST",
        "M2M100ForConditionalGeneration",
        "M2M100Model",
        "M2M100PreTrainedModel",
    ]

# Under static type checking, import the names eagerly so type checkers can see them
if TYPE_CHECKING:
    from .configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config, M2M100OnnxConfig
    from .tokenization_m2m_100 import M2M100Tokenizer

    # Same torch guard as above, applied to the eager imports
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # torch is available: import the model classes
        from .modeling_m2m_100 import (
            M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST,
            M2M100ForConditionalGeneration,
            M2M100Model,
            M2M100PreTrainedModel,
        )

# At runtime, replace this module in sys.modules with a _LazyModule built from _import_structure
else:
    import sys

    # Swap the module object registered under this package's name
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\mamba\configuration_mamba.py`

# coding=utf-8
# 版权所有 2024 年 HuggingFace Inc. 团队。
#
# 根据 Apache 许可证 2.0 版本使用此文件；除非遵守许可证，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“原样”分发的软件分发在“按现状”基础上，
# 没有任何明示或暗示的保证或条件。请查看许可证获取特定语言的权限及限制。
"""MAMBA configuration"""

import math  # 导入 math 模块

from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...utils import logging  # 导入日志模块


logger = logging.get_logger(__name__)  # module-level logger

# Checkpoint name -> URL of its config.json
MAMBA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "state-spaces/mamba-2.8b": "https://huggingface.co/state-spaces/mamba-2.8b/resolve/main/config.json",
}

class MambaConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`MambaModel`]. It is used to instantiate a MAMBA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the MAMBA
    [state-spaces/mamba-2.8b](https://huggingface.co/state-spaces/mamba-2.8b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50280):
            Vocabulary size.
        hidden_size (`int`, *optional*, defaults to 768):
            Hidden size.
        state_size (`int`, *optional*, defaults to 16):
            State size.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the layer normalization.
        pad_token_id (`int`, *optional*, defaults to 0):
            Id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 0):
            Id of the beginning-of-sequence token.
        eos_token_id (`int`, *optional*, defaults to 0):
            Id of the end-of-sequence token.
        expand (`int`, *optional*, defaults to 2):
            Expansion factor; `intermediate_size` is computed as `int(expand * hidden_size)`.
        conv_kernel (`int`, *optional*, defaults to 4):
            Convolution kernel size.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias.
        use_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether the convolution uses a bias.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Name of the hidden activation function.
        initializer_range (`float`, *optional*, defaults to 0.1):
            Initialization range.
        residual_in_fp32 (`bool`, *optional*, defaults to `True`):
            Whether residuals are kept in float32.
        time_step_rank (`Union[int, str]`, *optional*, defaults to `"auto"`):
            Rank of the time-step projection. `"auto"` resolves to `math.ceil(hidden_size / 16)`.
        time_step_scale (`float`, *optional*, defaults to 1.0):
            Time-step scale.
        time_step_min (`float`, *optional*, defaults to 0.001):
            Minimum time step.
        time_step_max (`float`, *optional*, defaults to 0.1):
            Maximum time step.
        time_step_init_scheme (`str`, *optional*, defaults to `"random"`):
            Time-step initialization scheme.
        time_step_floor (`float`, *optional*, defaults to 0.0001):
            Lower bound for the time step.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `False`):
            Whether to rescale the pre-norm residual.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to use the cache.

    Example:

    ```
    >>> from transformers import MambaConfig, MambaModel

    >>> # Initializing a Mamba configuration
    >>> configuration = MambaConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = MambaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "mamba"

    def __init__(
        self,
        vocab_size=50280,
        hidden_size=768,
        state_size=16,
        num_hidden_layers=32,
        layer_norm_epsilon=1e-5,
        pad_token_id=0,
        bos_token_id=0,
        eos_token_id=0,
        expand=2,
        conv_kernel=4,
        use_bias=False,
        use_conv_bias=True,
        hidden_act="silu",
        initializer_range=0.1,
        residual_in_fp32=True,
        time_step_rank="auto",
        time_step_scale=1.0,
        time_step_min=0.001,
        time_step_max=0.1,
        time_step_init_scheme="random",
        time_step_floor=1e-4,
        rescale_prenorm_residual=False,
        use_cache=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.num_hidden_layers = num_hidden_layers
        self.layer_norm_epsilon = layer_norm_epsilon
        self.conv_kernel = conv_kernel
        self.expand = expand
        # Derived width of the expanded inner representation
        self.intermediate_size = int(expand * self.hidden_size)
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.use_bias = use_bias
        self.use_conv_bias = use_conv_bias
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        # "auto" derives the rank from the hidden size; any other value is stored as given
        self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
        self.time_step_scale = time_step_scale
        self.time_step_min = time_step_min
        self.time_step_max = time_step_max
        self.time_step_init_scheme = time_step_init_scheme
        self.time_step_floor = time_step_floor
        self.rescale_prenorm_residual = rescale_prenorm_residual
        self.residual_in_fp32 = residual_in_fp32
        self.use_cache = use_cache

        # Single base-class initialization, carrying the token ids and any remaining keyword arguments
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)

`.\models\mamba\modeling_mamba.py`

# coding=utf-8
# Copyright 2024 state-spaces/mamba org and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch MAMBA model."""

import math
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_mamba import MambaConfig


logger = logging.get_logger(__name__)

# Import the optional `mamba_ssm` kernels when that package is available; otherwise bind the names to None
if is_mamba_ssm_available():
    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None

# Same pattern for the optional `causal_conv1d` kernels
if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None

# The fast path requires every one of the optional kernels to have been imported (none may be None)
is_fast_path_available = all(
    (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)

# Documentation variables
_CHECKPOINT_FOR_DOC = "state-spaces/mamba-130m-hf"
_CONFIG_FOR_DOC = "MambaConfig"

# List of pretrained model archive names for MAMBA models
MAMBA_PRETRAINED_MODEL_ARCHIVE_LIST = []  # See all MAMBA models at https://huggingface.co/models?filter=mamba


class MambaCache:
    """Per-layer zero-initialized convolution and SSM state buffers, plus a sequence-length offset."""

    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
        """
        Allocate the cache.

        Args:
            config (MambaConfig): Supplies `intermediate_size`, `state_size`, `conv_kernel` and `num_hidden_layers`.
            batch_size (int): Leading dimension of every state tensor.
            dtype (torch.dtype, optional): Data type of the state tensors (default: torch.float16).
            device (torch.device, optional): Device of the state tensors (default: None).
        """
        self.seqlen_offset = 0
        self.dtype = dtype

        channels = config.intermediate_size
        layer_ids = range(config.num_hidden_layers)

        def zero_states(last_dim):
            # One (batch_size, intermediate_size, last_dim) tensor of zeros per layer index
            return {
                layer: torch.zeros(batch_size, channels, last_dim, device=device, dtype=dtype)
                for layer in layer_ids
            }

        # Convolution states end in the kernel size, SSM states in the state size
        self.conv_states = zero_states(config.conv_kernel)
        self.ssm_states = zero_states(config.state_size)


class MambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    """

    def __init__(self, config, layer_idx):
        """
        Build the projections, the depthwise convolution and the learnable A/D parameters.

        Args:
            config: Configuration providing `hidden_size`, `state_size`, `conv_kernel`, `intermediate_size`,
                `time_step_rank`, `use_conv_bias`, `use_bias` and `hidden_act`.
            layer_idx: Index of this layer; stored as `self.layer_idx`.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.state_size
        self.conv_kernel_size = config.conv_kernel
        self.intermediate_size = config.intermediate_size
        self.time_step_rank = config.time_step_rank
        self.layer_idx = layer_idx
        self.use_conv_bias = config.use_conv_bias

        # Depthwise 1D convolution (groups == channels) padded by kernel_size - 1
        self.conv1d = nn.Conv1d(
            in_channels=self.intermediate_size,
            out_channels=self.intermediate_size,
            bias=config.use_conv_bias,
            kernel_size=config.conv_kernel,
            groups=self.intermediate_size,
            padding=config.conv_kernel - 1,
        )

        # Keep both the activation name and the resolved callable
        self.activation = config.hidden_act
        self.act = ACT2FN[config.hidden_act]

        # projection of the input hidden states
        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
        # selective projection used to make dt, B and C input dependent
        self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
        # time step projection (discretization)
        self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)

        # S4D real initialization. These are not discretized!
        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
        A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
        A = A.expand(self.intermediate_size, -1).contiguous()

        # A is stored as its logarithm; D starts as ones. Both are learnable.
        self.A_log = nn.Parameter(torch.log(A))
        self.D = nn.Parameter(torch.ones(self.intermediate_size))
        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
        self.use_bias = config.use_bias

        # Tell the user once when the optional kernels are missing
        if not is_fast_path_available:
            logger.warning_once(
                "The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
                " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
                " https://github.com/Dao-AILab/causal-conv1d"
            )

    # NOTE(review): `cuda_kernels_forward` and `slow_forward`, which `forward` dispatches to, are not
    # defined in this excerpt; presumably they belong between the markers below. Confirm.
    # fmt: off
    # fmt: on

    def forward(self, hidden_states, cache_params: Optional[MambaCache] = None):
        """Dispatch to the kernel-based path when it is available and the weights are on CUDA, else the slow path."""
        if is_fast_path_available and "cuda" in self.x_proj.weight.device.type:
            return self.cuda_kernels_forward(hidden_states, cache_params)
        return self.slow_forward(hidden_states, cache_params)
# 定义一个自定义的神经网络模块，用于实现MambaRMSNorm规范化层
class MambaRMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm

        Args:
            hidden_size: Length of the learnable scale vector (size of the normalized last dimension).
            eps: Constant added to the mean square before the reciprocal square root.
        """
        super().__init__()
        # Added under the square root so an all-zero input does not divide by zero.
        self.variance_epsilon = eps
        # Learnable scale, initialized to ones.
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalize `hidden_states` by its root mean square and apply the learnable scale.

        The statistics are computed in float32; the normalized tensor is cast back to the
        input dtype before it is multiplied by `self.weight`.
        """
        original_dtype = hidden_states.dtype
        as_fp32 = hidden_states.to(torch.float32)
        # Mean of squares over the last dimension, kept as a size-1 axis for broadcasting.
        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normalized = as_fp32 * inverse_rms
        return self.weight * normalized.to(original_dtype)


# 定义一个自定义的神经网络模块，表示Mamba模型中的一个块
class MambaBlock(nn.Module):
    """One residual block: RMS-normalize the input, run the mixer, add the input back."""

    def __init__(self, config, layer_idx):
        """
        Args:
            config: Configuration object; `residual_in_fp32`, `hidden_size` and `layer_norm_epsilon`
                are read here, and the whole object is handed to `MambaMixer`.
            layer_idx: Index of this block, stored and forwarded to `MambaMixer`.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        # When set, the skip branch is carried in float32.
        self.residual_in_fp32 = config.residual_in_fp32
        self.norm = MambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.mixer = MambaMixer(config, layer_idx=layer_idx)

    def forward(self, hidden_states, cache_params: Optional[MambaCache] = None):
        """Return `residual + mixer(norm(hidden_states))`.

        Args:
            hidden_states: Block input; also used (optionally upcast to float32) as the residual.
            cache_params: Optional cache object passed through to the mixer.
        """
        # The skip branch uses the un-normalized input, upcast only when configured.
        skip = hidden_states.to(torch.float32) if self.residual_in_fp32 else hidden_states
        # The norm receives the input cast to the dtype of its own weight.
        normed = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
        mixed = self.mixer(normed, cache_params=cache_params)
        return skip + mixed


# MambaPreTrainedModel是一个抽象类，用于处理权重初始化，下载和加载预训练模型的简单接口
class MambaPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = MambaConfig
    # Prefix naming the base model; `MambaForCausalLM` stores its `MambaModel` as `self.backbone`.
    base_model_prefix = "backbone"
    # Module class names that should not be split (presumably consumed by the device-placement
    # logic of `PreTrainedModel` -- confirm against the base class).
    _no_split_modules = ["MambaBlock"]
    # Declares that gradient checkpointing is supported.
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights of a single sub-module.

        Args:
            module: The sub-module to initialize. `MambaMixer`, `nn.Linear` and `nn.Embedding`
                instances receive dedicated treatment; when `config.rescale_prenorm_residual` is
                set, any parameter named exactly `out_proj.weight` is additionally re-initialized
                and rescaled.
        """
        if isinstance(module, MambaMixer):
            # Flag the state-space parameters so they can be excluded from weight decay.
            module.A_log._no_weight_decay = True
            module.D._no_weight_decay = True

            # Scale of the time-step projection weights.
            dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
            if self.config.time_step_init_scheme == "constant":
                nn.init.constant_(module.dt_proj.weight, dt_init_std)
            elif self.config.time_step_init_scheme == "random":
                nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std)

            # Sample time steps log-uniformly in [time_step_min, time_step_max], then floor them.
            dt = torch.exp(
                torch.rand(self.config.intermediate_size)
                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
                + math.log(self.config.time_step_min)
            ).clamp(min=self.config.time_step_floor)
            # Inverse of softplus: log(exp(dt) - 1) == dt + log(-expm1(-dt)), so that
            # softplus(bias) reproduces the sampled dt.
            inv_dt = dt + torch.log(-torch.expm1(-dt))
            with torch.no_grad():
                module.dt_proj.bias.copy_(inv_dt)
            # Protect this bias from the generic zero-initialization of nn.Linear biases below.
            module.dt_proj.bias._no_reinit = True

        if isinstance(module, nn.Linear):
            # Zero every linear bias that has not been marked `_no_reinit`.
            if module.bias is not None:
                if not getattr(module.bias, "_no_reinit", False):
                    nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, std=self.config.initializer_range)

        if self.config.rescale_prenorm_residual:
            # Re-initialize the output projection and shrink it by sqrt(number of layers), so the
            # contributions accumulated on the residual path do not grow with model depth.
            for name, p in module.named_parameters():
                if name in ["out_proj.weight"]:
                    # kaiming_uniform_ with a=sqrt(5), then divide by sqrt(number of layers).
                    nn.init.kaiming_uniform_(p, a=math.sqrt(5))
                    with torch.no_grad():
                        # The layer count is read from `num_hidden_layers`, the same attribute
                        # `MambaModel.__init__` uses to build its layer stack.
                        p /= math.sqrt(self.config.num_hidden_layers)
"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
"""

@dataclass
class MambaOutput(ModelOutput):
    """
    Class for the MAMBA model outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    cache_params: Optional[MambaCache] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class MambaCausalLMOutput(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    cache_params: Optional[MambaCache] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None


# Class-level documentation shared by the Mamba model classes through `add_start_docstrings`.
# It states that the models inherit the generic `PreTrainedModel` functionality (downloading or
# saving, resizing the input embeddings, pruning heads), that they are regular `torch.nn.Module`
# subclasses, and that constructing a model from a `MambaConfig` loads only the configuration,
# not the weights (use `from_pretrained` for those).
MAMBA_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MambaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Argument documentation shared by the `forward` methods of the Mamba model classes; it is attached
# to them with the `add_start_docstrings_to_model_forward` decorator.
MAMBA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            Indices of input sequence tokens in the vocabulary.

            If `cache_params.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

@add_start_docstrings(
    "The bare MAMBA Model transformer outputting raw hidden-states without any specific head on top.",
    MAMBA_START_DOCSTRING,
)
class MambaModel(MambaPreTrainedModel):
    """
    Core MAMBA model: embeds the tokens, runs the stack of `MambaBlock` layers and applies a final RMS norm,
    returning raw hidden states without any task-specific head.
    """

    def __init__(self, config):
        """
        Build the embedding table, the block stack and the final norm.

        Args:
            config (MambaConfig): Model configuration. `vocab_size`, `hidden_size`, `num_hidden_layers` and
                `layer_norm_epsilon` are read here.

        Attributes:
            embeddings (nn.Embedding): Token embedding table.
            layers (nn.ModuleList): The `MambaBlock` layers, one per hidden layer.
            gradient_checkpointing (bool): Whether gradient checkpointing is active (off by default).
            norm_f (MambaRMSNorm): RMS norm applied to the output of the last block.
        """
        super().__init__(config)

        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([MambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)])

        self.gradient_checkpointing = False
        self.norm_f = MambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embeddings

    def set_input_embeddings(self, new_embeddings):
        """Replace the token embedding module with `new_embeddings`."""
        self.embeddings = new_embeddings

    # The documentation decorators are applied to the one real `forward`. A second, stub
    # definition would be shadowed by this one and would take the generated docstring with it.
    @add_start_docstrings_to_model_forward(MAMBA_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MambaOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_params: Optional[MambaCache] = None,
        use_cache: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,  # extra keyword arguments (e.g. the tokenizer's `attention_mask`) are accepted and ignored
    ) -> Union[Tuple, MambaOutput]:
        # Returns a `MambaOutput`, or a plain tuple of its non-None fields when `return_dict` is
        # falsy. Raises ValueError unless exactly one of `input_ids` / `inputs_embeds` is given.

        # Resolve unset flags from the configuration; caching defaults to off while training.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # The XOR is true both when neither input is given and when both are given.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        # Caching is switched off whenever gradient checkpointing is active during training.
        if self.gradient_checkpointing and self.training and use_cache:
            use_cache = False

        # Create a fresh cache sized for this batch when caching is requested but none was supplied.
        if cache_params is None and use_cache:
            cache_params = MambaCache(
                self.config, inputs_embeds.size(0), device=inputs_embeds.device, dtype=inputs_embeds.dtype
            )

        hidden_states = inputs_embeds
        all_hidden_states = () if output_hidden_states else None
        for mixer_block in self.layers:
            if self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(mixer_block.__call__, hidden_states, cache_params)
            else:
                hidden_states = mixer_block(hidden_states, cache_params=cache_params)

            # Record the output of every block when requested.
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        # Advance the cache offset by the number of positions just processed.
        if use_cache:
            cache_params.seqlen_offset += inputs_embeds.shape[1]

        hidden_states = self.norm_f(hidden_states)

        # The normalized final state is appended as the last recorded entry.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None)

        return MambaOutput(
            last_hidden_state=hidden_states,
            cache_params=cache_params if use_cache else None,
            hidden_states=all_hidden_states,
        )
@add_start_docstrings(
    """
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    MAMBA_START_DOCSTRING,
)
class MambaForCausalLM(MambaPreTrainedModel):
    # Parameter names declared as tied (the class description above says the head is tied to the
    # input embeddings).
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        """Build the `MambaModel` backbone and the bias-free language-modeling head."""
        super().__init__(config)
        self.backbone = MambaModel(config)
        # Projects hidden states to vocabulary logits.
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def get_input_embeddings(self):
        """Return the backbone's input embedding module."""
        return self.backbone.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        """Replace the backbone's input embedding module; returns whatever the backbone setter returns."""
        return self.backbone.set_input_embeddings(new_embeddings)

    def _update_model_kwargs_for_generation(
        self, outputs: ModelOutput, model_kwargs: Dict[str, Any], **kwargs
    ) -> Dict[str, Any]:
        """Copy `cache_params` from `outputs` (None when absent) into `model_kwargs` and return the dict."""
        model_kwargs["cache_params"] = outputs.get("cache_params", None)
        return model_kwargs

    def prepare_inputs_for_generation(
        self, input_ids, cache_params: Optional[MambaCache] = None, inputs_embeds=None, attention_mask=None, **kwargs
    ):
        """Assemble the keyword arguments for one generation step.

        Returns a dict holding either `inputs_embeds` or `input_ids`, plus `cache_params`.
        `attention_mask` and any extra keyword arguments are accepted but not used.
        """
        # With a cache present, only the last token of `input_ids` is passed on.
        if cache_params is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        # Embeddings are used only when there is no cache yet; otherwise token ids are used.
        if inputs_embeds is not None and cache_params is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs["cache_params"] = cache_params
        return model_inputs

    @add_start_docstrings_to_model_forward(MAMBA_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MambaCausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_params: Optional[MambaCache] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,
        **kwargs,  # for now we need this for generation
    ) -> Union[Tuple, MambaCausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        mamba_outputs = self.backbone(
            input_ids,
            cache_params=cache_params,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            use_cache=use_cache,
        )
        # Index 0 is the final hidden state for both the tuple and the `MambaOutput` form.
        hidden_states = mamba_outputs[0]

        # Cast to the head's dtype for the projection, then return the logits as float32.
        logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            # Tuple form: (loss,) is prepended only when it was computed.
            output = (logits,) + mamba_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return MambaCausalLMOutput(
            loss=loss,
            logits=logits,
            cache_params=mamba_outputs.cache_params,
            hidden_states=mamba_outputs.hidden_states,
        )

Transformers-源码解析-六十八-

Transformers 源码解析（六十八）

.\models\lxmert\modeling_tf_lxmert.py

.\models\lxmert\tokenization_lxmert.py

.\models\lxmert\tokenization_lxmert_fast.py

.\models\lxmert\__init__.py

.\models\m2m_100\configuration_m2m_100.py

.\models\m2m_100\convert_m2m100_original_checkpoint_to_pytorch.py

.\models\m2m_100\modeling_m2m_100.py

.\models\m2m_100\tokenization_m2m_100.py

.\models\m2m_100\__init__.py

.\models\mamba\configuration_mamba.py

.\models\mamba\modeling_mamba.py

`.\models\lxmert\modeling_tf_lxmert.py`

`.\models\lxmert\tokenization_lxmert.py`

`.\models\lxmert\tokenization_lxmert_fast.py`

`.\models\lxmert\__init__.py`

`.\models\m2m_100\configuration_m2m_100.py`

`.\models\m2m_100\convert_m2m100_original_checkpoint_to_pytorch.py`

`.\models\m2m_100\modeling_m2m_100.py`

`.\models\m2m_100\tokenization_m2m_100.py`

`.\models\m2m_100\__init__.py`

`.\models\mamba\configuration_mamba.py`

`.\models\mamba\modeling_mamba.py`