Transformers 源码解析（十九）

`.\models\blenderbot\modeling_tf_blenderbot.py`

# coding=utf-8
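# Copyright 2021 The Facebook, Inc and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.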
""" TF 2.0 Blenderbot 模型。"""


from __future__ import annotations

import os
import random
import warnings
from typing import List, Optional, Tuple, Union

import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFBaseModelOutputWithPastAndCrossAttentions,
    TFSeq2SeqLMOutput,
    TFSeq2SeqModelOutput,
)

# 公共 API
from ...modeling_tf_utils import (
    TFCausalLanguageModelingLoss,
    TFPreTrainedModel,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_blenderbot import BlenderbotConfig


# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Checkpoint / config names handed to `add_code_sample_docstrings` when the
# documentation code samples for the model classes are generated.
_CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill"
_CONFIG_FOR_DOC = "BlenderbotConfig"


# Additive attention-mask value for positions that must not be attended to;
# the mask helpers in this file multiply by it.
LARGE_NEGATIVE = -1e8


# Adapted from transformers.models.bart.modeling_tf_bart.shift_tokens_right
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift `input_ids` one position to the right and prepend the decoder start token.

    Args:
        input_ids: 2-D tensor of token ids, indexed as `[batch, position]`.
        pad_token_id: Id written wherever the shifted result holds `-100`.
        decoder_start_token_id: Id placed in the first column of the result.

    Returns:
        A tensor shaped like `input_ids`: the start token followed by all but the last column of
        `input_ids`, with every `-100` replaced by `pad_token_id`.
    """
    ids_dtype = input_ids.dtype
    pad_token_id = tf.cast(pad_token_id, ids_dtype)
    decoder_start_token_id = tf.cast(decoder_start_token_id, ids_dtype)

    # One start token per batch row.
    batch_size = shape_list(input_ids)[0]
    first_column = tf.fill((batch_size, 1), tf.convert_to_tensor(decoder_start_token_id, ids_dtype))

    # Drop the last column and put the start tokens in front.
    shifted = tf.concat([first_column, input_ids[:, :-1]], -1)

    # Labels may carry -100; swap those entries for the padding id.
    is_ignored = shifted == -100
    pad_fill = tf.fill(shape_list(shifted), tf.convert_to_tensor(pad_token_id, ids_dtype))
    shifted = tf.where(is_ignored, pad_fill, shifted)

    # After the replacement every id must be >= 0.
    non_negative = tf.debugging.assert_greater_equal(shifted, tf.constant(0, dtype=ids_dtype))

    # Route the result through the assertion so the check is not optimized away.
    with tf.control_dependencies([non_negative]):
        return tf.identity(shifted)

# Adapted from transformers.models.bart.modeling_tf_bart._make_causal_mask
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
    """
    Build the additive causal mask for uni-directional (decoder) self-attention.

    Entry `[i, j]` of the square part is `0.0` when `j <= i` and `LARGE_NEGATIVE` otherwise. When
    `past_key_values_length > 0`, that many all-zero columns are prepended on the left.

    Args:
        input_ids_shape: Shape whose first two entries are read as `(bsz, tgt_len)`.
        past_key_values_length: Number of zero columns to prepend.

    Returns:
        Tensor of shape `[bsz, 1, tgt_len, tgt_len + past_key_values_length]`.
    """
    batch_size, target_length = input_ids_shape[0], input_ids_shape[1]

    # Start fully blocked, then open up the lower triangle (diagonal included).
    blocked = tf.ones((target_length, target_length)) * LARGE_NEGATIVE
    side = shape_list(blocked)[-1]
    columns = tf.range(side)
    row_limits = tf.reshape(columns + 1, (side, 1))
    causal_mask = tf.where(columns < row_limits, 0.0, blocked)

    if past_key_values_length > 0:
        past_columns = tf.zeros((target_length, past_key_values_length))
        causal_mask = tf.concat([past_columns, causal_mask], axis=-1)

    # Add the head axis and repeat the mask for every batch row.
    return tf.tile(causal_mask[None, None, :, :], (batch_size, 1, 1, 1))


# Adapted from transformers.models.bart.modeling_tf_bart._expand_mask
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expand an attention mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.

    The mask is cast to float; the result is `(1 - mask) * LARGE_NEGATIVE`, so entries equal to 1
    become 0 and entries equal to 0 become `LARGE_NEGATIVE`.

    Args:
        mask: Attention mask of shape `[bsz, seq_len]`.
        tgt_len: Size of the target axis; defaults to the source length when `None`.
    """
    source_length = shape_list(mask)[1]
    if tgt_len is None:
        tgt_len = source_length

    unit = tf.constant(1.0)
    float_mask = tf.cast(mask, dtype=unit.dtype)

    # Insert the head and target axes, then repeat along the target axis.
    tiled = tf.tile(float_mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (unit - tiled) * LARGE_NEGATIVE


class TFBlenderbotLearnedPositionalEmbedding(keras.layers.Embedding):
    """
    Learned positional embeddings, up to a fixed maximum number of positions.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
        """Forward the table size, embedding width and any extra keyword arguments to `keras.layers.Embedding`."""
        super().__init__(num_embeddings, embedding_dim, **kwargs)

    def call(
        self, input_shape: tf.TensorShape, past_key_values_length: int = 0, position_ids: tf.Tensor | None = None
    ):
        """
        Look up the embeddings for the requested positions.

        Args:
            input_shape: Shape of the input, expected to be `[bsz x seqlen]`; only entry 1 is read.
            past_key_values_length: Offset added to the generated positions.
            position_ids: Explicit positions; when given, the two arguments above are not used.
        """
        if position_ids is None:
            # No explicit positions: use 0..seqlen-1 shifted by the number of past positions.
            sequence_length = input_shape[1]
            base_positions = tf.range(sequence_length, delta=1, name="range")
            position_ids = base_positions + past_key_values_length

        return super().call(tf.cast(position_ids, dtype=tf.int32))


# Adapted from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Blenderbot
class TFBlenderbotAttention(keras.layers.Layer):
    """Multi-headed attention from "Attention Is All You Need"."""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        **kwargs,
    ):
        """
        Args:
            embed_dim: Model width; must be divisible by `num_heads`.
            num_heads: Number of attention heads.
            dropout: Rate of the dropout layer stored on `self.dropout`.
            is_decoder: Stored on `self.is_decoder`.
            bias: Whether the four projection layers use a bias term.
            **kwargs: Forwarded to `keras.layers.Layer`.

        Raises:
            ValueError: If `embed_dim` is not divisible by `num_heads`.
        """
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

        self.num_heads = num_heads
        self.dropout = keras.layers.Dropout(dropout)
        self.head_dim = embed_dim // num_heads
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")

    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
        """Reshape `tensor` to `(bsz, seq_len, num_heads, head_dim)` and transpose it to `(bsz, num_heads, seq_len, head_dim)`."""
        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))

    def call(
        self,
        hidden_states: tf.Tensor,
        key_value_states: tf.Tensor | None = None,
        past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
        attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        """
        Main call function defining how the attention layer processes its inputs.

        NOTE(review): the implementation is missing from this excerpt. The original block consisted
        of the signature followed only by a comment, which is a syntax error. This docstring is the
        entire body, so the method returns `None` until the real implementation is restored;
        `TFBlenderbotEncoderLayer.call` unpacks three values from the result.
        """

    def build(self, input_shape=None):
        """Build the four projection layers, each under its own name scope, for inputs of width `embed_dim`."""
        if self.built:
            return
        self.built = True
        # Same order as the layers are created in `__init__`: keys, queries, values, output.
        for projection_name in ("k_proj", "q_proj", "v_proj", "out_proj"):
            projection = getattr(self, projection_name, None)
            if projection is None:
                continue
            with tf.name_scope(projection.name):
                projection.build([None, None, self.embed_dim])
# Adapted from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot
class TFBlenderbotEncoderLayer(keras.layers.Layer):
    """Encoder block: self-attention and a two-layer feed-forward network, each preceded by layer
    normalization and wrapped in a residual connection."""

    def __init__(self, config: BlenderbotConfig, **kwargs):
        """Create the sub-layers from `config`; extra keyword arguments go to `keras.layers.Layer`."""
        super().__init__(**kwargs)
        self.embed_dim = config.d_model
        self.self_attn = TFBlenderbotAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            name="self_attn",
        )
        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        self.dropout = keras.layers.Dropout(config.dropout)
        self.activation_fn = get_tf_activation(config.activation_function)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
        # Feed-forward network: d_model -> encoder_ffn_dim -> d_model.
        self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        layer_head_mask: tf.Tensor,
        training: Optional[bool] = False,
    ):
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`tf.Tensor`): attention mask of size *(batch, 1, tgt_len, src_len)* where padding
                elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                *(encoder_attention_heads,)*
            training: Passed to the dropout layers.

        Returns:
            Tuple `(hidden_states, self_attn_weights)`.
        """
        # --- self-attention sub-block ---
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, self_attn_weights, _ = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask
        )

        # The attention output must keep the shape of the query it was computed from.
        tf.debugging.assert_equal(
            shape_list(hidden_states),
            shape_list(residual),
            message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
        )

        attention_block_output = residual + self.dropout(hidden_states, training=training)

        # --- feed-forward sub-block ---
        ffn_hidden = self.final_layer_norm(attention_block_output)
        ffn_hidden = self.fc1(ffn_hidden)
        ffn_hidden = self.activation_fn(ffn_hidden)
        ffn_hidden = self.activation_dropout(ffn_hidden, training=training)
        ffn_hidden = self.fc2(ffn_hidden)
        ffn_hidden = self.dropout(ffn_hidden, training=training)

        return attention_block_output + ffn_hidden, self_attn_weights

    def build(self, input_shape=None):
        """Build every sub-layer under its own name scope; a second call is a no-op."""
        if self.built:
            return
        self.built = True

        model_width_shape = [None, None, self.embed_dim]
        # (attribute name, input shape given to `build`), in the original build order.
        build_plan = (
            ("self_attn", None),
            ("self_attn_layer_norm", model_width_shape),
            ("fc1", model_width_shape),
            # fc2 consumes the output of fc1, whose width is encoder_ffn_dim.
            ("fc2", [None, None, self.config.encoder_ffn_dim]),
            ("final_layer_norm", model_width_shape),
        )
        for attribute_name, build_shape in build_plan:
            sublayer = getattr(self, attribute_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)
# Adapted from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot
class TFBlenderbotDecoderLayer(keras.layers.Layer):
    """Decoder block holding a self-attention layer, an encoder-decoder attention layer and a
    two-layer feed-forward network, each with its own layer normalization."""

    def __init__(self, config: BlenderbotConfig, **kwargs):
        """Create the sub-layers from `config`; extra keyword arguments go to `keras.layers.Layer`."""
        super().__init__(**kwargs)
        self.embed_dim = config.d_model
        self.self_attn = TFBlenderbotAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            name="self_attn",
            is_decoder=True,
        )
        self.dropout = keras.layers.Dropout(config.dropout)
        self.activation_fn = get_tf_activation(config.activation_function)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)

        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        self.encoder_attn = TFBlenderbotAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            name="encoder_attn",
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
        # Feed-forward network: d_model -> decoder_ffn_dim -> d_model.
        self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        encoder_hidden_states: tf.Tensor | None = None,
        encoder_attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        cross_attn_layer_head_mask: tf.Tensor | None = None,
        past_key_value: Tuple[tf.Tensor] | None = None,
        training: Optional[bool] = False,
    ):
        """
        Forward pass of the decoder layer (self-attention and encoder-decoder attention).

        NOTE(review): the implementation is missing from this excerpt. In the original block the
        parameter list was never closed and no body followed, which is a syntax error that also
        swallowed the `build` method below. The signature is closed here and this docstring is the
        entire body, so the method returns `None` until the real implementation is restored.
        """

    def build(self, input_shape=None):
        """Build every sub-layer under its own name scope; a second call is a no-op."""
        if self.built:
            return
        self.built = True

        model_width_shape = [None, None, self.embed_dim]
        # (attribute name, input shape given to `build`), in the original build order.
        build_plan = (
            ("self_attn", None),
            ("self_attn_layer_norm", model_width_shape),
            ("encoder_attn", None),
            ("encoder_attn_layer_norm", model_width_shape),
            ("fc1", model_width_shape),
            # fc2 consumes the output of fc1, whose width is decoder_ffn_dim.
            ("fc2", [None, None, self.config.decoder_ffn_dim]),
            ("final_layer_norm", model_width_shape),
        )
        for attribute_name, build_shape in build_plan:
            sublayer = getattr(self, attribute_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(build_shape)
# Base class of the TF Blenderbot models: a `TFPreTrainedModel` subclass that only pins the
# configuration class and the base-model prefix.
class TFBlenderbotPreTrainedModel(TFPreTrainedModel):
    # Configuration class used by this model family.
    config_class = BlenderbotConfig
    # Name prefix of the base model.
    base_model_prefix = "model"


# NOTE(review): in this excerpt the example below was orphaned inside the class body above. The
# assignment and opening quotes of the string were missing; only its tail and the closing `"""`
# remained, which made the file unparseable. The constant name is reconstructed and any leading
# lines of the example (e.g. creating `model` and `tokenizer`) are still missing; confirm both
# against the original file.
# NOTE(review): `BLENDERBOT_START_DOCSTRING`, used by the `add_start_docstrings` decorator on
# `TFBlenderbotModel`, is not defined anywhere in this excerpt either.
BLENDERBOT_GENERATION_EXAMPLE = r"""
    >>> # Print the human utterance.
    >>> UTTERANCE = "My friends are cool but they eat too many carbs."
    >>> print("Human: ", UTTERANCE)

    >>> # Tokenize the utterance and return TensorFlow tensors.
    >>> inputs = tokenizer([UTTERANCE], return_tensors="tf")

    >>> # Generate the ids of the reply with the pretrained model.
    >>> reply_ids = model.generate(**inputs)

    >>> # Decode and print the reply, skipping special tokens.
    >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])

    >>> # Print the human reply.
    >>> REPLY = "I'm not sure"
    >>> print("Human: ", REPLY)

    >>> # Build the next input from the conversation so far plus the new reply.
    >>> NEXT_UTTERANCE = (
    ...     "My friends are cool but they eat too many carbs.</s> <s>That's unfortunate. "
    ...     "Are they trying to lose weight or are they just trying to be healthier?</s> "
    ...     "<s> I'm not sure."
    ... )

    >>> # Tokenize the next input and return TensorFlow tensors.
    >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="tf")

    >>> # Generate the ids of the next reply.
    >>> next_reply_ids = model.generate(**inputs)

    >>> # Decode and print the next reply, skipping special tokens.
    >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
"""

# Documentation template for the inputs of the model `call` methods. `TFBlenderbotModel.call`
# formats it with a shape string and attaches it via `add_start_docstrings_to_model_forward`.
# NOTE(review): the text is empty in this excerpt.
BLENDERBOT_INPUTS_DOCSTRING = r"""
"""


@keras_serializable
class TFBlenderbotEncoder(keras.layers.Layer):
    config_class = BlenderbotConfig
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`TFBlenderbotEncoderLayer`].

    Args:
        config: BlenderbotConfig
    """

    def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        """Store the settings read from `config` and create the position embedding, the stack of
        encoder layers and the final layer normalization."""
        super().__init__(**kwargs)
        self.config = config
        self.dropout = keras.layers.Dropout(config.dropout)
        # Plain values copied from the config.
        self.layerdrop = config.encoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        # sqrt(d_model) when `config.scale_embedding` is set, otherwise 1.0.
        if config.scale_embedding:
            self.embed_scale = tf.math.sqrt(float(config.d_model))
        else:
            self.embed_scale = 1.0

        # Token embedding supplied by the caller (may be `None`).
        self.embed_tokens = embed_tokens
        self.embed_positions = TFBlenderbotLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            name="embed_positions",
        )
        # One encoder layer per `config.encoder_layers`, named "layers.<index>".
        self.layers = [TFBlenderbotEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
        self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")

    def get_embed_tokens(self):
        """Return the token embedding currently in use."""
        return self.embed_tokens

    def set_embed_tokens(self, embed_tokens):
        """Replace the token embedding."""
        self.embed_tokens = embed_tokens

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        inputs_embeds=None,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """
        Forward pass of the Transformer encoder.

        Args:
            input_ids: Input token ids.
            inputs_embeds: Embedded inputs, as an alternative to `input_ids`.
            attention_mask: Attention mask marking which positions should be attended to.
            head_mask: Mask for the attention heads.
            output_attentions: Whether to return the attention weights.
            output_hidden_states: Whether to return all hidden states.
            return_dict: Whether to return a dict-style output.
            training: Whether the layer runs in training mode.

        NOTE(review): the implementation is missing from this excerpt; this docstring is the entire
        body, so the method returns `None`.
        """

    def build(self, input_shape=None):
        """Build the position embedding, the final layer norm and every encoder layer, each under
        its own name scope; a second call is a no-op."""
        if self.built:
            return
        self.built = True

        position_embedding = getattr(self, "embed_positions", None)
        if position_embedding is not None:
            with tf.name_scope(position_embedding.name):
                position_embedding.build(None)

        final_norm = getattr(self, "layer_norm", None)
        if final_norm is not None:
            with tf.name_scope(final_norm.name):
                final_norm.build([None, None, self.config.d_model])

        encoder_layers = getattr(self, "layers", None)
        if encoder_layers is not None:
            for encoder_layer in encoder_layers:
                with tf.name_scope(encoder_layer.name):
                    encoder_layer.build(None)


@keras_serializable
class TFBlenderbotDecoder(keras.layers.Layer):
    config_class = BlenderbotConfig
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`]

    Args:
        config: BlenderbotConfig
        embed_tokens: output embedding
    """

    def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        """Store the settings read from `config` and create the position embedding, the stack of
        decoder layers, the final layer normalization and the dropout layer."""
        super().__init__(**kwargs)
        self.config = config
        self.padding_idx = config.pad_token_id
        # Token embedding supplied by the caller (may be `None`).
        self.embed_tokens = embed_tokens
        self.layerdrop = config.decoder_layerdrop
        self.embed_positions = TFBlenderbotLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            name="embed_positions",
        )
        # sqrt(d_model) when `config.scale_embedding` is set, otherwise 1.0.
        self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
        # One decoder layer per `config.decoder_layers`, named "layers.<index>".
        self.layers = [TFBlenderbotDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
        self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")

        self.dropout = keras.layers.Dropout(config.dropout)

    def get_embed_tokens(self):
        """Return the token embedding currently in use."""
        return self.embed_tokens

    def set_embed_tokens(self, embed_tokens):
        """Replace the token embedding."""
        self.embed_tokens = embed_tokens

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        inputs_embeds=None,
        attention_mask=None,
        position_ids=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """
        Forward pass of the Transformer decoder.

        NOTE(review): the implementation is missing from this excerpt. The original block consisted
        of the signature followed only by a comment, which is a syntax error. This docstring is the
        entire body, so the method returns `None` until the real implementation is restored;
        `TFBlenderbotMainLayer.call` reads `last_hidden_state`, `past_key_values`, `hidden_states`,
        `attentions` and `cross_attentions` from the result.
        """

    def build(self, input_shape=None):
        """Build the position embedding, the final layer norm and every decoder layer, each under
        its own name scope; a second call is a no-op."""
        if self.built:
            return
        self.built = True
        if getattr(self, "embed_positions", None) is not None:
            with tf.name_scope(self.embed_positions.name):
                self.embed_positions.build(None)
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.config.d_model])
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)
# Encoder-decoder core shared by the Blenderbot model classes; decorated as Keras-serializable.
@keras_serializable
class TFBlenderbotMainLayer(keras.layers.Layer):
    # Configuration class used by this layer.
    config_class = BlenderbotConfig

    def __init__(self, config: BlenderbotConfig, **kwargs):
        """Create the shared token embedding and the encoder/decoder that both receive it."""
        super().__init__(**kwargs)

        self.config = config

        # A single embedding table (vocab_size x d_model) used by encoder and decoder alike.
        embedding_init = keras.initializers.TruncatedNormal(stddev=self.config.init_std)
        self.shared = keras.layers.Embedding(
            input_dim=config.vocab_size,
            output_dim=config.d_model,
            embeddings_initializer=embedding_init,
            name="model.shared",
        )
        # Additional attribute giving the name scope the weights are expected under (load/store).
        self.shared.load_weight_prefix = "model.shared"

        self.encoder = TFBlenderbotEncoder(config, self.shared, name="encoder")
        self.decoder = TFBlenderbotDecoder(config, self.shared, name="decoder")

    def get_input_embeddings(self):
        """Return the shared token embedding."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Swap the shared token embedding and point the encoder and decoder at the new one."""
        self.shared = new_embeddings
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_position_ids=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
        **kwargs,
    ):
        """
        Run the encoder (unless `encoder_outputs` is supplied) and then the decoder.

        Supplied `encoder_outputs` are converted to match `return_dict`: a non-`TFBaseModelOutput`
        value is wrapped in one when `return_dict` is truthy, and a non-tuple value is turned into
        a tuple when it is falsy. The decoder attends over `encoder_outputs[0]` with
        `attention_mask` as its encoder attention mask. `**kwargs` is accepted but not used here.

        Returns:
            `decoder_outputs + encoder_outputs` when `return_dict` is falsy, otherwise a
            `TFSeq2SeqModelOutput` assembled from both.
        """
        # Fall back to the config default when the caller did not decide.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                training=training,
            )
        elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput):
            # The caller passed a plain sequence although a model output is wanted: wrap it.
            provided_count = len(encoder_outputs)
            encoder_outputs = TFBaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if provided_count > 1 else None,
                attentions=encoder_outputs[2] if provided_count > 2 else None,
            )
        elif not return_dict and not isinstance(encoder_outputs, tuple):
            # The caller passed a model output although a tuple is wanted: unwrap it.
            encoder_outputs = encoder_outputs.to_tuple()

        decoder_outputs = self.decoder(
            decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if return_dict:
            return TFSeq2SeqModelOutput(
                last_hidden_state=decoder_outputs.last_hidden_state,
                past_key_values=decoder_outputs.past_key_values,
                decoder_hidden_states=decoder_outputs.hidden_states,
                decoder_attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                encoder_hidden_states=encoder_outputs.hidden_states,
                encoder_attentions=encoder_outputs.attentions,
            )

        # Tuple mode: decoder outputs first, then the encoder outputs.
        return decoder_outputs + encoder_outputs

    def build(self, input_shape=None):
        """Build the shared embedding, the encoder and the decoder; a second call is a no-op."""
        if self.built:
            return
        self.built = True

        # The shared weights are expected in the model base namespace. A trailing "/" on the name
        # scope (but not a leading one!) places it in the root namespace instead of the current one.
        shared_scope = f"{self.shared.load_weight_prefix}/{self.shared.name}/"
        with tf.name_scope(shared_scope):
            self.shared.build(None)

        for stack_name in ("encoder", "decoder"):
            stack = getattr(self, stack_name, None)
            if stack is None:
                continue
            with tf.name_scope(stack.name):
                stack.build(None)
# 添加模型的文档字符串，说明这是一个输出原始隐藏状态的 BLENDERBOT 模型，没有特定的输出头部分
@add_start_docstrings(
    "The bare BLENDERBOT Model outputting raw hidden-states without any specific head on top.",
    BLENDERBOT_START_DOCSTRING,
)
class TFBlenderbotModel(TFBlenderbotPreTrainedModel):
    # NOTE: no class docstring is written here on purpose; the decorator above composes the
    # class documentation from the strings it is given.

    def __init__(self, config: BlenderbotConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # The whole encoder/decoder stack lives in one main layer registered under the name "model".
        self.model = TFBlenderbotMainLayer(config, name="model")

    def get_encoder(self):
        """Return the encoder held by the wrapped main layer."""
        return self.model.encoder

    def get_decoder(self):
        """Return the decoder held by the wrapped main layer."""
        return self.model.decoder

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
        """Load a pretrained model, redirecting the deprecated `facebook/blenderbot-90M` checkpoint.

        Any other name or path is handed to the parent implementation together with the extra
        positional and keyword arguments. For the deprecated checkpoint a `FutureWarning` is
        emitted and `TFBlenderbotSmallModel.from_pretrained` is called with the checkpoint name
        only (the extra arguments are not forwarded on that path).
        """
        if pretrained_model_name_or_path != "facebook/blenderbot-90M":
            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # Imported lazily, only when the deprecated checkpoint is actually requested.
        from ..blenderbot_small import TFBlenderbotSmallModel

        deprecation_message = (
            "The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical"
            " checkpoint `facebook/small_blenderbot-90M` with"
            " `TFBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')`"
            " instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return TFBlenderbotSmallModel.from_pretrained(pretrained_model_name_or_path)

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSeq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        decoder_input_ids: tf.Tensor | None = None,
        decoder_attention_mask: tf.Tensor | None = None,
        decoder_position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        decoder_head_mask: tf.Tensor | None = None,
        cross_attn_head_mask: tf.Tensor | None = None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values: List[tf.Tensor] | None = None,
        inputs_embeds: tf.Tensor | None = None,
        decoder_inputs_embeds: tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]:
        # Pure pass-through: every named argument is forwarded by keyword to the main layer and
        # its result is returned untouched. The extra **kwargs are accepted but not forwarded.
        forwarded = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return self.model(**forwarded)

    # Mirrors transformers.models.bart.modeling_tf_bart.TFBartModel.serving_output
    def serving_output(self, output):
        """Repackage `output` as a `TFSeq2SeqModelOutput`, keeping optional fields only when enabled.

        `past_key_values` is kept when `config.use_cache` is set, hidden states when
        `config.output_hidden_states` is set, and attention tensors when
        `config.output_attentions` is set; each disabled field is returned as `None`.
        """
        config = self.config

        pkv = None
        if config.use_cache:
            pkv = tf.tuple(output.past_key_values)[1]

        dec_hs = None
        if config.output_hidden_states:
            dec_hs = tf.convert_to_tensor(output.decoder_hidden_states)

        dec_attns = None
        if config.output_attentions:
            dec_attns = tf.convert_to_tensor(output.decoder_attentions)

        cross_attns = None
        if config.output_attentions:
            cross_attns = tf.convert_to_tensor(output.cross_attentions)

        enc_hs = None
        if config.output_hidden_states:
            enc_hs = tf.convert_to_tensor(output.encoder_hidden_states)

        enc_attns = None
        if config.output_attentions:
            enc_attns = tf.convert_to_tensor(output.encoder_attentions)

        return TFSeq2SeqModelOutput(
            last_hidden_state=output.last_hidden_state,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
        )

    def build(self, input_shape=None):
        """Build the wrapped main layer once, under a name scope carrying its own name."""
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "model", None)
        if main_layer is None:
            return
        with tf.name_scope(main_layer.name):
            main_layer.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(keras.layers.Layer):
    """
    A bias vector wrapped in its own layer. This exists for serialization: `keras.Model.save_weights` stores
    weights layer by layer, so every weight has to be registered in a layer.
    """

    def __init__(self, shape, initializer, trainable, name, **kwargs):
        super().__init__(name=name, **kwargs)
        # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of
        # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see:
        # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
        weight_spec = dict(name=name, shape=shape, initializer=initializer, trainable=trainable)
        # The weight reuses the layer's own name.
        self.bias = self.add_weight(**weight_spec)

    def call(self, x):
        """Return `x` with the stored bias added to it."""
        shifted = x + self.bias
        return shifted


@add_start_docstrings(
    "The BLENDERBOT Model with a language modeling head. Can be used for summarization.",
    BLENDERBOT_START_DOCSTRING,
)
class TFBlenderbotForConditionalGeneration(TFBlenderbotPreTrainedModel, TFCausalLanguageModelingLoss):
    # NOTE: no class docstring is written here on purpose; the decorator above composes the
    # class documentation from the strings it is given.
    _keys_to_ignore_on_load_unexpected = [
        r"model.encoder.embed_tokens.weight",
        r"model.decoder.embed_tokens.weight",
    ]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Core encoder/decoder stack, registered under the name "model".
        self.model = TFBlenderbotMainLayer(config, name="model")
        # Cache preference copied from the configuration.
        self.use_cache = config.use_cache
        # Non-trainable, zero-initialised bias of shape [1, vocab_size] that is added to the logits.
        # It lives in its own layer so that it is saved and loaded with the other weights.
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
        )

    def get_decoder(self):
        """Return the decoder held by the wrapped main layer."""
        return self.model.decoder

    def get_encoder(self):
        """Return the encoder held by the wrapped main layer."""
        return self.model.encoder

    def get_output_embeddings(self):
        """Return the output embeddings, which are the input embeddings."""
        return self.get_input_embeddings()

    def set_output_embeddings(self, value):
        """Set the output embeddings by setting the input embeddings."""
        self.set_input_embeddings(value)

    def get_bias(self):
        """Return the logits bias as a one-entry dict keyed by "final_logits_bias"."""
        return {"final_logits_bias": self.bias_layer.bias}

    def set_bias(self, value):
        """Replace the bias layer with a fresh one sized and filled from `value["final_logits_bias"]`."""
        new_bias = value["final_logits_bias"]
        # A new layer is created (rather than reusing the old one) so the vocabulary size may change.
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, new_bias.shape[-1]], initializer="zeros", trainable=False
        )
        self.bias_layer.bias.assign(new_bias)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
        """Load a pretrained model, redirecting the deprecated `facebook/blenderbot-90M` checkpoint.

        Any other name or path is handed to the parent implementation together with the extra
        positional and keyword arguments. For the deprecated checkpoint a `FutureWarning` is
        emitted and `TFBlenderbotSmallForConditionalGeneration.from_pretrained` is called with
        the checkpoint name only (the extra arguments are not forwarded on that path).
        """
        if pretrained_model_name_or_path != "facebook/blenderbot-90M":
            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # Imported lazily, only when the deprecated checkpoint is actually requested.
        from ..blenderbot_small import TFBlenderbotSmallForConditionalGeneration

        deprecation_message = (
            "The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical"
            " checkpoint `facebook/small_blenderbot-90M` with"
            " `TFBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')`"
            " instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        return TFBlenderbotSmallForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(BLENDERBOT_GENERATION_EXAMPLE)
    def call(
        self,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        decoder_input_ids: tf.Tensor | None = None,
        decoder_attention_mask: tf.Tensor | None = None,
        decoder_position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        decoder_head_mask: tf.Tensor | None = None,
        cross_attn_head_mask: tf.Tensor | None = None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values: List[tf.Tensor] | None = None,
        inputs_embeds: tf.Tensor | None = None,
        decoder_inputs_embeds: tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple[tf.Tensor], TFSeq2SeqLMOutput]:
        r"""
        labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        """
        if labels is not None:
            pad_id = self.config.pad_token_id
            # Positions holding the padding id are rewritten to -100; all other labels are kept.
            is_padding = labels == pad_id
            ignore_value = tf.cast(tf.fill(shape_list(labels), -100), labels.dtype)
            labels = tf.where(is_padding, ignore_value, labels)
            # Caching is switched off whenever labels are supplied.
            use_cache = False
            # Without any decoder input, derive the decoder ids by shifting the labels to the right.
            no_decoder_input = decoder_input_ids is None and decoder_inputs_embeds is None
            if no_decoder_input:
                decoder_input_ids = shift_tokens_right(labels, pad_id, self.config.decoder_start_token_id)

        # Forward pass through the encoder/decoder stack.
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Logits: the first model output multiplied by the transposed shared weights, plus the bias.
        projected = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True)
        lm_logits = self.bias_layer(projected)

        # The loss is only computed when labels were supplied.
        masked_lm_loss = None
        if labels is not None:
            masked_lm_loss = self.hf_compute_loss(labels, lm_logits)

        if not return_dict:
            # Tuple form: the logits replace the first model output; the loss, when present, goes first.
            tuple_output = (lm_logits,) + outputs[1:]
            if masked_lm_loss is None:
                return tuple_output
            return (masked_lm_loss,) + tuple_output

        return TFSeq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,  # index 1 of d outputs
            decoder_hidden_states=outputs.decoder_hidden_states,  # index 2 of d outputs
            decoder_attentions=outputs.decoder_attentions,  # index 3 of d outputs
            cross_attentions=outputs.cross_attentions,  # index 4 of d outputs
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,  # index 0 of encoder outputs
            encoder_hidden_states=outputs.encoder_hidden_states,  # index 1 of e outputs
            encoder_attentions=outputs.encoder_attentions,  # index 2 of e outputs
        )

    # Mirrors transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.serving_output
    def serving_output(self, output):
        """Repackage `output` as a `TFSeq2SeqLMOutput`, keeping optional fields only when enabled.

        `past_key_values` is kept when `config.use_cache` is set, hidden states when
        `config.output_hidden_states` is set, and attention tensors when
        `config.output_attentions` is set; each disabled field is returned as `None`.
        """
        config = self.config

        pkv = None
        if config.use_cache:
            pkv = tf.tuple(output.past_key_values)[1]

        dec_hs = None
        if config.output_hidden_states:
            dec_hs = tf.convert_to_tensor(output.decoder_hidden_states)

        dec_attns = None
        if config.output_attentions:
            dec_attns = tf.convert_to_tensor(output.decoder_attentions)

        cross_attns = None
        if config.output_attentions:
            cross_attns = tf.convert_to_tensor(output.cross_attentions)

        enc_hs = None
        if config.output_hidden_states:
            enc_hs = tf.convert_to_tensor(output.encoder_hidden_states)

        enc_attns = None
        if config.output_attentions:
            enc_attns = tf.convert_to_tensor(output.encoder_attentions)

        return TFSeq2SeqLMOutput(
            logits=output.logits,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
        )

    # Mirrors TFBartForConditionalGeneration.prepare_inputs_for_generation
    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """Assemble the keyword arguments for one generation step.

        Returns a dict with the (possibly trimmed) decoder input ids, the decoder position ids
        derived below, and the remaining arguments passed through unchanged. `input_ids` is
        always `None` in the result.
        """
        has_past = past_key_values is not None

        # With cached key/values, only the most recent decoder token is kept.
        if has_past:
            decoder_input_ids = decoder_input_ids[:, -1:]

        if decoder_attention_mask is not None:  # xla
            # Exclusive cumulative sum of the mask; only the last position is used.
            decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:]
        elif has_past:  # no xla + past_key_values
            # Size of axis 2 of the first cached tensor.
            decoder_position_ids = past_key_values[0][0].shape[2]
        else:  # no xla + no past_key_values
            decoder_position_ids = tf.range(decoder_input_ids.shape[1])

        return dict(
            input_ids=None,  # encoder_outputs is defined. input_ids not needed
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,  # change this to avoid caching (presumably for debugging)
        )

    def build(self, input_shape=None):
        """Build the main layer and then the bias layer, once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # Same order as the attributes are created in __init__; absent attributes are skipped.
        for attr_name in ("model", "bias_layer"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)

`.\models\blenderbot\tokenization_blenderbot.py`

# coding=utf-8
# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for Blenderbot."""

import json
import os
from functools import lru_cache
from typing import List, Optional, Tuple

import regex as re  # 引入 regex 库，用于处理正则表达式

from ...tokenization_utils import AddedToken, PreTrainedTokenizer  # 导入自定义的 Token 和预训练 Tokenizer
from ...utils import logging  # 导入日志模块

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

# File names used for each tokenizer resource.
VOCAB_FILES_NAMES = dict(
    vocab_file="vocab.json",
    merges_file="merges.txt",
    tokenizer_config_file="tokenizer_config.json",
)

# Per-resource map from checkpoint name to the URL of that resource.
PRETRAINED_VOCAB_FILES_MAP = dict(
    vocab_file={"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
    merges_file={"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
    tokenizer_config_file={
        "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
    },
)

# Positional-embedding size per checkpoint name.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}

@lru_cache()
def bytes_to_unicode():
    """
    Build the lookup table from each of the 256 byte values to a single unicode character.

    Bytes inside the three ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the same code point.
    Every other byte (whitespace and control characters among them, which the bpe code cannot handle) is mapped,
    in increasing byte order, to the code points 256, 257, ... so that each byte still gets its own character.

    The reversible bpe codes work on unicode strings, so without such a table a vocabulary would need a large
    number of unicode characters to avoid UNKs.

    The result is memoised by `lru_cache`, so every call returns the same dict object.
    """
    # Bytes that keep their own code point, in the order they appear in the table.
    kept_bytes = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    kept_lookup = set(kept_bytes)

    # All remaining bytes, ascending; the k-th of them is shifted to code point 256 + k.
    shifted_bytes = [byte for byte in range(2**8) if byte not in kept_lookup]

    byte_values = kept_bytes + shifted_bytes
    code_points = kept_bytes + [2**8 + offset for offset in range(len(shifted_bytes))]
    return {byte: chr(code_point) for byte, code_point in zip(byte_values, code_points)}


# 从 transformers.models.roberta.tokenization_roberta 中复制，用于获取单词中的符号对
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings); any sliceable sequence,
    including a plain string, works as well.

    Args:
        word: Sequence of symbols.

    Returns:
        The set of `(left, right)` tuples for every two adjacent symbols. A word with fewer than two symbols,
        including an empty one, yields an empty set instead of raising `IndexError`.
    """
    # zip stops at the shorter argument, so empty and one-symbol words produce no pairs.
    return set(zip(word, word[1:]))
# 定义 BlenderbotTokenizer 类，继承自 PreTrainedTokenizer
class BlenderbotTokenizer(PreTrainedTokenizer):
    """
    Constructs a Blenderbot tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```
    >>> from transformers import BlenderbotTokenizer

    >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
    >>> tokenizer.add_prefix_space = False
    >>> tokenizer("Hello world")["input_ids"]
    [47, 921, 86, 1085, 2]

    >>> tokenizer(" Hello world")["input_ids"]
    [6950, 1085, 2]
    ```

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Specifies the error handling scheme to use for decoding bytes to UTF-8.
            See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for details.

        bos_token (`str`, *optional*, defaults to `"<s>"`):
            Beginning of sequence token used during pretraining. Often employed as a sequence classifier token.

            <Tip>
            This token is not typically used as the beginning of sequence when special tokens are employed. 
            Instead, the `cls_token` is used.
            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            End of sequence token.

            <Tip>
            When constructing sequences with special tokens, this is not used as the end of sequence.
            The `sep_token` is used instead.
            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            Separator token used for constructing sequences from multiple sources, 
            such as for sequence classification or question answering.

        cls_token (`str`, *optional*, defaults to `"<s>"`):
            Classifier token used in sequence classification tasks. It is the first token in the sequence when using special tokens.

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            Token representing unknown words or tokens not in the vocabulary.

        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            Token used for padding sequences to equal lengths during batching.

        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            Token used during masked language modeling, indicating positions where the model will predict.

        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Indicates whether to add an initial space to the input, treating the leading word like any other word.

    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    # 从transformers.models.roberta.tokenization_roberta.RobertaTokenizer.__init__中复制而来，用于初始化Blenderbot的Tokenizer类
    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=False,
        **kwargs,
    ):
        """Load the vocabulary and merge rules from disk and set up the byte-level BPE state.

        Special tokens given as plain strings are wrapped in `AddedToken`; other values are used
        as they are. All special tokens, `errors`, `add_prefix_space` and any extra keyword
        arguments are then handed to the parent initializer.
        """

        def _wrap(token):
            # Plain strings become AddedToken objects with lstrip=False and rstrip=False.
            if isinstance(token, str):
                return AddedToken(token, lstrip=False, rstrip=False)
            return token

        bos_token = _wrap(bos_token)
        pad_token = _wrap(pad_token)
        eos_token = _wrap(eos_token)
        unk_token = _wrap(unk_token)
        sep_token = _wrap(sep_token)
        cls_token = _wrap(cls_token)

        # Mask token behave like a normal word, i.e. include the space before it
        if isinstance(mask_token, str):
            mask_token = AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)

        # token -> id, read from the JSON vocabulary file, and the reverse id -> token table.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {token_id: token for token, token_id in self.encoder.items()}

        # Error handling scheme used when decoding bytes back to UTF-8.
        self.errors = errors

        # byte -> unicode character table and its inverse.
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {char: byte for byte, char in self.byte_encoder.items()}

        # Merge rules: the first line of the file and the last element of the split are dropped.
        with open(merges_file, encoding="utf-8") as merges_handle:
            merge_lines = merges_handle.read().split("\n")[1:-1]
        # Each rule (a tuple of its whitespace-separated parts) is ranked by its position in the file.
        self.bpe_ranks = {tuple(line.split()): rank for rank, line in enumerate(merge_lines)}

        # Memo of already computed bpe() results.
        self.cache = {}
        self.add_prefix_space = add_prefix_space

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        super().__init__(
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
    # 返回当前词汇表的大小，即编码器的长度
    def vocab_size(self):
        return len(self.encoder)

    # 从Blenderbot的词汇表中获取完整的词汇表，包括添加的特殊标记
    def get_vocab(self):
        """Return a new dict holding the entries of `self.encoder` overlaid with `self.added_tokens_encoder`.

        Neither source mapping is modified; on a key clash the added-token entry wins.
        """
        return {**self.encoder, **self.added_tokens_encoder}

    # 根据Blenderbot的BPE算法处理给定的token，返回处理后的字符串
    def bpe(self, token):
        """Apply the merge rules in `self.bpe_ranks` to `token` and return its symbols joined by spaces.

        Results are memoised in `self.cache`. A token that has no adjacent symbol pairs is returned
        unchanged and is not cached.
        """
        try:
            return self.cache[token]
        except KeyError:
            pass

        symbols = tuple(token)
        candidate_pairs = get_pairs(symbols)
        if not candidate_pairs:
            return token

        ranks = self.bpe_ranks
        while True:
            # Pick the pair with the smallest rank; pairs without a rule count as infinitely large.
            best = min(candidate_pairs, key=lambda pair: ranks.get(pair, float("inf")))
            if best not in ranks:
                # No remaining pair has a merge rule.
                break
            first, second = best

            # Rebuild the word left to right, fusing each non-overlapping `first, second` occurrence.
            merged = []
            pos = 0
            total = len(symbols)
            while pos < total:
                if pos + 1 < total and symbols[pos] == first and symbols[pos + 1] == second:
                    merged.append(first + second)
                    pos += 2
                else:
                    merged.append(symbols[pos])
                    pos += 1

            symbols = tuple(merged)
            if len(symbols) == 1:
                break
            candidate_pairs = get_pairs(symbols)

        result = " ".join(symbols)
        self.cache[token] = result
        return result

    # 使用Blenderbot的BPE算法对给定的文本进行分词，返回分词后的结果
    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        # 使用正则表达式找到所有匹配的token，并逐个处理
        for token in re.findall(self.pat, text):
            # 将token编码成字节，并通过Blenderbot的字节编码器映射成unicode字符串
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            # 使用Blenderbot的BPE算法对编码后的token进行分词，将分词结果添加到bpe_tokens列表中
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    # 将给定的token转换为其在Blenderbot词汇表中的ID，如果token不存在，则使用未知标记的ID
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    # Convert a vocabulary id back to its token string; an id that is not in the vocabulary yields None.
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)
    # 从transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string复制而来，将Roberta->Blenderbot，RoBERTa->Blenderbot
    def convert_tokens_to_string(self, tokens):
        """Convert a sequence of tokens (strings) into a single string.

        The tokens are concatenated, every character is mapped to a byte value through `self.byte_decoder`,
        and the byte sequence is decoded as UTF-8 using the error policy in `self.errors`.
        """
        raw_bytes = bytearray(self.byte_decoder[char] for char in "".join(tokens))
        return raw_bytes.decode("utf-8", errors=self.errors)

    # 从transformers.models.roberta.tokenization_roberta.RobertaTokenizer.save_vocabulary复制而来，将Roberta->Blenderbot，RoBERTa->Blenderbot
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary (as JSON) and the BPE merge list (as text) into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory that receives both files.
            filename_prefix (`str`, *optional*):
                When truthy, prepended (followed by `-`) to both file names.

        Returns:
            `Tuple[str]`: The `(vocab_file, merge_file)` paths. When `save_directory` is not a directory an error is
            logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"词汇表路径 ({save_directory}) 应为一个目录")
            return

        name_prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, name_prefix + VOCAB_FILES_NAMES["vocab_file"])
        merge_file = os.path.join(save_directory, name_prefix + VOCAB_FILES_NAMES["merges_file"])

        # Vocabulary: one JSON document with sorted keys, non-ASCII characters kept verbatim.
        with open(vocab_file, "w", encoding="utf-8") as vocab_handle:
            serialized_vocab = json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False)
            vocab_handle.write(serialized_vocab + "\n")

        # Merges: a version header, then one space-joined merge per line in ascending rank order.
        with open(merge_file, "w", encoding="utf-8") as merges_handle:
            merges_handle.write("#version: 0.2\n")
            expected_rank = 0
            for merge_pair, rank in sorted(self.bpe_ranks.items(), key=lambda item: item[1]):
                if rank != expected_rank:
                    # A gap in the ranks is only reported; writing continues from the observed rank.
                    logger.warning(
                        f"保存词汇到 {merge_file}: BPE合并索引不是连续的。请确保分词器未损坏！"
                    )
                    expected_rank = rank
                merges_handle.write(" ".join(merge_pair) + "\n")
                expected_rank += 1

        return vocab_file, merge_file

    # 从transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask复制而来，将Roberta->Blenderbot，RoBERTa->Blenderbot
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # Inputs that already carry special tokens are handled by the parent implementation.
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # NOTE(review): the mask assumes one special token on each side of every sequence, while
        # `build_inputs_with_special_tokens` in this class only appends EOS - confirm this is intended.
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        # Pair layout: special, first sequence, two specials, second sequence, special.
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences with Roberta->Blenderbot, RoBERTa->Blenderbot
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build the token-type-id list for one sequence or a pair of sequences. Blenderbot does not make use of token
        type ids, so the result contains only zeros.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros, as long as the sequence(s) plus the surrounding special tokens.
        """
        sep_ids = [self.sep_token_id]
        cls_ids = [self.cls_token_id]

        # Lay out the ids as they would appear with special tokens; only the resulting length matters.
        layout = cls_ids + token_ids_0 + sep_ids
        if token_ids_1 is not None:
            layout = layout + sep_ids + token_ids_1 + sep_ids
        return [0] * len(layout)

    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.prepare_for_tokenization with Roberta->Blenderbot, RoBERTa->Blenderbot
    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """
        Optionally prepend a single space to `text` before it is tokenized.

        Args:
            text (`str`): The input text to be tokenized.
            is_split_into_words (`bool`, *optional*): Whether the text is already split into words.
            **kwargs: Extra keyword arguments; `add_prefix_space` is consumed here and overrides
                `self.add_prefix_space` for this call.

        Returns:
            `Tuple[str, Dict]`: The (possibly prefixed) text and the keyword arguments that were not consumed.
        """
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)

        # Nothing to do unless a prefix space was requested one way or the other.
        if not (is_split_into_words or add_prefix_space):
            return (text, kwargs)

        # Empty text and text that already starts with whitespace are left untouched.
        if len(text) > 0 and not text[0].isspace():
            text = " " + text
        return (text, kwargs)
    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
        """
        Build model inputs from a sequence by appending the end-of-sequence token. A Blenderbot sequence has the
        following format:
        - single sequence: ` X </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (`List[int]`, *optional*):
                Will be ignored
        Returns:
            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # A new list is returned; `token_ids_0` itself is not modified.
        eos_suffix = [self.eos_token_id]
        return token_ids_0 + eos_suffix
    
    @property
    def default_chat_template(self):
        """
        A very simple chat template that just adds whitespace between messages.
        """
        owner_name = self.__class__.__name__
        notice = (
            "\nNo chat template is defined for this tokenizer - using the default template "
            f"for the {owner_name} class. If the default is not appropriate for "
            "your model, please set `tokenizer.chat_template` to an appropriate template. "
            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
        )
        # Emitted through `warning_once` every time the default template is requested.
        logger.warning_once(notice)

        # Jinja template: a leading space before user messages, two spaces between messages, EOS at the end.
        template = (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
            "{{ message['content'] }}"
            "{% if not loop.last %}{{ '  ' }}{% endif %}"
            "{% endfor %}"
            "{{ eos_token }}"
        )
        return template

`.\models\blenderbot\tokenization_blenderbot_fast.py`

# 引入必要的模块和库
import json  # 用于处理 JSON 格式的数据
from typing import List, Optional, Tuple  # 引入类型提示相关的模块

from tokenizers import pre_tokenizers, processors  # 从 tokenizers 库引入预处理器和处理器

from ...tokenization_utils_base import AddedToken, BatchEncoding  # 从本地模块引入相应的类和函数
from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 从本地模块引入 PreTrainedTokenizerFast 类
from ...utils import logging  # 从本地模块引入日志记录器
from .tokenization_blenderbot import BlenderbotTokenizer  # 从当前目录的 tokenization_blenderbot 模块引入 BlenderbotTokenizer 类

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# File names used for each tokenizer resource (vocabulary, BPE merges, tokenizer config).
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_config_file": "tokenizer_config.json",
}

# For each resource kind, maps a pretrained checkpoint name to the URL of that resource.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
    "merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
    "tokenizer_config_file": {
        "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
    },
}

# Positional-embedding size per pretrained checkpoint; used below as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}


class BlenderbotTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2
    tokenizer, using byte-level Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word
    may be encoded differently depending on whether it is at the beginning of the sentence (without space) or not:

    ```
    >>> from transformers import BlenderbotTokenizerFast

    >>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")
    >>> tokenizer("Hello world")["input_ids"]
    [6950, 1085, 2]

    >>> tokenizer(" Hello world")["input_ids"]
    [6950, 1085, 2]
    ```

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """

    # Resource file names, per-checkpoint resource URLs and per-checkpoint maximum input sizes.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    # Names of the inputs produced for the model.
    model_input_names = ["input_ids", "attention_mask"]

    # Slow (pure Python) counterpart of this tokenizer.
    slow_tokenizer_class = BlenderbotTokenizer

    # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.__init__ with Roberta->Blenderbot, RoBERTa->Blenderbot
    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs,
    ):
        """
        Initialize the tokenizer and align the backend pre-tokenizer and post-processor with the given options.

        Args:
            vocab_file, merges_file, tokenizer_file:
                Forwarded unchanged to the parent constructor.
            errors (`str`, *optional*, defaults to `"replace"`):
                Forwarded unchanged to the parent constructor.
            bos_token, eos_token, sep_token, cls_token, unk_token, pad_token:
                Special tokens, forwarded unchanged to the parent constructor.
            mask_token (`str` or `AddedToken`, *optional*, defaults to `"<mask>"`):
                A plain string is wrapped in an `AddedToken` with `lstrip=True`, `rstrip=False`, `normalized=False`.
            add_prefix_space (`bool`, *optional*, defaults to `False`):
                Stored on the instance and written into the backend pre-tokenizer / post-processor state when that
                state disagrees with it.
            trim_offsets (`bool`, *optional*, defaults to `True`):
                Written into the backend post-processor state when that state disagrees with it.
        """
        # The mask token behaves like a normal word, i.e. it absorbs the space before it.
        mask_token = (
            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
            if isinstance(mask_token, str)
            else mask_token
        )
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            trim_offsets=trim_offsets,
            **kwargs,
        )

        # Rebuild the backend pre-tokenizer when its serialized `add_prefix_space` differs from the argument.
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        self.add_prefix_space = add_prefix_space

        tokenizer_component = "post_processor"
        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
        if tokenizer_component_instance:
            state = json.loads(tokenizer_component_instance.__getstate__())

            # JSON turns "sep" and "cls" into lists; they are converted back to tuples before the
            # post-processor is rebuilt from this state.
            if "sep" in state:
                state["sep"] = tuple(state["sep"])
            if "cls" in state:
                state["cls"] = tuple(state["cls"])

            changes_to_apply = False

            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
                state["add_prefix_space"] = add_prefix_space
                changes_to_apply = True

            if state.get("trim_offsets", trim_offsets) != trim_offsets:
                state["trim_offsets"] = trim_offsets
                changes_to_apply = True

            # Only replace the post-processor when at least one option actually changed.
            if changes_to_apply:
                component_class = getattr(processors, state.pop("type"))
                new_value = component_class(**state)
                setattr(self.backend_tokenizer, tokenizer_component, new_value)

    # `@property` is required here: without it the `@mask_token.setter` decorator below fails with
    # AttributeError while the class body is executed.
    @property
    def mask_token(self) -> str:
        """
        `str`: Mask token, to use when training a model with masked-language modeling. Logs an error (when
        `self.verbose` is set) and returns `None` if used while not having been set.

        The Blenderbot tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will
        greedily include the space before the *<mask>*.
        """
        if self._mask_token is None:
            if self.verbose:
                logger.error("Using mask_token, but it is not set yet.")
            return None
        return str(self._mask_token)

    @mask_token.setter
    def mask_token(self, value):
        """
        Overriding the default behavior of the mask token to have it eat the space before it.

        This is needed to preserve backward compatibility with all the previously used models based on Roberta.
        """
        # Mask token behaves like a normal word, i.e. includes the space before it,
        # so `lstrip` is set to True.
        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
        self._mask_token = value

    # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast._batch_encode_plus with Roberta->Blenderbot, RoBERTa->Blenderbot
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """Check that pre-tokenized input is only used with `add_prefix_space=True`, then defer to the parent."""
        is_split_into_words = kwargs.get("is_split_into_words", False)
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        return super()._batch_encode_plus(*args, **kwargs)

    # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast._encode_plus with Roberta->Blenderbot, RoBERTa->Blenderbot
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """Check that pre-tokenized input is only used with `add_prefix_space=True`, then defer to the parent."""
        is_split_into_words = kwargs.get("is_split_into_words", False)

        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )

        return super()._encode_plus(*args, **kwargs)

    # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.save_vocabulary with Roberta->Blenderbot, RoBERTa->Blenderbot
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Save the backend model's files into `save_directory` and return the written file names as a tuple."""
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

    # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.create_token_type_ids_from_sequences with Roberta->Blenderbot, RoBERTa->Blenderbot
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Single sequence: one special token on each side.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        # Pair of sequences: two separators between them and one special token on each outer side.
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A Blenderbot sequence has the following format:
        - single sequence: ` X </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (`List[int]`, *optional*):
                Will be ignored
        Returns:
            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # Only the end-of-sequence token is appended; `token_ids_1` is not used.
        return token_ids_0 + [self.eos_token_id]

    @property
    # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template
    def default_chat_template(self):
        """
        A very simple chat template that just adds whitespace between messages.
        """
        # Emitted through `warning_once` every time the default template is requested.
        logger.warning_once(
            "\nNo chat template is defined for this tokenizer - using the default template "
            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
            "your model, please set `tokenizer.chat_template` to an appropriate template. "
            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
        )
        # Jinja template: a leading space before user messages, two spaces between messages, EOS at the end.
        return (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
            "{{ message['content'] }}"
            "{% if not loop.last %}{{ '  ' }}{% endif %}"
            "{% endfor %}"
            "{{ eos_token }}"
        )

`.\models\blenderbot\__init__.py`

# 导入类型检查工具，用于检查类型是否存在
from typing import TYPE_CHECKING

# 导入依赖的模块和异常类
# _LazyModule: 惰性加载模块
# is_flax_available: 检查是否存在Flax库
# is_tf_available: 检查是否存在TensorFlow库
# is_tokenizers_available: 检查是否存在Tokenizers库
# is_torch_available: 检查是否存在PyTorch库
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Maps each submodule name to the public names it provides; passed to `_LazyModule` at the end of this file.
_import_structure = {
    "configuration_blenderbot": [
        "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "BlenderbotConfig",
        "BlenderbotOnnxConfig",
    ],
    "tokenization_blenderbot": ["BlenderbotTokenizer"],
}

# Register the fast tokenizer only when the `tokenizers` library is available.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # `tokenizers` is available: expose the fast tokenizer module.
    _import_structure["tokenization_blenderbot_fast"] = ["BlenderbotTokenizerFast"]

# Register the PyTorch models only when PyTorch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # PyTorch is available: expose the PyTorch modeling module.
    _import_structure["modeling_blenderbot"] = [
        "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "BlenderbotForCausalLM",
        "BlenderbotForConditionalGeneration",
        "BlenderbotModel",
        "BlenderbotPreTrainedModel",
    ]

# Register the TensorFlow models only when TensorFlow is available.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # TensorFlow is available: expose the TensorFlow modeling module.
    _import_structure["modeling_tf_blenderbot"] = [
        "TFBlenderbotForConditionalGeneration",
        "TFBlenderbotModel",
        "TFBlenderbotPreTrainedModel",
    ]

# Register the Flax models only when Flax is available.
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Flax is available: expose the Flax modeling module.
    _import_structure["modeling_flax_blenderbot"] = [
        "FlaxBlenderbotForConditionalGeneration",
        "FlaxBlenderbotModel",
        "FlaxBlenderbotPreTrainedModel",
    ]

# Under static type checking, import everything eagerly so the names resolve for the checker.
if TYPE_CHECKING:
    # Configuration objects are always importable.
    from .configuration_blenderbot import (
        BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BlenderbotConfig,
        BlenderbotOnnxConfig,
    )
    # The slow tokenizer is always importable.
    from .tokenization_blenderbot import BlenderbotTokenizer

    # Fast tokenizer: imported only when the `tokenizers` library is available.
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # `tokenizers` is available.
        from .tokenization_blenderbot_fast import BlenderbotTokenizerFast

    # PyTorch models: imported only when PyTorch is available.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    # Reached when no exception was raised, i.e. PyTorch is available.
    else:
        from .modeling_blenderbot import (
            BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST,
            BlenderbotForCausalLM,
            BlenderbotForConditionalGeneration,
            BlenderbotModel,
            BlenderbotPreTrainedModel,
        )

    try:
        # TensorFlow models: imported only when TensorFlow is available.
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # TensorFlow is missing: skip these imports and carry on.
        pass
    else:
        # TensorFlow is available.
        from .modeling_tf_blenderbot import (
            TFBlenderbotForConditionalGeneration,
            TFBlenderbotModel,
            TFBlenderbotPreTrainedModel,
        )

    try:
        # Flax models: imported only when Flax is available.
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # Flax is missing: skip these imports and carry on.
        pass
    else:
        # Flax is available.
        from .modeling_flax_blenderbot import (
            FlaxBlenderbotForConditionalGeneration,
            FlaxBlenderbotModel,
            FlaxBlenderbotPreTrainedModel,
        )
else:
    # At runtime nothing is imported eagerly.
    import sys
    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\blenderbot_small\configuration_blenderbot_small.py`

"""
BlenderbotSmall model configuration

This module defines the configuration class `BlenderbotSmallConfig` for the BlenderbotSmall model.
It specifies how the model should be instantiated and configured. It inherits from `PretrainedConfig`
and provides defaults similar to the `facebook/blenderbot_small-90M` architecture.

Example:

>>> from transformers import BlenderbotSmallConfig, BlenderbotSmallModel

>>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
>>> configuration = BlenderbotSmallConfig()

>>> # Initializing a model (with random weights) from the facebook/blenderbot_small-90M style configuration
>>> model = BlenderbotSmallModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
"""

from collections import OrderedDict  # 导入有序字典类
from typing import Any, Mapping, Optional  # 导入类型提示相关的类和函数

from ... import PreTrainedTokenizer  # 导入预训练标记器类
from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...file_utils import TensorType, is_torch_available  # 导入文件工具类和检查是否有torch可用的函数
from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast  # 导入ONNX相关配置类
from ...onnx.utils import compute_effective_axis_dimension  # 导入计算有效轴维度的函数
from ...utils import logging  # 导入日志工具类

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/config.json",
    # Maps a pretrained checkpoint name to the URL of its configuration file.
    # See all BlenderbotSmall models at https://huggingface.co/models?filter=blenderbot_small
}


class BlenderbotSmallConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BlenderbotSmallModel`]. It is used to
    instantiate a BlenderbotSmall model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a configuration similar to that of the BlenderbotSmall
    [facebook/blenderbot_small-90M](https://huggingface.co/facebook/blenderbot_small-90M) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every constructor argument other than the special-token ids, `is_encoder_decoder` and `**kwargs` is stored on the
    instance under the same name; the remaining ones are forwarded to `PretrainedConfig.__init__`.

    Example:

    ```
    >>> from transformers import BlenderbotSmallConfig, BlenderbotSmallModel

    >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
    >>> configuration = BlenderbotSmallConfig()

    >>> # Initializing a model (with random weights) from the facebook/blenderbot_small-90M style configuration
    >>> model = BlenderbotSmallModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "blenderbot-small"  # model type identifier
    keys_to_ignore_at_inference = ["past_key_values"]  # keys to ignore at inference
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}  # alias name -> attribute name set in `__init__`
    # Store the architecture hyper-parameters, then hand the token ids and extra kwargs to the parent class.
    def __init__(
        self,
        vocab_size=50265,  # vocabulary size
        max_position_embeddings=512,  # maximum number of positions
        encoder_layers=8,  # number of encoder layers
        encoder_ffn_dim=2048,  # encoder feed-forward dimension
        encoder_attention_heads=16,  # number of encoder attention heads
        decoder_layers=8,  # number of decoder layers
        decoder_ffn_dim=2048,  # decoder feed-forward dimension
        decoder_attention_heads=16,  # number of decoder attention heads
        encoder_layerdrop=0.0,  # encoder LayerDrop probability
        decoder_layerdrop=0.0,  # decoder LayerDrop probability
        use_cache=True,  # whether to use the cache
        is_encoder_decoder=True,  # whether this is an encoder-decoder model
        activation_function="gelu",  # name of the activation function
        d_model=512,  # model (hidden) dimension
        dropout=0.1,  # general dropout probability
        attention_dropout=0.0,  # dropout probability for attention
        activation_dropout=0.0,  # dropout probability for activations
        init_std=0.02,  # standard deviation used for weight initialization
        decoder_start_token_id=1,  # id of the token that starts decoding
        scale_embedding=False,  # whether to scale embeddings
        pad_token_id=0,  # padding token id
        bos_token_id=1,  # beginning-of-sequence token id
        eos_token_id=2,  # end-of-sequence token id
        forced_eos_token_id=2,  # forced end-of-sequence token id
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers  # mirrors the number of encoder layers
        self.scale_embedding = scale_embedding  # presumably scales by sqrt(d_model) when True - confirm in the modeling code

        # Forward the special-token ids, the encoder-decoder flag and any extra kwargs to `PretrainedConfig.__init__`.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )
# 从 transformers.models.bart.configuration_bart.BartOnnxConfig 复制了 BlenderbotSmallOnnxConfig 类定义
class BlenderbotSmallOnnxConfig(OnnxSeq2SeqConfigWithPast):
    # 定义 inputs 属性，返回输入映射的有序字典
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Describe the dynamic axes of each model input for the configured task.

        Returns:
            An ordered mapping from input name to a ``{axis index: axis name}`` dict. `input_ids` and
            `attention_mask` are always present; the remaining entries depend on `self.task` and `self.use_past`.
        """
        # Entries shared by every task. Each value is a fresh dict so no two keys alias the same object.
        common_inputs = OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "encoder_sequence"}),
                ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
            ]
        )

        if self.task in ["default", "seq2seq-lm"]:
            if self.use_past:
                # With a cache, the decoder ids declare only the batch axis as dynamic.
                common_inputs["decoder_input_ids"] = {0: "batch"}
                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
                # Delegates to the parent helper; presumably it adds the past key/value entries in place.
                self.fill_with_past_key_values_(common_inputs, direction="inputs")
            else:
                common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
        elif self.task == "causal-lm":
            # TODO: figure this case out.
            if self.use_past:
                num_encoder_layers, _ = self.num_layers
                for layer_idx in range(num_encoder_layers):
                    for kind in ("key", "value"):
                        common_inputs[f"past_key_values.{layer_idx}.{kind}"] = {
                            0: "batch",
                            2: "past_sequence + sequence",
                        }
        else:
            # Any other task: plain encoder/decoder inputs without cache entries.
            common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}

        return common_inputs

    # 定义 outputs 属性，返回输出映射的字典
    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Describe the dynamic axes of each model output for the configured task.

        For the "default" and "seq2seq-lm" tasks the parent implementation is returned unchanged. For every other
        task the lookup starts after `OnnxConfigWithPast` in the MRO, and one `present.{i}.key` / `present.{i}.value`
        pair per encoder layer is added when `self.use_past` is set.
        """
        if self.task in ["default", "seq2seq-lm"]:
            return super().outputs

        # Skip `OnnxConfigWithPast` itself and use the next `outputs` implementation in the MRO.
        common_outputs = super(OnnxConfigWithPast, self).outputs
        if not self.use_past:
            return common_outputs

        num_encoder_layers, _ = self.num_layers
        for layer_idx in range(num_encoder_layers):
            for kind in ("key", "value"):
                common_outputs[f"present.{layer_idx}.{kind}"] = {0: "batch", 2: "past_sequence + sequence"}
        return common_outputs
    # 定义一个方法 `_generate_dummy_inputs_for_default_and_seq2seq_lm`，用于生成默认和序列到序列语言模型的虚拟输入数据
    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy encoder and decoder inputs for the "default" / "seq2seq-lm" tasks.

        Args:
            tokenizer: Tokenizer used to encode the dummy text.
            batch_size: Requested batch size, forwarded to the shared dummy-input builder.
            seq_length: Requested sequence length, forwarded to the shared dummy-input builder.
            is_pair: Forwarded to the shared dummy-input builder.
            framework: Tensor framework passed to the tokenizer as `return_tensors`.

        Returns:
            A dict with the encoder inputs, the same inputs prefixed with ``decoder_`` and, when `self.use_past` is
            set, an extended `decoder_attention_mask` plus a `past_key_values` list of zero tensors.

        Raises:
            ValueError: If `self.use_past` is set and PyTorch is not available.
        """
        build_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering

        encoder_inputs = build_inputs(tokenizer, batch_size, seq_length, is_pair, framework)

        # With a cache the decoder is fed a single new token.
        decoder_seq_length = 1 if self.use_past else seq_length
        decoder_inputs = build_inputs(tokenizer, batch_size, decoder_seq_length, is_pair, framework)
        decoder_inputs = {f"decoder_{key}": value for key, value in decoder_inputs.items()}

        common_inputs = dict(**encoder_inputs, **decoder_inputs)
        if not self.use_past:
            return common_inputs

        if not is_torch_available():
            raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
        import torch

        batch, encoder_seq_length = common_inputs["input_ids"].shape
        decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
        num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
        hidden_size = self._config.hidden_size

        encoder_shape = (
            batch,
            num_encoder_attention_heads,
            encoder_seq_length,
            hidden_size // num_encoder_attention_heads,
        )
        decoder_past_length = decoder_seq_length + 3
        decoder_shape = (
            batch,
            num_decoder_attention_heads,
            decoder_past_length,
            hidden_size // num_decoder_attention_heads,
        )

        # Extend the decoder mask so it also covers the dummy cached positions.
        # NOTE(review): `torch.ones` is created without an explicit dtype here, unlike the causal-lm builder.
        past_mask = torch.ones(batch, decoder_past_length)
        common_inputs["decoder_attention_mask"] = torch.cat(
            [common_inputs["decoder_attention_mask"], past_mask], dim=1
        )

        num_encoder_layers, num_decoder_layers = self.num_layers
        min_num_layers = min(num_encoder_layers, num_decoder_layers)
        # Despite its name this holds the *difference* between the two layer counts.
        max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
        remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"

        # Layers present on both sides get four tensors: decoder key/value then encoder key/value.
        past_key_values = [
            (
                torch.zeros(decoder_shape),
                torch.zeros(decoder_shape),
                torch.zeros(encoder_shape),
                torch.zeros(encoder_shape),
            )
            for _ in range(min_num_layers)
        ]

        # TODO: test this.
        # NOTE(review): `range(min_num_layers, max_num_layers)` mixes a count with a difference, so it does not
        # iterate once per remaining layer when the layer counts differ; kept as-is pending confirmation that the
        # parent's input naming uses the same range.
        shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
        past_key_values.extend(
            [(torch.zeros(shape), torch.zeros(shape)) for _ in range(min_num_layers, max_num_layers)]
        )

        common_inputs["past_key_values"] = past_key_values
        return common_inputs
    # 生成用于因果语言模型的虚拟输入数据集
    def _generate_dummy_inputs_for_causal_lm(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy inputs for the "causal-lm" task.

        Starts from the shared dummy-input builder. When `self.use_past` is set, the attention mask is extended by
        ``seqlen + 2`` positions of ones and a `past_key_values` list with one ``(key, value)`` pair of zero tensors
        per encoder layer is added.

        Raises:
            ValueError: If `self.use_past` is set and PyTorch is not available.
        """
        common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, seq_length, is_pair, framework
        )
        if not self.use_past:
            return common_inputs

        if not is_torch_available():
            raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
        import torch

        batch, seqlen = common_inputs["input_ids"].shape
        # Not using the same length for past_key_values
        past_key_values_length = seqlen + 2
        num_encoder_layers, _ = self.num_layers
        num_encoder_attention_heads, _ = self.num_attention_heads
        head_dim = self._config.hidden_size // num_encoder_attention_heads
        past_shape = (batch, num_encoder_attention_heads, past_key_values_length, head_dim)

        # Keep the mask dtype produced by the tokenizer when appending the cached positions.
        current_mask = common_inputs["attention_mask"]
        past_mask = torch.ones(batch, past_key_values_length, dtype=current_mask.dtype)
        common_inputs["attention_mask"] = torch.cat([current_mask, past_mask], dim=1)

        past_key_values = []
        for _ in range(num_encoder_layers):
            past_key_values.append((torch.zeros(past_shape), torch.zeros(past_shape)))
        common_inputs["past_key_values"] = past_key_values

        return common_inputs

    # 生成用于序列分类和问答模型的虚拟输入数据集
    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Tokenize a batch of dummy text made of the tokenizer's unknown token.

        Copied from OnnxConfig.generate_dummy_inputs; kept separate for code clarity.

        Args:
            tokenizer: Tokenizer providing `unk_token` and `num_special_tokens_to_add`.
            batch_size: Requested batch size; resolved through `compute_effective_axis_dimension`.
            seq_length: Requested sequence length; resolved through `compute_effective_axis_dimension`.
            is_pair: Passed to `tokenizer.num_special_tokens_to_add`.
            framework: Passed to the tokenizer as `return_tensors`.

        Returns:
            The tokenizer output converted to a plain dict.
        """
        # A dynamic axis (-1) is replaced by a fixed dimension to avoid optimizations made by ONNX.
        effective_batch = compute_effective_axis_dimension(
            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
        )

        # Same for the sequence axis, taking the special tokens the tokenizer will add into account.
        special_token_count = tokenizer.num_special_tokens_to_add(is_pair)
        effective_length = compute_effective_axis_dimension(
            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=special_token_count
        )

        # Joining a single-element list yields the token itself, so each sample is the unknown token repeated
        # `effective_length` times with no separator.
        sample = " ".join([tokenizer.unk_token]) * effective_length
        dummy_input = [sample] * effective_batch
        return dict(tokenizer(dummy_input, return_tensors=framework))
    # 生成虚拟输入数据，返回一个包含各种任务通用输入的字典
    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy inputs by dispatching on `self.task`.

        "default" and "seq2seq-lm" use the seq2seq builder, "causal-lm" uses the causal-LM builder, and every other
        task uses the sequence-classification / question-answering builder. All arguments are forwarded unchanged.
        """
        if self.task in ["default", "seq2seq-lm"]:
            builder = self._generate_dummy_inputs_for_default_and_seq2seq_lm
        elif self.task == "causal-lm":
            builder = self._generate_dummy_inputs_for_causal_lm
        else:
            builder = self._generate_dummy_inputs_for_sequence_classification_and_question_answering

        return builder(tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework)

    # 将过去的键值扁平化处理的内部方法
    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
        """
        Forward the flattening of one past key/value entry to the matching parent implementation.

        For the "default" and "seq2seq-lm" tasks the direct parent is used; otherwise the lookup starts after
        `OnnxSeq2SeqConfigWithPast` in the MRO. The parent's return value is not propagated: this method returns
        `None`, so callers rely on `flattened_output` being updated in place.
        """
        if self.task in ["default", "seq2seq-lm"]:
            parent = super()
        else:
            parent = super(OnnxSeq2SeqConfigWithPast, self)
        parent._flatten_past_key_values_(flattened_output, name, idx, t)

`.\models\blenderbot_small\modeling_blenderbot_small.py`

# coding=utf-8
# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch BlenderbotSmall model."""


import copy
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_blenderbot_small import BlenderbotSmallConfig


# Module-level logger, obtained from the library's logging helper and named after this module.
logger = logging.get_logger(__name__)

# Name of the configuration class; passed as `config_class` to `replace_return_docstrings` further down.
_CONFIG_FOR_DOC = "BlenderbotSmallConfig"


# Checkpoint identifiers listed for this architecture.
BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/blenderbot_small-90M",
    # See all BlenderbotSmall models at https://huggingface.co/models?filter=blenderbot_small
]


# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.

    Args:
        input_ids: 2-D tensor of token ids (indexed as ``[:, position]``). It is not modified.
        pad_token_id: Id that replaces any ``-100`` found in the shifted result.
        decoder_start_token_id: Id written into the first position of every row.

    Returns:
        A new tensor with the same shape as `input_ids`: column 0 holds `decoder_start_token_id`, the remaining
        columns hold the input without its last column, and ``-100`` values are replaced by `pad_token_id`.

    Raises:
        ValueError: If `pad_token_id` is `None`.
    """
    shifted = torch.zeros_like(input_ids)
    # Start token first, then everything but the last input column moved one step to the right.
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1]

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")

    # Labels may carry -100 (ignored positions); those must not be fed to the decoder as token ids.
    ignored_positions = shifted == -100
    return shifted.masked_fill(ignored_positions, pad_token_id)


# Copied from transformers.models.blenderbot.modeling_blenderbot.BlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall
class BlenderbotSmallLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        super().__init__(num_embeddings, embedding_dim)

    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
        """
        Look up the embeddings of a contiguous range of positions.

        Args:
            input_ids_shape: Expected to be ``[bsz x seqlen]``; only the sequence length is used.
            past_key_values_length: Offset of the first position (number of already cached positions).

        Returns:
            The embedding rows for positions ``past_key_values_length`` to
            ``past_key_values_length + seqlen - 1``.
        """
        # The batch size is unpacked only to validate that at least two dimensions were given.
        _, seq_len = input_ids_shape[:2]

        first_position = past_key_values_length
        position_ids = torch.arange(
            first_position, first_position + seq_len, dtype=torch.long, device=self.weight.device
        )
        return super().forward(position_ids)
# 从transformers.models.bart.modeling_bart.BartAttention复制过来，将Bart替换为BlenderbotSmall
class BlenderbotSmallAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional["BlenderbotSmallConfig"] = None,
    ):
        """
        Args:
            embed_dim: Size of the input/output feature dimension; must be divisible by `num_heads`.
            num_heads: Number of attention heads.
            dropout: Dropout probability applied to the attention probabilities.
            is_decoder: When True, `forward` returns the key/value states so they can be cached.
            bias: Whether the four linear projections carry a bias term.
            is_causal: Stored on the module; not read by this implementation.
            config: Optional model configuration, stored on the module.

        Raises:
            ValueError: If `embed_dim` is not divisible by `num_heads`.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim必须能被num_heads整除 (当前 `embed_dim`: {self.embed_dim}"
                f" 和 `num_heads`: {num_heads})."
            )
        # Queries are pre-scaled by 1/sqrt(head_dim) before the dot product.
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `(bsz, seq_len, embed_dim)` into `(bsz, num_heads, seq_len, head_dim)`."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Compute multi-head attention. Input shape: Batch x Time x Channel.

        Args:
            hidden_states: Query source of shape `(bsz, tgt_len, embed_dim)`.
            key_value_states: When given, keys/values are projected from it (cross-attention); otherwise they
                are projected from `hidden_states` (self-attention).
            past_key_value: Cached `(key, value)` pair, each `(bsz, num_heads, past_len, head_dim)`.
            attention_mask: Additive mask of shape `(bsz, 1, tgt_len, src_len)`.
            layer_head_mask: Multiplicative per-head mask of shape `(num_heads,)`.
            output_attentions: Whether to return the attention probabilities.

        Returns:
            A 3-tuple `(attn_output, attn_weights, past_key_value)`. `attn_output` has shape
            `(bsz, tgt_len, embed_dim)`; `attn_weights` is `(bsz, num_heads, tgt_len, src_len)` or `None` when
            `output_attentions` is False; `past_key_value` is the updated cache for decoder modules and otherwise
            the value that was passed in.

        Raises:
            ValueError: If the attention scores, a provided mask, or the attention output have an unexpected shape.
        """
        # Cross-attention is signalled purely by the presence of encoder states.
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states) * self.scaling

        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # Cached cross-attention keys/values match the encoder length: reuse them as-is.
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # Self-attention with a cache: append the new positions along the sequence axis.
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # Hand the (possibly extended) key/value states back so the caller can cache them.
            past_key_value = (key_states, value_states)

        # Fold heads into the batch axis so a single batched matmul handles every head.
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # Returned in per-head layout; the flat view below keeps feeding the rest of the computation.
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Unfold heads and merge them back into the feature dimension.
        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


# 从transformers.models.bart.modeling_bart.BartEncoderLayer复制过来，将Bart替换为BlenderbotSmall，BART替换为BLENDERBOT_SMALL
class BlenderbotSmallEncoderLayer(nn.Module):
    """
    One encoder layer: self-attention followed by a two-layer feed-forward network, each wrapped in
    dropout, a residual connection and a LayerNorm applied after the residual sum.
    """

    def __init__(self, config: BlenderbotSmallConfig):
        super().__init__()
        embed_dim = config.d_model
        self.embed_dim = embed_dim

        # The attention implementation is selected by `config._attn_implementation`.
        attention_cls = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(
            embed_dim=embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        ffn_dim = config.encoder_ffn_dim
        self.fc1 = nn.Linear(embed_dim, ffn_dim)
        self.fc2 = nn.Linear(ffn_dim, embed_dim)
        self.final_layer_norm = nn.LayerNorm(embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
        """
        # --- self-attention sub-layer -------------------------------------------------
        attn_output, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
        hidden_states = self.self_attn_layer_norm(hidden_states + attn_output)

        # --- feed-forward sub-layer ---------------------------------------------------
        ffn_output = self.activation_fn(self.fc1(hidden_states))
        ffn_output = nn.functional.dropout(ffn_output, p=self.activation_dropout, training=self.training)
        ffn_output = self.fc2(ffn_output)
        ffn_output = nn.functional.dropout(ffn_output, p=self.dropout, training=self.training)
        hidden_states = self.final_layer_norm(hidden_states + ffn_output)

        # In half precision, clamp the activations when they contain inf or NaN entries.
        if hidden_states.dtype == torch.float16:
            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
# Maps an attention-implementation name (looked up via `config._attn_implementation` in the layer classes) to the
# attention class to instantiate. Only the "eager" implementation is registered here.
BLENDERBOT_SMALL_ATTENTION_CLASSES = {
    "eager": BlenderbotSmallAttention,
}

# 从 transformers.models.bart.modeling_bart.BartDecoderLayer 复制而来，将 Bart 替换为 BlenderbotSmall，BART 替换为 BLENDERBOT_SMALL
class BlenderbotSmallDecoderLayer(nn.Module):
    def __init__(self, config: BlenderbotSmallConfig):
        """
        Build the sub-modules of one decoder layer from `config`: causal self-attention, attention over the encoder
        output, a two-layer feed-forward network, and one LayerNorm per sub-layer.
        """
        super().__init__()
        embed_dim = config.d_model
        self.embed_dim = embed_dim

        # Both attention blocks use the implementation selected by `config._attn_implementation`.
        attention_cls = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation]

        self.self_attn = attention_cls(
            embed_dim=embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            is_causal=True,
            config=config,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)

        # Attention over the encoder output; built without `is_causal`.
        self.encoder_attn = attention_cls(
            embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            config=config,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(embed_dim)

        ffn_dim = config.decoder_ffn_dim
        self.fc1 = nn.Linear(embed_dim, ffn_dim)
        self.fc2 = nn.Linear(ffn_dim, embed_dim)

        self.final_layer_norm = nn.LayerNorm(embed_dim)

    # 前向传播函数，定义模型的计算流程，接受一系列输入参数并返回输出
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    # 定义一个方法，用于生成模型输入的虚拟数据
    def dummy_inputs(self):
        """
        Build a small fixed batch of example inputs on `self.device`.

        Returns:
            A dict with `attention_mask` (False where the id equals the pad token), `input_ids` (two sequences of
            five ids, the second ending with the pad token) and `decoder_input_ids` (the same tensor object as
            `input_ids`).
        """
        pad_token = self.config.pad_token_id
        token_rows = [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]]
        input_ids = torch.tensor(token_rows, device=self.device)
        return {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
            # The encoder ids are reused as decoder ids.
            "decoder_input_ids": input_ids,
        }
# Shared introductory docstring text; passed to `add_start_docstrings` when decorating the model classes below.
BLENDERBOT_SMALL_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`BlenderbotSmallConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring fragment with a two-turn conversation example using `generate`.
BLENDERBOT_SMALL_GENERATION_EXAMPLE = r"""
    Conversation example:

    ```
    >>> from transformers import AutoTokenizer, BlenderbotSmallForConditionalGeneration

    >>> mname = "facebook/blenderbot_small-90M"
    >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname)
    >>> tokenizer = AutoTokenizer.from_pretrained(mname)
    >>> UTTERANCE = "My friends are cool but they eat too many carbs."
    >>> print("Human: ", UTTERANCE)
    Human:  My friends are cool but they eat too many carbs.

    >>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
    >>> reply_ids = model.generate(**inputs)
    >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
    Bot:  what kind of carbs do they eat? i don't know much about carbs.

    >>> REPLY = "I'm not sure"
    >>> print("Human: ", REPLY)
    Human: I'm not sure

    >>> NEXT_UTTERANCE = (
    ...     "My friends are cool but they eat too many carbs.__end__ __start__what kind of carbs do they eat? "
    ...     "i don't know much about carbs__end__ "
    ...     "__start__ I'm not sure."
    ... )
    >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="pt")
    >>> next_reply_ids = model.generate(**inputs)
    >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
    Bot:  they eat a lot of carbs. carbs are high in fat, protein, and fats.
    ```
"""

# Docstring text describing the forward inputs; passed to `add_start_docstrings_to_model_forward` below.
# In this listing the string holds only a newline (the argument descriptions are not included).
BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
"""


class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`BlenderbotSmallEncoderLayer`].

    Args:
        config: BlenderbotSmallConfig
        embed_tokens (nn.Embedding): output embedding
    """
    def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None):
        """
        Args:
            config: Model configuration supplying dimensions, dropout rates and the number of encoder layers.
            embed_tokens: Token embedding to use as-is; when `None`, a new `nn.Embedding` of size
                `(config.vocab_size, config.d_model)` with `config.pad_token_id` as padding index is created.
        """
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        # Scale factor stored for the token embeddings: sqrt(d_model) when enabled, otherwise 1.
        self.embed_scale = 1.0 if not config.scale_embedding else math.sqrt(embed_dim)

        self.embed_tokens = (
            embed_tokens
            if embed_tokens is not None
            else nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
        )

        self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
        )

        encoder_layers = []
        for _ in range(config.encoder_layers):
            encoder_layers.append(BlenderbotSmallEncoderLayer(config))
        self.layers = nn.ModuleList(encoder_layers)
        self.layernorm_embedding = nn.LayerNorm(embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
# 定义了一个继承自BlenderbotSmallPreTrainedModel的Transformer解码器类，包含多个BlenderbotSmallDecoderLayer层
class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
    [`BlenderbotSmallDecoderLayer`].

    Args:
        config: BlenderbotSmallConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)
        d_model = config.d_model

        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        # Scale factor stored for the token embeddings: sqrt(d_model) when enabled, otherwise 1.
        self.embed_scale = 1.0 if not config.scale_embedding else math.sqrt(d_model)

        # Reuse the provided embedding when there is one; otherwise create a fresh table.
        self.embed_tokens = (
            embed_tokens if embed_tokens is not None else nn.Embedding(config.vocab_size, d_model, self.padding_idx)
        )

        self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding(
            config.max_position_embeddings,
            d_model,
        )

        decoder_layers = []
        for _ in range(config.decoder_layers):
            decoder_layers.append(BlenderbotSmallDecoderLayer(config))
        self.layers = nn.ModuleList(decoder_layers)
        self.layernorm_embedding = nn.LayerNorm(d_model)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        # NOTE(review): the forward pass is omitted in this listing; as written this method does nothing and
        # returns None. The real decoder forward logic needs to be restored here.
        pass
    # 返回模型的解码器
    def get_decoder(self):
        """Return the object stored in this model's `decoder` attribute."""
        return self.decoder

    # 应用装饰器，将 BLENDERBOT_SMALL_INPUTS_DOCSTRING 添加到模型前向传播方法的文档字符串中
    # 使用 replace_return_docstrings 函数，将输出类型设为 Seq2SeqModelOutput，并替换配置类为 _CONFIG_FOR_DOC
    @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,  # 输入的 token IDs
        attention_mask: Optional[torch.Tensor] = None,  # 输入的注意力掩码
        decoder_input_ids: Optional[torch.LongTensor] = None,  # 解码器的 token IDs
        decoder_attention_mask: Optional[torch.LongTensor] = None,  # 解码器的注意力掩码
        head_mask: Optional[torch.Tensor] = None,  # 多头注意力机制的掩码
        decoder_head_mask: Optional[torch.Tensor] = None,  # 解码器的多头注意力机制掩码
        cross_attn_head_mask: Optional[torch.Tensor] = None,  # 交叉注意力机制的掩码
        encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None,  # 编码器的输出
        past_key_values: Optional[List[torch.FloatTensor]] = None,  # 用于存储过去的键值对
        inputs_embeds: Optional[torch.Tensor] = None,  # 嵌入输入
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,  # 解码器嵌入输入
        use_cache: Optional[bool] = None,  # 是否使用缓存
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态
        return_dict: Optional[bool] = None,  # 是否返回字典格式的输出结果
# 添加文档字符串描述 BlenderbotSmallForConditionalGeneration 类，它是带有语言建模头部的 BlenderbotSmall 模型，可用于摘要生成。
@add_start_docstrings(
    "The BlenderbotSmall Model with a language modeling head. Can be used for summarization.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel):
    # Attribute name under which the wrapped base model is stored.
    base_model_prefix = "model"
    # Keys that are tolerated as missing when loading a checkpoint (per the attribute name).
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
    # Parameter names whose weights are tied together.
    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: BlenderbotSmallConfig):
        """Build the base model, the logits-bias buffer and the LM projection head."""
        super().__init__(config)
        self.model = BlenderbotSmallModel(config)
        # The bias buffer and the head are both sized by the shared embedding table.
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Initialize weights and apply final processing.
        self.post_init()

    def get_encoder(self):
        """Return the encoder of the wrapped base model."""
        return self.model.get_encoder()

    def get_decoder(self):
        """Return the decoder of the wrapped base model."""
        return self.model.get_decoder()

    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
        """Resize the token embeddings via the parent class and keep `final_logits_bias` the same width."""
        resized_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # Use the row count of the resized weight as the new bias width.
        self._resize_final_logits_bias(resized_embeddings.weight.shape[0])
        return resized_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        """Truncate or zero-pad `final_logits_bias` along its last axis to `new_num_tokens` entries."""
        current_bias = self.final_logits_bias
        num_missing = new_num_tokens - current_bias.shape[-1]
        if num_missing > 0:
            # Growing: append zeros created on the same device as the existing bias.
            zero_padding = torch.zeros((1, num_missing), device=current_bias.device)
            resized_bias = torch.cat([current_bias, zero_padding], dim=1)
        else:
            # Same size or shrinking: keep the leading `new_num_tokens` entries.
            resized_bias = current_bias[:, :new_num_tokens]
        self.register_buffer("final_logits_bias", resized_bias)

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(BLENDERBOT_SMALL_GENERATION_EXAMPLE)
    def forward(
        self,
        # Encoder token ids.
        input_ids: Optional[torch.LongTensor] = None,
        # Encoder attention mask.
        attention_mask: Optional[torch.Tensor] = None,
        # Decoder token ids; derived from `labels` when neither these nor decoder embeddings are given.
        decoder_input_ids: Optional[torch.LongTensor] = None,
        # Decoder attention mask.
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        # Head masks for the encoder, the decoder and the cross-attention, respectively.
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        # Precomputed encoder outputs.
        encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None,
        # Cached key/value states from earlier decoding steps.
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        # Embeddings supplied directly instead of token ids.
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        # Target token ids used to compute the loss.
        labels: Optional[torch.LongTensor] = None,
        # Output / caching switches.
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
            Tuple of masked language modeling loss and model outputs if not in `return_dict` mode,
            otherwise a `Seq2SeqLMOutput` containing various model outputs.
        """
        # Fall back to the configured default when the caller did not choose an output format.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if labels is not None:
            # Caching is switched off whenever labels are supplied.
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            no_decoder_inputs = decoder_input_ids is None and decoder_inputs_embeds is None
            if no_decoder_inputs:
                # Build the decoder inputs by shifting the labels one position to the right.
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project the first model output to vocabulary logits and add the bias buffer.
        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            # Flatten logits and labels before applying cross-entropy.
            flat_logits = lm_logits.view(-1, self.config.vocab_size)
            flat_labels = labels.view(-1)
            masked_lm_loss = CrossEntropyLoss()(flat_logits, flat_labels)

        if not return_dict:
            # Tuple mode: logits followed by the remaining base-model outputs, loss first when present.
            tuple_output = (lm_logits,) + outputs[1:]
            if masked_lm_loss is None:
                return tuple_output
            return (masked_lm_loss,) + tuple_output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """Assemble the keyword arguments for one generation step, trimming already-cached decoder ids."""
        if past_key_values is not None:
            # Length already covered by the cache, read from axis 2 of the first cached tensor.
            past_length = past_key_values[0][0].shape[2]
            current_length = decoder_input_ids.shape[1]

            # Some generation methods already pass only the last input id; otherwise drop the
            # cached prefix. The fallback keeps only the final id.
            remove_prefix_length = past_length if current_length > past_length else current_length - 1
            decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder the first two cached states of every layer along axis 0 according to `beam_idx`."""

        def reorder_layer(layer_past):
            # Only the first two entries are re-indexed; the remaining entries are passed through.
            reordered_states = tuple(
                past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]
            )
            return reordered_states + layer_past[2:]

        return tuple(reorder_layer(layer_past) for layer_past in past_key_values)
# 从transformers.models.bart.modeling_bart.BartDecoderWrapper复制并修改为BlenderbotSmallDecoderWrapper
class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the EncoderDecoderModel framework.
    """

    def __init__(self, config):
        super().__init__(config)
        # The wrapped decoder is exposed under the `decoder` attribute.
        self.decoder = BlenderbotSmallDecoder(config)

    def forward(self, *args, **kwargs):
        # Delegate to the wrapped decoder, forwarding every argument unchanged.
        return self.decoder(*args, **kwargs)


# 从transformers.models.bart.modeling_bart.BartForCausalLM复制并修改为BlenderbotSmallForCausalLM
class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
    """BlenderbotSmall decoder with a language-modeling head, usable as a standalone causal LM."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        # Work on a private copy so that flipping the decoder flags does not mutate the caller's config.
        config = copy.deepcopy(config)
        config.is_decoder = True
        config.is_encoder_decoder = False
        super().__init__(config)
        self.model = BlenderbotSmallDecoderWrapper(config)

        # Bias-free projection from the hidden size to the vocabulary size.
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing.
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.model.decoder.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder with `decoder`."""
        self.model.decoder = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.model.decoder

    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        Run the decoder and project its first output to vocabulary logits.

        Args:
            labels (`torch.LongTensor`, *optional*):
                Target token ids. When given, a cross-entropy loss between the flattened logits and the flattened
                labels is computed. `labels` is consumed here and is NOT forwarded to the decoder.

            Every other argument is forwarded to the decoder unchanged, except that `output_attentions`,
            `output_hidden_states` and `return_dict` fall back to the values in `self.config` when `None`.

        Returns:

        """
        # Resolve the output switches against the config defaults.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Call the decoder directly; it does not take `labels`, so they are kept out of this call.
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            # Keep labels on the same device as the logits before computing the loss.
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Tuple mode: logits followed by the remaining decoder outputs, loss first when present.
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
    ):
        """Assemble the keyword arguments for one generation step, trimming already-cached input ids."""
        # When no mask is supplied, attend to every position of `input_ids`.
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)

        if past_key_values:
            # Length already covered by the cache, read from axis 2 of the first cached tensor.
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input id.
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to the old behaviour: keep only the final id.
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder every cached state of every layer along axis 0 according to `beam_idx`."""
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past

`.\models\blenderbot_small\modeling_flax_blenderbot_small.py`

# 设置文件编码为 UTF-8
# 版权声明：2021 年 Facebook, Inc. 和 HuggingFace Inc. 团队保留所有权利
#
# 根据 Apache 许可证 2.0 版本授权使用本文件；
# 除非符合许可证要求，否则不得使用本文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则软件按"原样"分发，
# 没有任何形式的明示或暗示担保或条件。
# 有关详细信息，请参阅许可证。
""" Flax BlenderbotSmall 模型。"""

# 导入必要的库和模块
import math
import random
from functools import partial
from typing import Callable, Optional, Tuple

# 导入 Flax 相关模块和类
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from jax.random import PRNGKey

# 导入模型输出类和实用函数
from ...modeling_flax_outputs import (
    FlaxBaseModelOutput,
    FlaxBaseModelOutputWithPastAndCrossAttentions,
    FlaxCausalLMOutputWithCrossAttentions,
    FlaxSeq2SeqLMOutput,
    FlaxSeq2SeqModelOutput,
)
from ...modeling_flax_utils import (
    ACT2FN,
    FlaxPreTrainedModel,
    append_call_sample_docstring,
    append_replace_return_docstrings,
    overwrite_call_docstring,
)
from ...utils import add_start_docstrings, logging, replace_return_docstrings
from .configuration_blenderbot_small import BlenderbotSmallConfig

# 获取日志记录器实例
logger = logging.get_logger(__name__)

# 用于文档的模型检查点和配置
_CHECKPOINT_FOR_DOC = "facebook/blenderbot_small-90M"
_CONFIG_FOR_DOC = "BlenderbotSmallConfig"

# BlenderbotSmall 模型的起始文档字符串
BLENDERBOT_SMALL_START_DOCSTRING = r"""
    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a Flax Linen
    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`BlenderbotSmallConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""

# The body of this docstring is elided in this excerpt; only the (empty) constant is kept.
BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
"""


BLENDERBOT_SMALL_ENCODE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.
            
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
            
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            
            [What are attention masks?](../glossary#attention-mask)
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING = r"""
"""


# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right
def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
    """
    Shift input ids one token to the right.

    The first column of the result is `decoder_start_token_id`, the remaining columns are `input_ids` without its
    last column, and every `-100` left in the result is replaced by `pad_token_id`.
    """
    # Column of start tokens with the same dtype as the input ids.
    start_column = jnp.full_like(input_ids[:, :1], decoder_start_token_id)
    # Prepend the start column and drop the last input column, so the shape is preserved.
    shifted = jnp.concatenate([start_column, input_ids[:, :-1]], axis=1)
    # Replace the -100 placeholder values with the padding id.
    return jnp.where(shifted == -100, pad_token_id, shifted)


# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->BlenderbotSmall
class FlaxBlenderbotSmallAttention(nn.Module):
    """Multi-head attention building blocks: q/k/v/out projections, head reshaping and a decoding cache."""

    # NOTE(review): no `__call__` is present in this excerpt, although the layer classes in this file invoke
    # the module directly - confirm against the upstream file that the method was not dropped.

    config: BlenderbotSmallConfig  # model configuration (reads `init_std` and `max_position_embeddings`)
    embed_dim: int  # total embedding width, split evenly across the heads
    num_heads: int  # number of attention heads
    dropout: float = 0.0  # rate of the dropout layer created in `setup`
    causal: bool = False  # when True, a causal mask is precomputed in `setup`
    bias: bool = True  # whether the projection layers carry a bias
    dtype: jnp.dtype = jnp.float32  # dtype passed to the projection layers

    def setup(self) -> None:
        """Create the projection layers, the dropout layer and (if causal) the causal mask."""
        self.head_dim = self.embed_dim // self.num_heads
        # Reject configurations where the heads do not tile the embedding exactly.
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {self.num_heads})."
            )

        # Factory for the four identically-configured projection layers.
        dense = partial(
            nn.Dense,
            self.embed_dim,
            use_bias=self.bias,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.init_std),
        )

        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
        self.out_proj = dense()

        self.dropout_layer = nn.Dropout(rate=self.dropout)

        if self.causal:
            # Boolean causal mask sized for the longest supported sequence.
            self.causal_mask = make_causal_mask(
                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
            )

    def _split_heads(self, hidden_states):
        """Reshape the last axis into `(num_heads, head_dim)`, keeping the first two axes."""
        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))

    def _merge_heads(self, hidden_states):
        """Collapse everything after the first two axes back into a single `embed_dim` axis."""
        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))

    @nn.compact
    def _concatenate_to_cache(self, key, value, query, attention_mask):
        """
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        """
        # Detect if we're initializing by absence of existing cache data.
        is_initialized = self.has_variable("cache", "cached_key")

        # Create (zero-filled, shaped like the incoming states) or retrieve the cached key/value states.
        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)

        # Create (starting at 0) or retrieve the write position inside the cache.
        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))

        if is_initialized:
            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape

            # Write the new key/value slices into the cache at the current index.
            cur_index = cache_index.value
            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
            key = lax.dynamic_update_slice(cached_key.value, key, indices)
            value = lax.dynamic_update_slice(cached_value.value, value, indices)

            cached_key.value = key
            cached_value.value = value

            # Advance the index by the number of positions just written (taken from the query length).
            num_updated_cache_vectors = query.shape[1]
            cache_index.value = cache_index.value + num_updated_cache_vectors

            # Causal mask for cached decoder self-attention: only cache positions that have already been
            # written (including the ones written in this call) may be attended to.
            pad_mask = jnp.broadcast_to(
                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
            )

            attention_mask = combine_masks(pad_mask, attention_mask)

        return key, value, attention_mask
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayer with Bart->BlenderbotSmall
class FlaxBlenderbotSmallEncoderLayer(nn.Module):
    """Encoder block: self-attention then a two-layer feed-forward net, each with residual + LayerNorm."""

    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self) -> None:
        """Create the attention, normalization, dropout and feed-forward sub-modules."""
        cfg = self.config
        self.embed_dim = cfg.d_model

        self.self_attn = FlaxBlenderbotSmallAttention(
            config=cfg,
            embed_dim=self.embed_dim,
            num_heads=cfg.encoder_attention_heads,
            dropout=cfg.attention_dropout,
            dtype=self.dtype,
        )
        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

        # One dropout layer is shared by the attention and the feed-forward outputs; the activation
        # gets its own dropout layer.
        self.dropout_layer = nn.Dropout(rate=cfg.dropout)
        self.activation_fn = ACT2FN[cfg.activation_function]
        self.activation_dropout_layer = nn.Dropout(rate=cfg.activation_dropout)

        # Feed-forward network: d_model -> encoder_ffn_dim -> d_model.
        self.fc1 = nn.Dense(
            cfg.encoder_ffn_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(cfg.init_std),
        )
        self.fc2 = nn.Dense(
            self.embed_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(cfg.init_std),
        )
        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    def __call__(
        self,
        hidden_states: jnp.ndarray,
        attention_mask: jnp.ndarray,
        output_attentions: bool = True,
        deterministic: bool = True,
    ) -> Tuple[jnp.ndarray]:
        """
        Apply the block to `hidden_states`.

        Returns a 1-tuple with the new hidden states, or a 2-tuple that also carries the attention weights when
        `output_attentions` is true. `deterministic` is forwarded to the dropout layers.
        """
        # Self-attention sub-block: attention -> dropout -> residual add -> LayerNorm.
        attn_output, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask)
        attn_output = self.dropout_layer(attn_output, deterministic=deterministic)
        hidden_states = self.self_attn_layer_norm(hidden_states + attn_output)

        # Feed-forward sub-block: fc1 -> activation -> dropout -> fc2 -> dropout -> residual add -> LayerNorm.
        ffn_output = self.activation_fn(self.fc1(hidden_states))
        ffn_output = self.activation_dropout_layer(ffn_output, deterministic=deterministic)
        ffn_output = self.fc2(ffn_output)
        ffn_output = self.dropout_layer(ffn_output, deterministic=deterministic)
        hidden_states = self.final_layer_norm(hidden_states + ffn_output)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)


# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayerCollection with Bart->BlenderbotSmall
class FlaxBlenderbotSmallEncoderLayerCollection(nn.Module):
    """Stack of `config.encoder_layers` encoder layers applied in sequence, with optional LayerDrop."""

    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Instantiate the encoder layers (named "0", "1", ...) and read the LayerDrop rate."""
        self.layers = [
            FlaxBlenderbotSmallEncoderLayer(self.config, name=str(i), dtype=self.dtype)
            for i in range(self.config.encoder_layers)
        ]
        self.layerdrop = self.config.encoder_layerdrop

    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """
        Run `hidden_states` through every encoder layer.

        Args:
            hidden_states: Input to the first layer.
            attention_mask: Mask handed to every layer unchanged.
            deterministic: When False, layers may be skipped at random (LayerDrop) and dropout is active.
            output_attentions: Collect the per-layer attention weights (`None` for a skipped layer).
            output_hidden_states: Collect the input of every layer plus the final output.
            return_dict: Return a `FlaxBaseModelOutput` instead of a tuple.

        Returns:
            `FlaxBaseModelOutput`, or a tuple of the non-`None` values among
            `(hidden_states, all_hidden_states, all_attentions)` when `return_dict` is False.
        """
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for encoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            # Add LayerDrop (see https://arxiv.org/abs/1909.11556 for description).
            # The draw happens for every layer, even in deterministic mode.
            dropout_probability = random.uniform(0, 1)
            if not deterministic and (dropout_probability < self.layerdrop):
                # Skipped layer: pass the hidden states through unchanged and record no attention.
                # (Overwriting them with `None` would feed `None` into the next layer.)
                layer_outputs = (hidden_states, None)
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions,
                    deterministic,
                )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        outputs = (hidden_states, all_hidden_states, all_attentions)

        if not return_dict:
            return tuple(v for v in outputs if v is not None)

        return FlaxBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
# Adapted from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayer for BlenderbotSmall
class FlaxBlenderbotSmallDecoderLayer(nn.Module):
    """One decoder block: causal self-attention, optional cross-attention, feed-forward.

    Every sub-block is followed by dropout, a residual add and a LayerNorm, in that order.
    """

    # Model hyper-parameters.
    config: BlenderbotSmallConfig
    # dtype forwarded to every sub-module created in `setup`.
    dtype: jnp.dtype = jnp.float32

    def setup(self) -> None:
        """Create the attention, dropout, normalization and feed-forward sub-modules."""
        cfg = self.config
        self.embed_dim = cfg.d_model

        # Both attention modules share everything except the `causal` flag.
        attn_common = {
            "config": cfg,
            "embed_dim": self.embed_dim,
            "num_heads": cfg.decoder_attention_heads,
            "dropout": cfg.attention_dropout,
            "dtype": self.dtype,
        }
        dense_init = jax.nn.initializers.normal(cfg.init_std)

        # Self-attention over the decoder sequence (causal).
        self.self_attn = FlaxBlenderbotSmallAttention(causal=True, **attn_common)
        # One dropout module is reused after each of the three sub-blocks.
        self.dropout_layer = nn.Dropout(rate=cfg.dropout)
        self.activation_fn = ACT2FN[cfg.activation_function]
        # Separate dropout applied right after the feed-forward activation.
        self.activation_dropout_layer = nn.Dropout(rate=cfg.activation_dropout)
        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

        # Cross-attention: queries come from the decoder, keys/values from the encoder.
        self.encoder_attn = FlaxBlenderbotSmallAttention(**attn_common)
        self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

        # Feed-forward network: d_model -> decoder_ffn_dim -> d_model.
        self.fc1 = nn.Dense(cfg.decoder_ffn_dim, dtype=self.dtype, kernel_init=dense_init)
        self.fc2 = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=dense_init)
        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    def __call__(
        self,
        hidden_states: jnp.ndarray,
        attention_mask: jnp.ndarray,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        output_attentions: bool = True,
        deterministic: bool = True,
    ) -> Tuple[jnp.ndarray]:
        """Run the decoder block.

        Args:
            hidden_states: Input activations of the layer.
            attention_mask: Mask passed to the self-attention module.
            encoder_hidden_states: Encoder output; cross-attention is skipped when `None`.
            encoder_attention_mask: Mask passed to the cross-attention module.
            init_cache: Forwarded to the self-attention module.
            output_attentions: When true, attention weights are appended to the result.
            deterministic: When true, all dropout layers are disabled.

        Returns:
            `(hidden_states,)`, or `(hidden_states, self_attn_weights, cross_attn_weights)`
            when `output_attentions` is set (`cross_attn_weights` is `None` if
            cross-attention was skipped).
        """
        # --- self-attention sub-block ---
        skip = hidden_states
        attn_out, self_attn_weights = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache
        )
        attn_out = self.dropout_layer(attn_out, deterministic=deterministic)
        hidden_states = self.self_attn_layer_norm(skip + attn_out)

        # --- cross-attention sub-block (only when encoder states are available) ---
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            skip = hidden_states
            cross_out, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
            )
            cross_out = self.dropout_layer(cross_out, deterministic=deterministic)
            hidden_states = self.encoder_attn_layer_norm(skip + cross_out)

        # --- feed-forward sub-block ---
        skip = hidden_states
        ffn_out = self.fc1(hidden_states)
        ffn_out = self.activation_fn(ffn_out)
        ffn_out = self.activation_dropout_layer(ffn_out, deterministic=deterministic)
        ffn_out = self.fc2(ffn_out)
        ffn_out = self.dropout_layer(ffn_out, deterministic=deterministic)
        hidden_states = self.final_layer_norm(skip + ffn_out)

        if not output_attentions:
            return (hidden_states,)
        return (hidden_states, self_attn_weights, cross_attn_weights)
# Adapted from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayerCollection for BlenderbotSmall
class FlaxBlenderbotSmallDecoderLayerCollection(nn.Module):
    """Stack of `config.decoder_layers` decoder layers with optional LayerDrop."""

    # Model hyper-parameters.
    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Create the decoder layers (named "0", "1", ...) and read the LayerDrop rate."""
        self.layers = [
            FlaxBlenderbotSmallDecoderLayer(self.config, name=str(i), dtype=self.dtype)
            for i in range(self.config.decoder_layers)
        ]
        self.layerdrop = self.config.decoder_layerdrop

    def __call__(
        self,
        hidden_states,
        attention_mask,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        deterministic: bool = True,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """Run the hidden states through every decoder layer in order.

        Args:
            hidden_states: Input activations of the first layer.
            attention_mask: Mask forwarded to each layer's self-attention.
            encoder_hidden_states: Encoder output forwarded to each layer's cross-attention.
            encoder_attention_mask: Mask forwarded to each layer's cross-attention.
            deterministic: When true, dropout and LayerDrop are disabled.
            init_cache: Forwarded to every layer.
            output_attentions: Collect per-layer self- (and cross-) attention weights.
            output_hidden_states: Collect the input of every layer plus the final output.
            return_dict: Return a `FlaxBaseModelOutputWithPastAndCrossAttentions` instead of a tuple.

        Returns:
            The model output object, or a tuple of its non-`None` fields when
            `return_dict` is false.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        # Cross-attention weights only exist when the layers actually attend to an encoder.
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            # LayerDrop (see https://arxiv.org/abs/1909.11556): randomly skip whole layers
            # while training.
            dropout_probability = random.uniform(0, 1)
            if not deterministic and (dropout_probability < self.layerdrop):
                # A skipped layer acts as the identity. The previous placeholder
                # `(None, None, None)` overwrote `hidden_states` with `None`, which broke
                # every following layer and the final output.
                layer_outputs = (hidden_states, None, None)
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    init_cache=init_cache,
                    output_attentions=output_attentions,
                    deterministic=deterministic,
                )

            hidden_states = layer_outputs[0]
            if output_attentions:
                # For a skipped layer these entries are `None`.
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # Also record the output of the last layer.
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions]

        if not return_dict:
            return tuple(v for v in outputs if v is not None)

        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


class FlaxBlenderbotSmallEncoder(nn.Module):
    """Encoder: token + position embeddings, LayerNorm, dropout, then the encoder layer stack."""

    # Model hyper-parameters.
    config: BlenderbotSmallConfig
    # Token embedding module supplied by the parent module.
    embed_tokens: nn.Embed
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Create the dropout, position embedding, layer stack and embedding LayerNorm."""
        cfg = self.config
        self.dropout_layer = nn.Dropout(rate=cfg.dropout)

        embed_dim = cfg.d_model
        self.padding_idx = cfg.pad_token_id
        self.max_source_positions = cfg.max_position_embeddings
        # Token embeddings are multiplied by sqrt(d_model) only when the config asks for it.
        if cfg.scale_embedding:
            self.embed_scale = math.sqrt(embed_dim)
        else:
            self.embed_scale = 1.0

        # Learned position embeddings, initialized from a normal distribution with std `init_std`.
        self.embed_positions = nn.Embed(
            cfg.max_position_embeddings,
            embed_dim,
            embedding_init=jax.nn.initializers.normal(cfg.init_std),
        )

        self.layers = FlaxBlenderbotSmallEncoderLayerCollection(self.config, self.dtype)
        self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        """Encode `input_ids`.

        Args:
            input_ids: Token ids; leading dimensions are flattened so the array becomes 2-D.
            attention_mask: Mask forwarded to the encoder layers.
            position_ids: Indices looked up in the position embedding table.
            output_attentions: Forwarded to the layer stack.
            output_hidden_states: Forwarded to the layer stack.
            return_dict: Return a `FlaxBaseModelOutput` instead of the raw layer-stack output.
            deterministic: When true, dropout is disabled.
        """
        # Collapse everything but the last axis: (..., seq_len) -> (-1, seq_len).
        flat_ids = input_ids.reshape(-1, input_ids.shape[-1])

        token_embeds = self.embed_tokens(flat_ids) * self.embed_scale
        position_embeds = self.embed_positions(position_ids)

        # Sum the embeddings first, then normalize and apply dropout.
        hidden_states = self.layernorm_embedding(token_embeds + position_embeds)
        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)

        outputs = self.layers(
            hidden_states,
            attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if return_dict:
            return FlaxBaseModelOutput(
                last_hidden_state=outputs.last_hidden_state,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )
        # Tuple mode: hand the layer stack's output back untouched.
        return outputs
# BlenderbotSmall decoder: embeddings followed by the decoder layer stack
class FlaxBlenderbotSmallDecoder(nn.Module):
    """Decoder: normalized token embeddings + position embeddings, dropout, decoder layers."""

    # Model hyper-parameters.
    config: BlenderbotSmallConfig
    # Token embedding module supplied by the parent module.
    embed_tokens: nn.Embed
    # This field was missing although `setup` reads `self.dtype` and the parent module
    # constructs the decoder with `dtype=...`.
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Create the dropout, position embedding, layer stack and embedding LayerNorm."""
        self.dropout_layer = nn.Dropout(rate=self.config.dropout)

        embed_dim = self.config.d_model
        self.padding_idx = self.config.pad_token_id
        self.max_target_positions = self.config.max_position_embeddings
        # Token embeddings are multiplied by sqrt(d_model) only when the config asks for it.
        self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0

        # Learned position embeddings, initialized from a normal distribution with std `init_std`.
        self.embed_positions = nn.Embed(
            self.config.max_position_embeddings,
            embed_dim,
            embedding_init=jax.nn.initializers.normal(self.config.init_std),
        )

        self.layers = FlaxBlenderbotSmallDecoderLayerCollection(self.config, self.dtype)
        self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        """Decode `input_ids`, optionally attending to `encoder_hidden_states`.

        Args:
            input_ids: Decoder token ids; leading dimensions are flattened so the array becomes 2-D.
            attention_mask: Mask forwarded to the decoder layers' self-attention.
            position_ids: Indices looked up in the position embedding table.
            encoder_hidden_states: Encoder output forwarded to the layers' cross-attention.
            encoder_attention_mask: Mask forwarded to the layers' cross-attention.
            init_cache: Forwarded to the layer stack.
            output_attentions: Forwarded to the layer stack.
            output_hidden_states: Forwarded to the layer stack.
            return_dict: Return a `FlaxBaseModelOutputWithPastAndCrossAttentions` instead of
                the raw layer-stack output.
            deterministic: When true, dropout is disabled.
        """
        input_shape = input_ids.shape
        # Collapse everything but the last axis: (..., seq_len) -> (-1, seq_len).
        input_ids = input_ids.reshape(-1, input_shape[-1])

        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        positions = self.embed_positions(position_ids)

        # Unlike the encoder in this file, the LayerNorm is applied to the token embeddings
        # before the position embeddings are added.
        inputs_embeds = self.layernorm_embedding(inputs_embeds)
        hidden_states = inputs_embeds + positions

        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)

        outputs = self.layers(
            hidden_states,
            attention_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            deterministic=deterministic,
            init_cache=init_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Tuple mode: hand the layer stack's output back untouched.
        if not return_dict:
            return outputs

        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


# Adapted from transformers.models.bart.modeling_flax_bart.FlaxBartModule (Bart -> BlenderbotSmall)
class FlaxBlenderbotSmallModule(nn.Module):
    """Encoder-decoder module whose two halves share one token embedding table."""

    # Model hyper-parameters.
    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        """Create the shared embedding and hand it to both the encoder and the decoder."""
        cfg = self.config
        self.shared = nn.Embed(
            cfg.vocab_size,
            cfg.d_model,
            embedding_init=jax.nn.initializers.normal(cfg.init_std),
            dtype=self.dtype,
        )

        self.encoder = FlaxBlenderbotSmallEncoder(cfg, dtype=self.dtype, embed_tokens=self.shared)
        self.decoder = FlaxBlenderbotSmallDecoder(cfg, dtype=self.dtype, embed_tokens=self.shared)

    def _get_encoder_module(self):
        """Return the encoder sub-module."""
        return self.encoder

    def _get_decoder_module(self):
        """Return the decoder sub-module."""
        return self.decoder

    def __call__(
        self,
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask,
        position_ids,
        decoder_position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        """Run the encoder, then the decoder conditioned on the encoder output.

        Returns:
            A `FlaxSeq2SeqModelOutput`, or the decoder tuple followed by the encoder tuple
            when `return_dict` is false.
        """
        # Flags that are passed unchanged to both halves of the model.
        shared_flags = {
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
            "deterministic": deterministic,
        }

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **shared_flags,
        )

        # The first encoder output feeds the decoder's cross-attention, and the encoder
        # attention mask is reused as the cross-attention mask.
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            **shared_flags,
        )

        if return_dict:
            return FlaxSeq2SeqModelOutput(
                last_hidden_state=decoder_outputs.last_hidden_state,
                decoder_hidden_states=decoder_outputs.hidden_states,
                decoder_attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                encoder_hidden_states=encoder_outputs.hidden_states,
                encoder_attentions=encoder_outputs.attentions,
            )
        # Tuple mode: decoder fields first, encoder fields after.
        return decoder_outputs + encoder_outputs
# Base class that plugs the BlenderbotSmall Flax modules into the `FlaxPreTrainedModel` machinery
class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
    config_class = BlenderbotSmallConfig
    base_model_prefix: str = "model"
    # Concrete subclasses set this to the Flax module they wrap; `__init__` instantiates it.
    # The attribute was missing from the class body.
    module_class: nn.Module = None

    def __init__(
        self,
        config: BlenderbotSmallConfig,
        input_shape: Tuple[int] = (1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        """Instantiate `module_class` and pass it, with the config, to the parent constructor."""
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        """Initialize module parameters from dummy inputs of shape `input_shape`.

        When `params` is given, only the keys listed in `self._missing_keys` are filled in
        from the freshly initialized values; otherwise the random parameters are returned.
        """
        # Dummy input tensors
        input_ids = jnp.zeros(input_shape, dtype="i4")
        # Make sure the initialization pass will work for FlaxBlenderbotSmallForSequenceClassificationModule
        input_ids = input_ids.at[(..., -1)].set(self.config.eos_token_id)
        attention_mask = jnp.ones_like(input_ids)
        decoder_input_ids = input_ids
        decoder_attention_mask = jnp.ones_like(input_ids)

        batch_size, sequence_length = input_ids.shape
        position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
        decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

        # Independent PRNG streams for parameter initialization and dropout
        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

        random_params = self.module.init(
            rngs,
            input_ids,
            attention_mask,
            decoder_input_ids,
            decoder_attention_mask,
            position_ids,
            decoder_position_ids,
        )["params"]

        if params is not None:
            # Fill only the missing keys of the supplied parameters with random values.
            random_params = flatten_dict(unfreeze(random_params))
            params = flatten_dict(unfreeze(params))
            for missing_key in self._missing_keys:
                params[missing_key] = random_params[missing_key]
            self._missing_keys = set()
            return freeze(unflatten_dict(params))
        else:
            return random_params

    def init_cache(self, batch_size, max_length, encoder_outputs):
        r"""
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
                `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
                `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
                is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
                cross-attention of the decoder.
        """
        # Dummy decoder inputs that only fix the cache shape
        decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
        decoder_position_ids = jnp.broadcast_to(
            jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
        )

        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
            decoder_module = module._get_decoder_module()
            return decoder_module(
                decoder_input_ids,
                decoder_attention_mask,
                decoder_position_ids,
                **kwargs,
            )

        init_variables = self.module.init(
            jax.random.PRNGKey(0),
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            init_cache=True,
            method=_decoder_forward,  # we only need to call the decoder to init the cache
        )
        return unfreeze(init_variables["cache"])

    @add_start_docstrings(BLENDERBOT_SMALL_ENCODE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=BlenderbotSmallConfig)
    def encode(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        position_ids: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        r"""
        Returns:

        Example:

        ```
        >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration

        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")

        >>> text = "My friends are cool but they eat too many carbs."
        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
        >>> encoder_outputs = model.encode(**inputs)
        ```"""
        # Unset flags fall back to the values stored in the model config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Default mask: attend to every position. Default positions: 0..seq_len-1 per row.
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)
        if position_ids is None:
            batch_size, sequence_length = input_ids.shape
            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
            encode_module = module._get_encoder_module()
            return encode_module(input_ids, attention_mask, position_ids, **kwargs)

        return self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            position_ids=jnp.array(position_ids, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            method=_encoder_forward,  # run only the encoder sub-module
        )

    @add_start_docstrings(BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=BlenderbotSmallConfig
    )
    def decode(
        self,
        decoder_input_ids,
        encoder_outputs,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_position_ids: Optional[jnp.ndarray] = None,
        past_key_values: dict = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        r"""
        Returns:

        Example:

        ```python
        >>> import jax.numpy as jnp
        >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration

        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")

        >>> text = "My friends are cool but they eat too many carbs."
        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
        >>> encoder_outputs = model.encode(**inputs)

        >>> decoder_start_token_id = model.config.decoder_start_token_id
        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
        >>> last_decoder_hidden_states = outputs.last_hidden_state
        ```"""
        # NOTE(review): the original text of this method was truncated after its signature;
        # the body below follows the upstream Flax Bart-style implementation. Confirm against
        # the upstream source.

        # Unset flags fall back to the values stored in the model config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        encoder_hidden_states = encoder_outputs[0]
        if encoder_attention_mask is None:
            batch_size, sequence_length = encoder_hidden_states.shape[:2]
            encoder_attention_mask = jnp.ones((batch_size, sequence_length))

        batch_size, sequence_length = decoder_input_ids.shape
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones((batch_size, sequence_length))

        if decoder_position_ids is None:
            # With a cache the absolute positions cannot be derived from the input shape alone.
            if past_key_values is not None:
                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")

            decoder_position_ids = jnp.broadcast_to(
                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
            )

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        inputs = {"params": params or self.params}

        # If past_key_values are passed, the cache is already initialized. It must be supplied
        # as a variable collection and marked mutable so the attention modules can update it.
        if past_key_values:
            inputs["cache"] = past_key_values
            mutable = ["cache"]
        else:
            mutable = False

        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
            decoder_module = module._get_decoder_module()
            return decoder_module(
                decoder_input_ids,
                decoder_attention_mask,
                decoder_position_ids,
                **kwargs,
            )

        outputs = self.module.apply(
            inputs,
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            mutable=mutable,
            method=_decoder_forward,  # run only the decoder sub-module
        )

        # With a mutable cache, `apply` returns (outputs, mutated_variables); add the updated
        # cache to the model output.
        if past_key_values is not None and return_dict:
            outputs, past = outputs
            outputs["past_key_values"] = unfreeze(past["cache"])
            return outputs
        elif past_key_values is not None and not return_dict:
            outputs, past = outputs
            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]

        return outputs

    def __call__(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        decoder_input_ids: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        position_ids: Optional[jnp.ndarray] = None,
        decoder_position_ids: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        # Unset flags fall back to the values stored in the model config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Prepare encoder inputs
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)
        if position_ids is None:
            batch_size, sequence_length = input_ids.shape
            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

        # Prepare decoder inputs
        if decoder_input_ids is None:
            # Default decoder input: the encoder input shifted one position to the right.
            decoder_input_ids = shift_tokens_right(
                input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id
            )
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
        if decoder_position_ids is None:
            batch_size, sequence_length = decoder_input_ids.shape
            decoder_position_ids = jnp.broadcast_to(
                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
            )

        # Handle any PRNG if needed
        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}

        return self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            position_ids=jnp.array(position_ids, dtype="i4"),
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
        )
# Prepend the one-line model summary and the shared start docstring to the class docstring
@add_start_docstrings(
    "The bare BlenderbotSmall Model transformer outputting raw hidden-states without any specific head on top.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
# Bare encoder-decoder model: a FlaxBlenderbotSmallPreTrainedModel whose module is FlaxBlenderbotSmallModule
class FlaxBlenderbotSmallModel(FlaxBlenderbotSmallPreTrainedModel):
    # Model configuration
    config: BlenderbotSmallConfig
    # Computation dtype (defaults to jnp.float32)
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    # Flax module class instantiated by the base-class constructor
    module_class = FlaxBlenderbotSmallModule

# Call append_call_sample_docstring to add a sample-call docstring to FlaxBlenderbotSmallModel,
# passing the documented checkpoint, output class and config names
append_call_sample_docstring(FlaxBlenderbotSmallModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)


# Derived from transformers.models.bart.modeling_flax_bart.FlaxBartForConditionalGenerationModule (Bart -> BlenderbotSmall)
class FlaxBlenderbotSmallForConditionalGenerationModule(nn.Module):
    """
    BlenderbotSmall encoder-decoder backbone followed by a language-modeling projection.

    The projection turns the backbone's first output into vocabulary-sized logits and adds the
    `final_logits_bias` parameter to them (wrapped in `stop_gradient`).
    """

    config: BlenderbotSmallConfig
    dtype: jnp.dtype = jnp.float32
    # Initializer used for the `final_logits_bias` parameter.
    bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros

    def setup(self):
        """Create the backbone, the bias-free LM head and the logits bias parameter."""
        self.model = FlaxBlenderbotSmallModule(config=self.config, dtype=self.dtype)
        # Both the head's output width and the bias width follow the shared embedding table size.
        vocab_size = self.model.shared.num_embeddings
        self.lm_head = nn.Dense(
            vocab_size,
            use_bias=False,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.init_std),
        )
        self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, vocab_size))

    def _get_encoder_module(self):
        """Return the encoder sub-module of the backbone."""
        return self.model.encoder

    def _get_decoder_module(self):
        """Return the decoder sub-module of the backbone."""
        return self.model.decoder

    def __call__(
        self,
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask,
        position_ids,
        decoder_position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        """
        Run the backbone and project its first output to language-modeling logits.

        All arguments are forwarded unchanged to the backbone module.

        Returns:
            A `FlaxSeq2SeqLMOutput` when `return_dict` is true; otherwise a tuple whose first element is
            the logits, followed by the backbone outputs from index 1 onwards.
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            position_ids=position_ids,
            decoder_position_ids=decoder_position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )
        hidden_states = outputs[0]

        if not self.config.tie_word_embeddings:
            # Untied weights: use the head's own kernel.
            lm_logits = self.lm_head(hidden_states)
        else:
            # Tied weights: reuse the transposed shared embedding table as the projection kernel.
            shared_embedding = self.model.variables["params"]["shared"]["embedding"]
            lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)

        # The bias is added under stop_gradient, so no gradient flows into `final_logits_bias` here.
        logits_bias = jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))
        lm_logits = lm_logits + logits_bias

        if return_dict:
            return FlaxSeq2SeqLMOutput(
                logits=lm_logits,
                decoder_hidden_states=outputs.decoder_hidden_states,
                decoder_attentions=outputs.decoder_attentions,
                cross_attentions=outputs.cross_attentions,
                encoder_last_hidden_state=outputs.encoder_last_hidden_state,
                encoder_hidden_states=outputs.encoder_hidden_states,
                encoder_attentions=outputs.encoder_attentions,
            )

        # Tuple form: swap the backbone's first element for the logits and keep the rest.
        return (lm_logits,) + outputs[1:]
@add_start_docstrings(
    "The BLENDERBOT_SMALL Model with a language modeling head. Can be used for summarization.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class FlaxBlenderbotSmallForConditionalGeneration(FlaxBlenderbotSmallPreTrainedModel):
    # No class docstring is written here: the decorator above builds it.
    module_class = FlaxBlenderbotSmallForConditionalGenerationModule
    dtype: jnp.dtype = jnp.float32

    @add_start_docstrings(BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=BlenderbotSmallConfig)
    def decode(
        self,
        decoder_input_ids,
        encoder_outputs,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_position_ids: Optional[jnp.ndarray] = None,
        past_key_values: dict = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        deterministic: bool = True,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        """
        Decode `decoder_input_ids` conditioned on previously computed encoder outputs.

        NOTE: in this listing the method is a stub. It accepts the arguments below, uses none of them
        and returns `None`. The descriptions give the meaning suggested by the names and annotations.

        Args:
            decoder_input_ids: Decoder input token ids.
            encoder_outputs: Output of a previous encoder pass.
            encoder_attention_mask: Optional mask over the encoder output positions.
            decoder_attention_mask: Optional mask over the decoder input positions.
            decoder_position_ids: Optional position ids for the decoder input.
            past_key_values: Optional dictionary of cached key/value states.
            output_attentions: Whether attentions are requested.
            output_hidden_states: Whether hidden states are requested.
            return_dict: Whether a structured output is requested instead of a tuple.
            deterministic: Flag for deterministic execution.
            params: Optional model parameters.
            dropout_rng: PRNG key for dropout.

        Returns:
            FlaxCausalLMOutputWithCrossAttentions: the documented output type (the stub itself returns `None`).
        """
        # The implementation is not part of this listing; the stub returns None.
        return None

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        max_length,
        attention_mask: Optional[jax.Array] = None,
        decoder_attention_mask: Optional[jax.Array] = None,
        encoder_outputs=None,
        **kwargs,
    ):
        """
        Build the keyword arguments used for the first decoding step of generation.

        Args:
            decoder_input_ids: Decoder input ids; only the `(batch_size, seq_length)` shape is read.
            max_length: Length of the cache and of the static decoder attention mask.
            attention_mask: Passed through unchanged as `encoder_attention_mask`.
            decoder_attention_mask: Optional decoder mask; when given, it is written into the start of
                the static mask and position ids are derived from its cumulative sum.
            encoder_outputs: Passed to `init_cache` and returned unchanged.
            **kwargs: Accepted and ignored.

        Returns:
            dict: `past_key_values`, `encoder_outputs`, `encoder_attention_mask`,
            `decoder_attention_mask` and `decoder_position_ids`.
        """
        batch_size, seq_length = decoder_input_ids.shape

        # Initialize the decoding cache.
        cache = self.init_cache(batch_size, max_length, encoder_outputs)

        # Usually the attention mask would need 0s for positions beyond the current input and inside the
        # cache length. The decoder applies a causal mask, so those positions are masked anyway and a
        # single static all-ones mask of width `max_length` suffices (better for compilation).
        static_mask = jnp.ones((batch_size, max_length), dtype="i4")

        if decoder_attention_mask is None:
            # Without a mask, positions are 0..seq_length-1 for every row.
            decoder_positions = jnp.broadcast_to(
                jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)
            )
        else:
            # Position of each token = number of attended tokens up to and including it, minus one.
            decoder_positions = decoder_attention_mask.cumsum(axis=-1) - 1
            # Copy the provided mask into the leading columns of the static mask.
            static_mask = lax.dynamic_update_slice(static_mask, decoder_attention_mask, (0, 0))

        return {
            "past_key_values": cache,
            "encoder_outputs": encoder_outputs,
            "encoder_attention_mask": attention_mask,
            "decoder_attention_mask": static_mask,
            "decoder_position_ids": decoder_positions,
        }

    def update_inputs_for_generation(self, model_outputs, model_kwargs):
        """
        Advance the generation keyword arguments by one decoding step.

        Args:
            model_outputs: Outputs of the last step; `past_key_values` is read from it.
            model_kwargs: Keyword arguments of the last step; updated in place and returned.

        Returns:
            dict: `model_kwargs` with the new cache and with `decoder_position_ids` reduced to the last
            position plus one.
        """
        model_kwargs["past_key_values"] = model_outputs.past_key_values
        # Keep only the last column and move it forward by one position.
        last_position = model_kwargs["decoder_position_ids"][:, -1:]
        model_kwargs["decoder_position_ids"] = last_position + 1
        return model_kwargs
    # 导入所需的库和模型
    >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration
    
    # 使用预训练的 Blenderbot 模型初始化生成模型
    >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
    # 使用预训练的 tokenizer 初始化分词器
    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")
    
    # 待总结的文章内容
    >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
    # 使用 tokenizer 处理文章，限定最大长度为 1024，并转换为 NumPy 数据结构
    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="np")
    
    # 生成摘要
    >>> summary_ids = model.generate(inputs["input_ids"]).sequences
    # 解码生成的摘要内容，去除特殊标记并保留原始分词方式
    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
    
    # 掩码填充示例：
    
    >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration
    
    # 使用预训练的 tokenizer 初始化分词器
    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M")
    # 待处理的文本带有掩码标记
    >>> TXT = "My friends are <mask> but they eat too many carbs."
    
    # 使用预训练的 Blenderbot 模型初始化生成模型
    >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
    # 将文本转换为输入的 token IDs，并转换为 NumPy 数据结构
    >>> input_ids = tokenizer([TXT], return_tensors="np")["input_ids"]
    # 获取模型的 logits
    >>> logits = model(input_ids).logits
    
    # 确定掩码位置的索引
    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()
    # 对 logits 应用 softmax 函数，沿着指定的轴计算概率
    >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0)
    # 获取概率最高的前 k 个预测结果和它们的值
    >>> values, predictions = jax.lax.top_k(probs, k=1)
    
    # 解码预测结果并按空格分割成单词列表
    >>> tokenizer.decode(predictions).split()
"""
给 FlaxBlenderbotSmallForConditionalGeneration 类的调用覆盖文档字符串，
使用 BLENDERBOT_SMALL_INPUTS_DOCSTRING 和 FLAX_BLENDERBOT_SMALL_CONDITIONAL_GENERATION_DOCSTRING 进行扩展。
"""
# Overwrite the call docstring of FlaxBlenderbotSmallForConditionalGeneration with the shared inputs
# docstring followed by the conditional-generation examples.
overwrite_call_docstring(
    FlaxBlenderbotSmallForConditionalGeneration,
    BLENDERBOT_SMALL_INPUTS_DOCSTRING + FLAX_BLENDERBOT_SMALL_CONDITIONAL_GENERATION_DOCSTRING,
)

"""
为 FlaxBlenderbotSmallForConditionalGeneration 类附加或替换返回文档字符串，
设置输出类型为 FlaxSeq2SeqLMOutput，配置类为 _CONFIG_FOR_DOC。
"""
# Fill in the return section of the class's call docstring, using FlaxSeq2SeqLMOutput as the output
# type and the documentation config name as the config class.
append_replace_return_docstrings(
    FlaxBlenderbotSmallForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC
)

`.\models\blenderbot_small\modeling_tf_blenderbot_small.py`

# coding=utf-8
# 版权所有 2021 年 Facebook, Inc 和 HuggingFace Inc. 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本（“许可证”）许可；
# 除非符合许可证的规定，否则您不能使用此文件。
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则软件是基于“按原样”分发的，
# 没有任何明示或暗示的担保或条件。
# 有关特定语言的详情，请参阅许可证。
""" TF 2.0 BlenderbotSmall 模型。"""


from __future__ import annotations

import random  # 导入随机数模块
from typing import List, Optional, Tuple, Union  # 导入类型提示模块

import numpy as np  # 导入 NumPy 库
import tensorflow as tf  # 导入 TensorFlow 库

from ...activations_tf import get_tf_activation  # 从本地导入 TensorFlow 激活函数
from ...modeling_tf_outputs import (  # 从本地导入 TensorFlow 模型输出类
    TFBaseModelOutput,
    TFBaseModelOutputWithPastAndCrossAttentions,
    TFSeq2SeqLMOutput,
    TFSeq2SeqModelOutput,
)

# 公共 API
from ...modeling_tf_utils import (  # 从本地导入 TensorFlow 模型工具类和函数
    TFCausalLanguageModelingLoss,
    TFPreTrainedModel,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax  # 从本地导入 TensorFlow 工具函数
from ...utils import (  # 从本地导入通用工具函数
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_blenderbot_small import BlenderbotSmallConfig  # 从本地导入 BlenderbotSmall 配置类


logger = logging.get_logger(__name__)  # module-level logger


_CHECKPOINT_FOR_DOC = "facebook/blenderbot_small-90M"  # checkpoint name referenced in documentation
_CONFIG_FOR_DOC = "BlenderbotSmallConfig"  # config class name referenced in documentation


LARGE_NEGATIVE = -1e8  # additive value placed on masked attention positions by the mask helpers below


# Derived from transformers.models.bart.modeling_tf_bart.shift_tokens_right
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift `input_ids` one position to the right and prepend `decoder_start_token_id`.

    The last column of `input_ids` is dropped, any `-100` left in the shifted result is replaced by
    `pad_token_id`, and a runtime assertion checks that every resulting id is >= 0.

    Args:
        input_ids: 2-D tensor of token ids (indexed as `[:, :-1]`).
        pad_token_id: Replacement value for `-100` entries.
        decoder_start_token_id: Value written into the first column.

    Returns:
        The shifted ids, with the same dtype as `input_ids`.
    """
    ids_dtype = input_ids.dtype
    pad_value = tf.cast(pad_token_id, ids_dtype)
    start_value = tf.cast(decoder_start_token_id, ids_dtype)

    # One start token per row.
    batch_size = shape_list(input_ids)[0]
    first_column = tf.fill((batch_size, 1), tf.convert_to_tensor(start_value, ids_dtype))
    shifted = tf.concat([first_column, input_ids[:, :-1]], -1)

    # Replace possible -100 values in labels by the pad token id.
    is_ignored = shifted == -100
    pad_fill = tf.fill(shape_list(shifted), tf.convert_to_tensor(pad_value, ids_dtype))
    shifted = tf.where(is_ignored, pad_fill, shifted)

    # "Verify that `labels` has only positive values and -100"
    non_negative = tf.debugging.assert_greater_equal(shifted, tf.constant(0, dtype=ids_dtype))

    # Wrap the result in an identity op so the assertion is actually executed.
    with tf.control_dependencies([non_negative]):
        return tf.identity(shifted)


# Derived from transformers.models.bart.modeling_tf_bart._make_causal_mask
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
    """
    Build an additive causal attention mask.

    Entry `[i, j]` of the square mask is `0.0` when `j <= i` (the diagonal and everything below it)
    and `LARGE_NEGATIVE` when `j > i`. When `past_key_values_length > 0`, that many all-zero columns
    are prepended. The result is tiled to `(bsz, 1, tgt_len, past_key_values_length + tgt_len)`.
    """
    bsz, tgt_len = input_ids_shape[0], input_ids_shape[1]

    # Start with every position blocked.
    mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
    column_index = tf.range(shape_list(mask)[-1])
    # Column vector of (row index + 1); comparing against it opens the positions with j < i + 1.
    row_limit = tf.reshape(column_index + 1, (shape_list(mask)[-1], 1))
    mask = tf.where(column_index < row_limit, 0.0, mask)

    if past_key_values_length > 0:
        # Cached positions are always visible: prepend zero columns for them.
        past_columns = tf.zeros((tgt_len, past_key_values_length))
        mask = tf.concat([past_columns, mask], axis=-1)

    # Add batch and head axes, repeating the mask along the batch axis.
    return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))


# Derived from transformers.models.bart.modeling_tf_bart._expand_mask
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.

    The mask is cast to float, repeated `tgt_len` times along a new target axis and converted to an
    additive mask: positions where the input is 1 become 0, positions where it is 0 become
    `LARGE_NEGATIVE`. `tgt_len` defaults to the source length.
    """
    src_len = shape_list(mask)[1]
    if tgt_len is None:
        tgt_len = src_len

    one = tf.constant(1.0)
    # Cast to the dtype of the float constant so the subtraction below is well-typed.
    float_mask = tf.cast(mask, dtype=one.dtype)
    tiled = tf.tile(float_mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (one - tiled) * LARGE_NEGATIVE


# Derived from transformers.models.blenderbot.modeling_tf_blenderbot.TFBlenderbotLearnedPositionalEmbedding (Blenderbot -> BlenderbotSmall)
class TFBlenderbotSmallLearnedPositionalEmbedding(keras.layers.Embedding):
    """
    Learned positional embeddings, implemented as an embedding table with `num_embeddings` rows.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
        super().__init__(num_embeddings, embedding_dim, **kwargs)

    def call(
        self, input_shape: tf.TensorShape, past_key_values_length: int = 0, position_ids: tf.Tensor | None = None
    ):
        """
        Look up position embeddings. `input_shape` is expected to be `[bsz x seqlen]`.

        When `position_ids` is given it is used directly. Otherwise positions
        `past_key_values_length .. past_key_values_length + seqlen - 1` are generated.
        """
        if position_ids is not None:
            return super().call(tf.cast(position_ids, dtype=tf.int32))

        seq_len = input_shape[1]
        # Offset the fresh range by the number of already-cached positions.
        generated_ids = tf.range(seq_len, delta=1, name="range") + past_key_values_length
        return super().call(tf.cast(generated_ids, dtype=tf.int32))


# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->BlenderbotSmall
class TFBlenderbotSmallAttention(keras.layers.Layer):
    """Multi-headed attention from "Attention Is All You Need"""

    # Constructor: stores the dimensions, validates that `embed_dim` splits evenly across heads and
    # creates the four projection layers plus the dropout layer.
    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = keras.layers.Dropout(dropout)
        self.head_dim = embed_dim // num_heads  # per-head dimension
        # Integer division above truncates; reject configurations where it is not exact.
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim)
        self.is_decoder = is_decoder

        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")  # key projection
        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")  # query projection
        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")  # value projection
        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")  # output projection

    # Reshape (bsz, seq_len, num_heads * head_dim) into (bsz, num_heads, seq_len, head_dim).
    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))

    def call(
        self,
        hidden_states: tf.Tensor,
        key_value_states: tf.Tensor | None = None,
        past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
        attention_mask: tf.Tensor | None = None,
        layer_head_mask: tf.Tensor | None = None,
        training: Optional[bool] = False,
    ):
        # Forward pass of the attention layer.
        # NOTE(review): the body of this method is absent from this listing; only the signature is shown.
        # hidden_states: input hidden-states tensor
        # key_value_states: optional key/value states tensor
        # past_key_value: optional cached key/value tensors
        # attention_mask: optional attention mask tensor
        # layer_head_mask: optional per-head mask tensor for this layer
        # training: optional training-mode flag

    # Build the four projection layers explicitly, each under its own name scope and with an input
    # whose last dimension is `embed_dim`. Returns immediately if the layer is already built.
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "k_proj", None) is not None:
            with tf.name_scope(self.k_proj.name):
                self.k_proj.build([None, None, self.embed_dim])  # key projection
        if getattr(self, "q_proj", None) is not None:
            with tf.name_scope(self.q_proj.name):
                self.q_proj.build([None, None, self.embed_dim])  # query projection
        if getattr(self, "v_proj", None) is not None:
            with tf.name_scope(self.v_proj.name):
                self.v_proj.build([None, None, self.embed_dim])  # value projection
        if getattr(self, "out_proj", None) is not None:
            with tf.name_scope(self.out_proj.name):
                self.out_proj.build([None, None, self.embed_dim])  # output projection
# Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall

class TFBlenderbotSmallEncoderLayer(keras.layers.Layer):
    """
    Single encoder layer: self-attention followed by a two-layer feed-forward network. Each sub-block
    is wrapped as dropout -> residual add -> layer normalization.
    """

    def __init__(self, config: BlenderbotSmallConfig, **kwargs):
        super().__init__(**kwargs)
        # Model (embedding) dimension.
        self.embed_dim = config.d_model
        # Self-attention sub-layer.
        self.self_attn = TFBlenderbotSmallAttention(
            self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
        )
        # Layer normalization applied after the self-attention residual add.
        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        # Dropout shared by the attention output and the feed-forward output.
        self.dropout = keras.layers.Dropout(config.dropout)
        # Activation function selected by name from the config.
        self.activation_fn = get_tf_activation(config.activation_function)
        # Dropout applied right after the activation.
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
        # Feed-forward expansion: embed_dim -> encoder_ffn_dim.
        self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
        # Feed-forward projection back: encoder_ffn_dim -> embed_dim.
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
        # Layer normalization applied after the feed-forward residual add.
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        # Kept so that `build` can read `encoder_ffn_dim`.
        self.config = config

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: np.ndarray | tf.Tensor | None,
        layer_head_mask: tf.Tensor | None,
        training: Optional[bool] = False,
    ) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding
                elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`
            training (`bool`, *optional*): forwarded to the dropout layers.

        Returns:
            A tuple `(hidden_states, self_attn_weights)`. The second element is whatever the self-attention
            layer returns as its second output (annotated as a tensor here; the attention implementation
            is not visible in this listing, so treat that as an assumption).
        """
        # Keep the input for the first residual connection.
        residual = hidden_states
        # Self-attention; its third output is discarded.
        hidden_states, self_attn_weights, _ = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask
        )

        # The residual add below requires that attention preserved the input shape.
        tf.debugging.assert_equal(
            shape_list(hidden_states),
            shape_list(residual),
            message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
        )

        # Dropout -> residual add -> layer norm.
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Keep the normalized attention output for the second residual connection.
        residual = hidden_states
        # Feed-forward network: fc1 -> activation -> activation dropout -> fc2.
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = self.activation_dropout(hidden_states, training=training)
        hidden_states = self.fc2(hidden_states)
        # Dropout -> residual add -> layer norm.
        hidden_states = self.dropout(hidden_states, training=training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        # Return the layer output together with the self-attention weights.
        return hidden_states, self_attn_weights
    # Build every sub-layer explicitly under its own name scope; returns immediately if already built.
    def build(self, input_shape=None):
        if self.built:
            return
        # Mark the layer as built before building the children.
        self.built = True

        # Self-attention sub-layer (builds its own projections).
        if getattr(self, "self_attn", None) is not None:
            with tf.name_scope(self.self_attn.name):
                self.self_attn.build(None)

        # Layer norm after self-attention; last input dimension is embed_dim.
        if getattr(self, "self_attn_layer_norm", None) is not None:
            with tf.name_scope(self.self_attn_layer_norm.name):
                self.self_attn_layer_norm.build([None, None, self.embed_dim])

        # First feed-forward layer; consumes embed_dim features.
        if getattr(self, "fc1", None) is not None:
            with tf.name_scope(self.fc1.name):
                self.fc1.build([None, None, self.embed_dim])

        # Second feed-forward layer; consumes the encoder_ffn_dim features produced by fc1.
        if getattr(self, "fc2", None) is not None:
            with tf.name_scope(self.fc2.name):
                self.fc2.build([None, None, self.config.encoder_ffn_dim])

        # Final layer norm; last input dimension is embed_dim.
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.embed_dim])
# Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallDecoderLayer(keras.layers.Layer):
    def __init__(self, config: BlenderbotSmallConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.d_model  # model (embedding) dimension
        self.self_attn = TFBlenderbotSmallAttention(
            embed_dim=self.embed_dim,  # same width as the model dimension
            num_heads=config.decoder_attention_heads,  # number of decoder attention heads
            dropout=config.attention_dropout,  # dropout rate handed to the attention layer
            name="self_attn",  # layer name
            is_decoder=True,  # flag this attention as belonging to the decoder
        )
        self.dropout = keras.layers.Dropout(config.dropout)  # dropout with the configured rate
        self.activation_fn = get_tf_activation(config.activation_function)  # activation selected by name
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)  # dropout for the activation output

        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
        # Layer normalization associated with the self-attention sub-block.

        self.encoder_attn = TFBlenderbotSmallAttention(
            self.embed_dim,  # same width as the model dimension
            config.decoder_attention_heads,  # number of decoder attention heads
            dropout=config.attention_dropout,  # dropout rate handed to the attention layer
            name="encoder_attn",  # layer name
            is_decoder=True,  # flag this attention as belonging to the decoder
        )
        self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
        # Layer normalization associated with the encoder-attention sub-block.

        self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")  # feed-forward expansion to decoder_ffn_dim
        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")  # feed-forward projection back to embed_dim
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
        # Final layer normalization.

        self.config = config  # kept so that `build` can read `decoder_ffn_dim`

    # NOTE(review): in this listing the `call` signature below is cut off before its closing
    # parenthesis and the method body is absent.
    def call(
        self,
        hidden_states: tf.Tensor,  # input hidden states
        attention_mask: np.ndarray | tf.Tensor | None = None,  # optional attention mask
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,  # optional encoder hidden states
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,  # optional encoder attention mask
        layer_head_mask: tf.Tensor | None = None,  # optional per-head mask for self-attention
        cross_attn_layer_head_mask: tf.Tensor | None = None,  # optional per-head mask for cross-attention
        past_key_value: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,  # optional cached key/value tuples
        training: Optional[bool] = False,  # optional training flag
    # Build every sub-layer explicitly under its own name scope; returns immediately if already built.
    def build(self, input_shape=None):
        if self.built:
            return
        # Mark the layer as built before building the children.
        self.built = True

        # Self-attention sub-layer.
        if getattr(self, "self_attn", None) is not None:
            with tf.name_scope(self.self_attn.name):
                self.self_attn.build(None)

        # Layer norm for the self-attention sub-block; last input dimension is embed_dim.
        if getattr(self, "self_attn_layer_norm", None) is not None:
            with tf.name_scope(self.self_attn_layer_norm.name):
                self.self_attn_layer_norm.build([None, None, self.embed_dim])

        # Encoder-decoder attention sub-layer.
        if getattr(self, "encoder_attn", None) is not None:
            with tf.name_scope(self.encoder_attn.name):
                self.encoder_attn.build(None)

        # Layer norm for the encoder-attention sub-block; last input dimension is embed_dim.
        if getattr(self, "encoder_attn_layer_norm", None) is not None:
            with tf.name_scope(self.encoder_attn_layer_norm.name):
                self.encoder_attn_layer_norm.build([None, None, self.embed_dim])

        # First feed-forward layer; consumes embed_dim features.
        if getattr(self, "fc1", None) is not None:
            with tf.name_scope(self.fc1.name):
                self.fc1.build([None, None, self.embed_dim])

        # Second feed-forward layer; consumes the decoder_ffn_dim features produced by fc1.
        if getattr(self, "fc2", None) is not None:
            with tf.name_scope(self.fc2.name):
                self.fc2.build([None, None, self.config.decoder_ffn_dim])

        # Final layer norm; last input dimension is embed_dim.
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.embed_dim])
# Base class for the TF BlenderbotSmall models; subclasses TFPreTrainedModel.
class TFBlenderbotSmallPreTrainedModel(TFPreTrainedModel):
    # Configuration class associated with these models.
    config_class = BlenderbotSmallConfig
    # Base-model prefix string, set to "model".
    base_model_prefix = "model"
    >>> UTTERANCE = "My friends are cool but they eat too many carbs."
    >>> print("Human: ", UTTERANCE)
    打印出人类的发言
    
    >>> inputs = tokenizer([UTTERANCE], return_tensors="tf")
    使用分词器对发言进行处理，返回模型输入的张量表示
    
    >>> reply_ids = model.generate(**inputs)
    使用模型生成回复
    
    >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
    打印出生成的机器人回复，跳过特殊标记后的解码结果
    
    >>> REPLY = "I'm not sure"
    >>> print("Human: ", REPLY)
    打印出人类的回复
    
    >>> NEXT_UTTERANCE = (
    ...     "My friends are cool but they eat too many carbs.</s> "
    ...     "<s>what kind of carbs do they eat? i don't know much about carbs.</s> "
    ...     "<s>I'm not sure."
    ... )
    设置下一轮对话的文本
    
    >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="tf")
    使用分词器处理下一轮对话文本，返回模型输入的张量表示
    
    >>> inputs.pop("token_type_ids")
    移除张量表示中的token_type_ids（标记类型标识符）
    
    >>> next_reply_ids = model.generate(**inputs)
    使用模型生成下一轮对话的回复
    
    >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
    打印出生成的机器人回复，跳过特殊标记后的解码结果
"""

BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
"""


@keras_serializable
class TFBlenderbotSmallEncoder(keras.layers.Layer):
    config_class = BlenderbotSmallConfig
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`TFBlenderbotSmallEncoderLayer`].

    Args:
        config: BlenderbotSmallConfig
    """

    def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.dropout = keras.layers.Dropout(config.dropout)  # dropout layer using the configured dropout rate
        self.layerdrop = config.encoder_layerdrop  # encoder layerdrop value taken from the config
        self.padding_idx = config.pad_token_id  # id of the padding token taken from the config
        self.max_source_positions = config.max_position_embeddings  # maximum position-embedding length
        self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0  # sqrt(d_model) when scale_embedding is set, else 1.0

        self.embed_tokens = embed_tokens  # token embedding layer supplied by the caller (may be None)
        self.embed_positions = TFBlenderbotSmallLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            name="embed_positions",
        )  # learned positional embedding
        self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]  # one encoder layer per configured layer
        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")  # layer norm for the embeddings
        self.embed_dim = config.d_model  # embedding dimension, used when building sub-layers

    def get_embed_tokens(self):
        return self.embed_tokens  # return the current token embedding layer

    def set_embed_tokens(self, embed_tokens):
        self.embed_tokens = embed_tokens  # replace the token embedding layer

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        inputs_embeds=None,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """
        Layer `call` method, i.e. the forward pass of the encoder.

        Args:
            input_ids: input token ids
            inputs_embeds: embedded representation of the inputs
            attention_mask: attention mask
            head_mask: mask for the attention heads
            output_attentions: whether to output attention weights
            output_hidden_states: whether to output hidden states
            return_dict: whether to return a dict-style result
            training: whether the layer runs in training mode

        Returns:
            Not shown here: the implementation is omitted in this excerpt.
        """
        # Implementation details omitted; this is where the forward-pass logic belongs.

    def build(self, input_shape=None):
        # Build only once.
        if self.built:
            return
        self.built = True
        if getattr(self, "embed_positions", None) is not None:
            with tf.name_scope(self.embed_positions.name):
                self.embed_positions.build(None)  # build the positional embedding
        if getattr(self, "layernorm_embedding", None) is not None:
            with tf.name_scope(self.layernorm_embedding.name):
                self.layernorm_embedding.build([None, None, self.embed_dim])  # build the embedding layer norm
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)  # build each encoder layer under its own name scope


@keras_serializable
class TFBlenderbotSmallDecoder(keras.layers.Layer):
    config_class = BlenderbotSmallConfig
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotSmallDecoderLayer`]

    Args:
        config: BlenderbotSmallConfig
        embed_tokens: output embedding
    """
    # Initialise the decoder from the given config and (optional) token embedding layer.
    def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
        super().__init__(**kwargs)
        # Keep the config on the instance.
        self.config = config
        # Padding index is the pad token id from the config.
        self.padding_idx = config.pad_token_id
        # Token embedding layer supplied by the caller (may be None).
        self.embed_tokens = embed_tokens
        # Decoder layerdrop value taken from the config.
        self.layerdrop = config.decoder_layerdrop
        # Learned positional embedding sized by max_position_embeddings and d_model.
        self.embed_positions = TFBlenderbotSmallLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
            name="embed_positions",
        )
        # Embedding scale: sqrt(d_model) when config.scale_embedding is set, otherwise 1.0.
        self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
        # One decoder layer per configured decoder layer.
        self.layers = [TFBlenderbotSmallDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
        # Layer normalisation for the embeddings.
        self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")

        # Dropout layer using the configured dropout rate.
        self.dropout = keras.layers.Dropout(config.dropout)

    # Return the current token embedding layer.
    def get_embed_tokens(self):
        return self.embed_tokens

    # Replace the token embedding layer.
    def set_embed_tokens(self, embed_tokens):
        self.embed_tokens = embed_tokens

    # `unpack_inputs` pre-processes the arguments handed to `call`.
    @unpack_inputs
    # Forward pass of the decoder.
    def call(
        self,
        input_ids=None,
        inputs_embeds=None,
        attention_mask=None,
        position_ids=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """
        Forward pass of the decoder.

        The method body is not included in this excerpt; only the signature is shown.
        """
        # The concrete implementation of the body is described later in the annotated source.


注释：
@keras_serializable
class TFBlenderbotSmallMainLayer(keras.layers.Layer):
    """
    Encoder-decoder core used by the TF BlenderbotSmall model classes.

    Holds a single `keras.layers.Embedding` (`self.shared`) that is passed to both the encoder and the decoder
    as their token embedding layer.

    Args:
        config: BlenderbotSmallConfig
    """

    # Configuration class bound to this layer.
    config_class = BlenderbotSmallConfig

    def __init__(self, config: BlenderbotSmallConfig, **kwargs):
        super().__init__(**kwargs)

        self.config = config

        # Shared embedding layer sized by the vocabulary and the model dimension.
        self.shared = keras.layers.Embedding(
            input_dim=config.vocab_size,
            output_dim=config.d_model,
            embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
            name="model.shared",
        )
        # Extra attribute giving the name scope the layer is expected under (used when loading/storing weights).
        self.shared.load_weight_prefix = "model.shared"

        # Encoder and decoder both receive the shared embedding layer.
        self.encoder = TFBlenderbotSmallEncoder(config, self.shared, name="encoder")
        self.decoder = TFBlenderbotSmallDecoder(config, self.shared, name="decoder")

    def get_input_embeddings(self):
        """Return the shared embedding layer."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the shared embedding layer and re-point the encoder and decoder at it."""
        self.shared = new_embeddings
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    # `unpack_inputs` resolves the boolean flags (e.g. `return_dict`) against the config before the body runs,
    # matching the other `call` methods in this file.
    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_position_ids=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
        **kwargs,
    ):
        """
        Run the encoder (unless `encoder_outputs` is supplied) and then the decoder.

        Returns:
            With a falsy `return_dict`, the decoder outputs concatenated with the encoder outputs; otherwise a
            `TFSeq2SeqModelOutput` assembled from both.
        """
        # Fall back to the config default when the flag is not given.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # No pre-computed encoder outputs: run the encoder.
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                training=training,
            )
        # return_dict=True but a plain sequence was passed: wrap it in a TFBaseModelOutput.
        elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput):
            encoder_outputs = TFBaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )
        # return_dict=False but a non-tuple output object was passed: convert it to a tuple.
        elif not return_dict and not isinstance(encoder_outputs, tuple):
            encoder_outputs = encoder_outputs.to_tuple()

        # Decode, feeding the encoder's first output as the encoder hidden states and the encoder
        # attention mask as the cross-attention mask.
        decoder_outputs = self.decoder(
            decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Tuple mode: decoder outputs followed by encoder outputs.
        if not return_dict:
            return decoder_outputs + encoder_outputs

        # Dict mode: map decoder and encoder outputs onto the TFSeq2SeqModelOutput fields.
        return TFSeq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the shared embedding, the encoder and the decoder exactly once."""
        if self.built:
            return
        self.built = True

        # The shared/tied weights are expected to be in the model base namespace.
        # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
        # the current one.
        with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
            self.shared.build(None)

        # Build the encoder under its own name scope, if present.
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)

        # Build the decoder under its own name scope, if present.
        if getattr(self, "decoder", None) is not None:
            with tf.name_scope(self.decoder.name):
                self.decoder.build(None)
# TFBlenderbotSmallModel: the bare BLENDERBOT_SMALL model that outputs raw hidden states with no head on top.
# The class docstring is supplied by the `add_start_docstrings` decorator.
@add_start_docstrings(
    "The bare BLENDERBOT_SMALL Model outputting raw hidden-states without any specific head on top.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel):
    def __init__(self, config: BlenderbotSmallConfig, *inputs, **kwargs):
        # Parent constructor receives the config plus any extra arguments untouched.
        super().__init__(config, *inputs, **kwargs)

        # All of the work is delegated to the main encoder-decoder layer.
        self.model = TFBlenderbotSmallMainLayer(config, name="model")

    def get_encoder(self):
        # Encoder of the wrapped main layer.
        main_layer = self.model
        return main_layer.encoder

    def get_decoder(self):
        # Decoder of the wrapped main layer.
        main_layer = self.model
        return main_layer.decoder

    # Forward pass: every argument is handed to the main layer by keyword and its result is returned as is.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSeq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        decoder_input_ids: tf.Tensor | None = None,
        decoder_attention_mask: tf.Tensor | None = None,
        decoder_position_ids: tf.Tensor | None = None,
        head_mask: tf.Tensor | None = None,
        decoder_head_mask: tf.Tensor | None = None,
        cross_attn_head_mask: tf.Tensor | None = None,
        encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None,
        past_key_values: List[tf.Tensor] | None = None,
        inputs_embeds: tf.Tensor | None = None,
        decoder_inputs_embeds: tf.Tensor | None = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
        **kwargs,
    ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]:
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    # Copied from transformers.models.bart.modeling_tf_bart.TFBartModel.serving_output
    def serving_output(self, output):
        # Repackage `output` as a TFSeq2SeqModelOutput, keeping only the optional fields the config enables.
        cfg = self.config

        def _tensor_if(enabled, field_name):
            # The attribute is read and converted only when the matching config flag is set.
            if not enabled:
                return None
            return tf.convert_to_tensor(getattr(output, field_name))

        # Cached key/values: element at index 1 of the tf.tuple result, only when caching is enabled.
        if cfg.use_cache:
            cached = tf.tuple(output.past_key_values)[1]
        else:
            cached = None

        decoder_states = _tensor_if(cfg.output_hidden_states, "decoder_hidden_states")
        decoder_attn = _tensor_if(cfg.output_attentions, "decoder_attentions")
        cross_attn = _tensor_if(cfg.output_attentions, "cross_attentions")
        encoder_states = _tensor_if(cfg.output_hidden_states, "encoder_hidden_states")
        encoder_attn = _tensor_if(cfg.output_attentions, "encoder_attentions")

        return TFSeq2SeqModelOutput(
            last_hidden_state=output.last_hidden_state,
            past_key_values=cached,
            decoder_hidden_states=decoder_states,
            decoder_attentions=decoder_attn,
            cross_attentions=cross_attn,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=encoder_states,
            encoder_attentions=encoder_attn,
        )

    def build(self, input_shape=None):
        # Build once; later calls are no-ops.
        if self.built:
            return
        self.built = True

        main_layer = getattr(self, "model", None)
        if main_layer is None:
            return
        # The main layer is built under its own name scope.
        with tf.name_scope(main_layer.name):
            main_layer.build(None)
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(keras.layers.Layer):
    """
    A bias term wrapped in its own layer. This exists for serialization: `keras.Model.save_weights` stores
    weights layer by layer, so every weight has to be registered in some layer.
    """

    def __init__(self, shape, initializer, trainable, name, **kwargs):
        super().__init__(name=name, **kwargs)
        # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of
        # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see:
        # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
        weight_spec = {
            "name": name,
            "shape": shape,
            "initializer": initializer,
            "trainable": trainable,
        }
        # The bias weight reuses the layer's own name.
        self.bias = self.add_weight(**weight_spec)

    def call(self, x):
        # Add the bias to the incoming tensor.
        shifted = x + self.bias
        return shifted


# BLENDERBOT_SMALL model with a language-modeling head: the main encoder-decoder layer plus a
# non-trainable final-logits bias. The class docstring is supplied by `add_start_docstrings`.
@add_start_docstrings(
    "The BLENDERBOT_SMALL Model with a language modeling head. Can be used for summarization.",
    BLENDERBOT_SMALL_START_DOCSTRING,
)
class TFBlenderbotSmallForConditionalGeneration(TFBlenderbotSmallPreTrainedModel, TFCausalLanguageModelingLoss):
    _keys_to_ignore_on_load_unexpected = [
        r"model.encoder.embed_tokens.weight",
        r"model.decoder.embed_tokens.weight",
    ]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # Main encoder-decoder layer, named "model".
        self.model = TFBlenderbotSmallMainLayer(config, name="model")
        # Whether caching is enabled, taken from the config.
        self.use_cache = config.use_cache
        # Bias added to the final logits; registered as a layer and kept non-trainable.
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
        )

    def get_decoder(self):
        """Return the decoder of the main layer."""
        return self.model.decoder

    def get_encoder(self):
        """Return the encoder of the main layer."""
        return self.model.encoder

    def get_output_embeddings(self):
        """Return the output embeddings by delegating to `get_input_embeddings()`."""
        return self.get_input_embeddings()

    def set_output_embeddings(self, value):
        """Set the output embeddings by delegating to `set_input_embeddings()`."""
        self.set_input_embeddings(value)

    def get_bias(self):
        """Return a dict mapping "final_logits_bias" to the bias variable."""
        return {"final_logits_bias": self.bias_layer.bias}

    def set_bias(self, value):
        """Replace the bias layer with one sized to `value["final_logits_bias"]` and copy that value into it."""
        # Replace the existing layer containing the bias so that (de)serialization stays correct.
        vocab_size = value["final_logits_bias"].shape[-1]
        self.bias_layer = BiasLayer(
            name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
        )
        # Copy the new bias values into the freshly created layer.
        self.bias_layer.bias.assign(value["final_logits_bias"])

    # Forward pass. The docstring below is post-processed by the decorators, so it is kept as is.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(BLENDERBOT_SMALL_GENERATION_EXAMPLE)
    def call(
        self,
        # Identifiers of the encoder input sequence.
        input_ids: tf.Tensor | None = None,
        # Attention mask for the encoder input.
        attention_mask: tf.Tensor | None = None,
        # Identifiers of the decoder input sequence.
        decoder_input_ids: tf.Tensor | None = None,
        # Attention mask for the decoder input.
        decoder_attention_mask: tf.Tensor | None = None,
        # Position ids for the decoder input.
        decoder_position_ids: tf.Tensor | None = None,
        # Mask selecting which encoder attention heads are masked.
        head_mask: tf.Tensor | None = None,
        # Mask selecting which decoder attention heads are masked.
        decoder_head_mask: tf.Tensor | None = None,
        # Mask selecting which cross-attention heads are masked.
        cross_attn_head_mask: tf.Tensor | None = None,
        # Optional pre-computed encoder outputs.
        encoder_outputs: Optional[TFBaseModelOutput] = None,
        # Cached decoder key/values from earlier steps.
        past_key_values: List[tf.Tensor] | None = None,
        # Embedded encoder inputs.
        inputs_embeds: tf.Tensor | None = None,
        # Embedded decoder inputs.
        decoder_inputs_embeds: tf.Tensor | None = None,
        # Whether to use the cache.
        use_cache: Optional[bool] = None,
        # Whether to output attention weights.
        output_attentions: Optional[bool] = None,
        # Whether to output hidden states.
        output_hidden_states: Optional[bool] = None,
        # Whether to return an output object rather than a tuple.
        return_dict: Optional[bool] = None,
        # Label tensor; when given, a loss is computed.
        labels: tf.Tensor | None = None,
        # Training-mode flag, False by default.
        training: Optional[bool] = False,
    ) -> Union[Tuple[tf.Tensor], TFSeq2SeqLMOutput]:
        r"""
        labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
            Returns a tuple containing either masked_lm_loss and model outputs or a TFSeq2SeqLMOutput object.

        """

        if labels is not None:
            # Replace pad_token_id entries in the labels with -100, preserving the labels' dtype.
            labels = tf.where(
                labels == self.config.pad_token_id,
                tf.cast(tf.fill(shape_list(labels), -100), labels.dtype),
                labels,
            )
            # Caching is switched off whenever labels are supplied.
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                # No decoder inputs given: derive them by shifting the labels right and
                # prepending decoder_start_token_id.
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        # Run the main encoder-decoder layer.
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Project the first model output onto the shared embedding weights, then add the logits bias.
        lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True)
        lm_logits = self.bias_layer(lm_logits)
        # The loss is computed only when labels are provided.
        masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits)

        # Tuple mode: logits followed by the remaining model outputs, with the loss first when present.
        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
        # Dict mode: package everything in a TFSeq2SeqLMOutput.
        return TFSeq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,  # index 1 of d outputs
            decoder_hidden_states=outputs.decoder_hidden_states,  # index 2 of d outputs
            decoder_attentions=outputs.decoder_attentions,  # index 3 of d outputs
            cross_attentions=outputs.cross_attentions,  # index 4 of d outputs
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,  # index 0 of encoder outputs
            encoder_hidden_states=outputs.encoder_hidden_states,  # index 1 of encoder outputs
            encoder_attentions=outputs.encoder_attentions,  # index 2 of encoder outputs
        )

    # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.serving_output
    def serving_output(self, output):
        """Repackage `output` as a TFSeq2SeqLMOutput, keeping only the optional fields the config enables."""
        # With caching enabled, keep the element at index 1 of the tf.tuple of past key/values.
        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
        # Hidden states and attentions are converted to tensors only when the matching config flag is set.
        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None

        return TFSeq2SeqLMOutput(
            logits=output.logits,
            past_key_values=pkv,
            decoder_hidden_states=dec_hs,
            decoder_attentions=dec_attns,
            cross_attentions=cross_attns,
            encoder_last_hidden_state=output.encoder_last_hidden_state,
            encoder_hidden_states=enc_hs,
            encoder_attentions=enc_attns,
        )

    # Method copied from elsewhere in the transformers library; prepares the inputs for a generation step.
    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """Assemble the keyword arguments for one generation step and return them as a dict."""
        # With cached key/values, only the last decoder token is fed in.
        if past_key_values is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        # Decoder position ids come from one of three sources.
        if decoder_attention_mask is not None:  # xla
            # Last column of the exclusive cumulative sum of the decoder attention mask.
            decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:]
        elif past_key_values is not None:  # no xla + past_key_values
            # Size of dimension 2 of the first cached tensor.
            decoder_position_ids = past_key_values[0][0].shape[2]
        else:  # no xla + no past_key_values
            # A range over the decoder input length.
            decoder_position_ids = tf.range(decoder_input_ids.shape[1])

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "decoder_position_ids": decoder_position_ids,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    def build(self, input_shape=None):
        """Build the main layer and the bias layer exactly once."""
        # Already built: nothing to do.
        if self.built:
            return
        self.built = True

        # Build the main layer under its own name scope, if present.
        if getattr(self, "model", None) is not None:
            with tf.name_scope(self.model.name):
                self.model.build(None)

        # Build the bias layer under its own name scope, if present.
        if getattr(self, "bias_layer", None) is not None:
            with tf.name_scope(self.bias_layer.name):
                self.bias_layer.build(None)

Transformers-源码解析-十九-

Transformers 源码解析（十九）

.\models\blenderbot\modeling_tf_blenderbot.py

.\models\blenderbot\tokenization_blenderbot.py

.\models\blenderbot\tokenization_blenderbot_fast.py

.\models\blenderbot\__init__.py

.\models\blenderbot_small\configuration_blenderbot_small.py

.\models\blenderbot_small\modeling_blenderbot_small.py

.\models\blenderbot_small\modeling_flax_blenderbot_small.py

.\models\blenderbot_small\modeling_tf_blenderbot_small.py

`.\models\blenderbot\modeling_tf_blenderbot.py`

`.\models\blenderbot\tokenization_blenderbot.py`

`.\models\blenderbot\tokenization_blenderbot_fast.py`

`.\models\blenderbot\__init__.py`

`.\models\blenderbot_small\configuration_blenderbot_small.py`

`.\models\blenderbot_small\modeling_blenderbot_small.py`

`.\models\blenderbot_small\modeling_flax_blenderbot_small.py`

`.\models\blenderbot_small\modeling_tf_blenderbot_small.py`