Transformers 源码解析（五十一）

`.\models\funnel\modeling_tf_funnel.py`

# coding=utf-8
# Copyright 2020-present Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 Funnel model."""


from __future__ import annotations

import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFMaskedLMOutput,
    TFMultipleChoiceModelOutput,
    TFQuestionAnsweringModelOutput,
    TFSequenceClassifierOutput,
    TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
    TFMaskedLanguageModelingLoss,
    TFModelInputType,
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
    TFQuestionAnsweringLoss,
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_funnel import FunnelConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "FunnelConfig"

# Checkpoint identifiers; the trailing comment gives each checkpoint's block layout and hidden size.
# Every "-base" variant is the one without a decoder.
TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "funnel-transformer/small",  # B4-4-4H768
    "funnel-transformer/small-base",  # B4-4-4H768, no decoder
    "funnel-transformer/medium",  # B6-3x2-3x2H768
    "funnel-transformer/medium-base",  # B6-3x2-3x2H768, no decoder
    "funnel-transformer/intermediate",  # B6-6-6H768
    "funnel-transformer/intermediate-base",  # B6-6-6H768, no decoder
    "funnel-transformer/large",  # B8-8-8H1024
    "funnel-transformer/large-base",  # B8-8-8H1024, no decoder
    "funnel-transformer/xlarge-base",  # B10-10-10H1024, no decoder
    "funnel-transformer/xlarge",  # B10-10-10H1024
]

# Large finite stand-in for infinity: it is subtracted from the attention scores of masked positions
# before the softmax.
INF = 1e6


class TFFunnelEmbeddings(keras.layers.Layer):
    """Word-embedding lookup followed by layer normalization and dropout."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.hidden_size = config.hidden_size
        # Fall back to a unit standard deviation when the config does not provide one.
        self.initializer_std = 1.0 if config.initializer_std is None else config.initializer_std

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout)

    def build(self, input_shape=None):
        """
        Create the embedding matrix and build the layer-norm sub-layer.

        The call is idempotent: once the layer is marked as built, further calls return immediately.
        """
        # The guard has to come before `add_weight`; otherwise every extra call to `build` would register
        # another copy of the embedding matrix.
        if self.built:
            return
        self.built = True

        with tf.name_scope("word_embeddings"):
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.hidden_size],
                initializer=get_initializer(initializer_range=self.initializer_std),
            )

        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.d_model])

    def call(self, input_ids=None, inputs_embeds=None, training=False):
        """
        Applies embedding based on inputs tensor.

        Exactly one of `input_ids` and `inputs_embeds` must be provided.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        assert not (input_ids is None and inputs_embeds is None)
        assert not (input_ids is not None and inputs_embeds is not None)

        if input_ids is not None:
            # Validate the ids against the vocabulary size before the lookup.
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(self.weight, input_ids)

        final_embeddings = self.LayerNorm(inputs=inputs_embeds)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings


class TFFunnelAttentionStructure:
    """
    Contains helpers for `TFFunnelRelMultiheadAttention`.
    """

    # Token type id that marks the <cls> token.
    cls_token_type_id: int = 2

    def __init__(self, config):
        """Copy the attention and pooling settings from `config` and create the two dropout layers."""
        # Settings controlling how the sequence is pooled.
        self.pooling_type = config.pooling_type
        self.pool_q_only = config.pool_q_only
        self.truncate_seq = config.truncate_seq
        self.separate_cls = config.separate_cls

        # General attention settings.
        self.num_blocks = config.num_blocks
        self.attention_type = config.attention_type
        self.d_model = config.d_model

        # Two dropout layers sharing the hidden dropout rate.
        self.sin_dropout = keras.layers.Dropout(config.hidden_dropout)
        self.cos_dropout = keras.layers.Dropout(config.hidden_dropout)

        # Pooling multiplier; stays `None` until `init_attention_inputs` sets it to 1.
        self.pooling_mult = None

    def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None, training=False):
        """
        Returns the attention inputs associated to the inputs of the model.

        The result is the tuple `(position_embeds, token_type_mat, attention_mask, cls_mask)`. As a side effect
        the pooling multiplier is reset to 1 and the sequence length is stored on `self.seq_len`.
        """
        # inputs_embeds has shape batch_size x seq_len x d_model
        # attention_mask and token_type_ids have shape batch_size x seq_len
        self.pooling_mult = 1
        seq_len = shape_list(inputs_embeds)[1]
        self.seq_len = seq_len
        position_embeds = self.get_position_embeds(seq_len, training=training)

        if token_type_ids is None:
            token_type_mat = None
        else:
            token_type_mat = self.token_type_ids_to_mat(token_type_ids)

        if self.separate_cls:
            # seq_len x seq_len matrix of ones with one padded row on top and one padded column on the left.
            inner_ones = tf.ones([seq_len - 1, seq_len - 1], dtype=inputs_embeds.dtype)
            cls_mask = tf.pad(inner_ones, [[1, 0], [1, 0]])
        else:
            cls_mask = None

        return (position_embeds, token_type_mat, attention_mask, cls_mask)

    def token_type_ids_to_mat(self, token_type_ids):
        """
        Convert `token_type_ids` to `token_type_mat`.

        Entry (i, j) of the boolean result is true when positions i and j share a token type, or when either
        of them carries the <cls> token type.
        """
        ids_as_column = tf.expand_dims(token_type_ids, -1)
        ids_as_row = tf.expand_dims(token_type_ids, -2)
        same_type = tf.equal(ids_as_column, ids_as_row)

        # Treat <cls> as being in the same segment as both A and B.
        cls_marker = tf.constant([self.cls_token_type_id], dtype=token_type_ids.dtype)
        is_cls = tf.equal(token_type_ids, cls_marker)
        touches_cls = tf.logical_or(tf.expand_dims(is_cls, -1), tf.expand_dims(is_cls, -2))

        return tf.logical_or(touches_cls, same_type)

    def stride_pool_pos(self, pos_id, block_index):
        """
        Pool `pos_id` while keeping the cls token separate (if `self.separate_cls=True`).
        """
        if not self.separate_cls:
            # Nothing special to keep: take every other position.
            return pos_id[::2]

        # Under separate <cls>, the <cls> is treated as the first token of the block that precedes the first
        # real block. Since the first real block always has position 1, that previous block sits at
        # `1 - 2 ** block_index`.
        cls_pos = tf.constant([1 - 2**block_index], dtype=pos_id.dtype)

        # Skip the first position, and also the last one when the sequence is truncated.
        if self.truncate_seq:
            remaining = pos_id[1:-1]
        else:
            remaining = pos_id[1:]

        return tf.concat([cls_pos, remaining[::2]], 0)
    def relative_pos(self, pos, stride, pooled_pos=None, shift=1):
        """
        Build the relative positional vector between `pos` and `pooled_pos`.

        When `pooled_pos` is omitted, `pos` is used for both sides. The result is a descending range with step
        `stride`.
        """
        if pooled_pos is None:
            pooled_pos = pos

        first_pooled = pooled_pos[0]

        # Upper end of the range: offset between the first entries plus `shift * len(pooled_pos)` strides.
        upper = (first_pooled - pos[0]) + shift * shape_list(pooled_pos)[0] * stride
        # Lower end of the range: first pooled position against the last position.
        lower = first_pooled - pos[-1]

        # `lower - 1` as the exclusive stop lets the range reach `lower` itself.
        return tf.range(upper, lower - 1, -stride)

    def stride_pool(self, tensor, axis):
        """
        Perform pooling by stride slicing the tensor along the given axis.

        `axis` may be a single int or a list/tuple of ints (pooled one after the other); `tensor` may be a
        single tensor or a list/tuple of tensors (each pooled, container type preserved). `None` is passed
        through unchanged.
        """
        if tensor is None:
            return None

        # Several axes: pool along each of them in turn.
        if isinstance(axis, (list, tuple)):
            pooled = tensor
            for single_axis in axis:
                pooled = self.stride_pool(pooled, single_axis)
            return pooled

        # Several tensors: pool each one and rebuild the same kind of container.
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(self.stride_pool(item, axis) for item in tensor)

        # Deal with negative axis values.
        rank = len(shape_list(tensor))
        axis = axis % rank

        # Full slices for every axis in front of the pooled one.
        leading = [slice(None)] * axis

        if self.separate_cls and self.truncate_seq:
            strided = slice(None, -1, 2)
        else:
            strided = slice(None, None, 2)

        if self.separate_cls:
            # Prepend a copy of the first entry along `axis` before striding.
            first_entry = tensor[leading + [slice(None, 1)]]
            tensor = tf.concat([first_entry, tensor], axis)

        return tensor[leading + [strided]]

    def pool_tensor(self, tensor, mode="mean", stride=2):
        """
        Apply 1D pooling to a tensor of size [B x T (x H)].

        `mode` is one of `"mean"`, `"max"` or `"min"`; any other value raises `NotImplementedError`. Lists and
        tuples of tensors are pooled element-wise and `None` is passed through unchanged.
        """
        if tensor is None:
            return None

        # Several tensors: pool each one and rebuild the same kind of container.
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(self.pool_tensor(item, mode=mode, stride=stride) for item in tensor)

        if self.separate_cls:
            # Prepend a copy of the first position; drop the last position when truncating.
            if self.truncate_seq:
                rest = tensor[:, :-1]
            else:
                rest = tensor
            tensor = tf.concat([tensor[:, :1], rest], axis=1)

        # Rank-2 inputs get a temporary trailing axis for the pooling op.
        is_rank_2 = len(shape_list(tensor)) == 2
        if is_rank_2:
            tensor = tensor[:, :, None]

        pool_options = {"strides": stride, "data_format": "NWC", "padding": "SAME"}
        if mode == "mean":
            tensor = tf.nn.avg_pool1d(tensor, stride, **pool_options)
        elif mode == "max":
            tensor = tf.nn.max_pool1d(tensor, stride, **pool_options)
        elif mode == "min":
            # Min pooling expressed as max pooling of the negated tensor.
            tensor = -tf.nn.max_pool1d(-tensor, stride, **pool_options)
        else:
            raise NotImplementedError("The supported modes are 'mean', 'max' and 'min'.")

        if is_rank_2:
            return tf.squeeze(tensor, 2)
        return tensor
    def pre_attention_pooling(self, output, attention_inputs):
        """
        Pool `output` and the proper parts of `attention_inputs` before the attention layer.

        Returns the pooled `output` together with the updated
        `(position_embeds, token_type_mat, attention_mask, cls_mask)` tuple.
        """
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs
        is_factorized = self.attention_type == "factorized"

        if self.pool_q_only:
            if is_factorized:
                # Pool only the first two positional tensors; the rest is kept as is.
                position_embeds = self.stride_pool(position_embeds[:2], 0) + position_embeds[2:]
            token_type_mat = self.stride_pool(token_type_mat, 1)
            cls_mask = self.stride_pool(cls_mask, 0)
        else:
            self.pooling_mult *= 2
            if is_factorized:
                position_embeds = self.stride_pool(position_embeds, 0)
            # Pool along axes 1 and 2.
            token_type_mat = self.stride_pool(token_type_mat, [1, 2])
            cls_mask = self.stride_pool(cls_mask, [1, 2])
            attention_mask = self.pool_tensor(attention_mask, mode="min")

        # The hidden states are pooled in both configurations.
        output = self.pool_tensor(output, mode=self.pooling_type)

        return output, (position_embeds, token_type_mat, attention_mask, cls_mask)

    def post_attention_pooling(self, attention_inputs):
        """
        Pool the proper parts of `attention_inputs` after the attention layer.

        Pooling only happens when `self.pool_q_only` is set; otherwise the four entries are returned unchanged
        (repacked as a tuple).
        """
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs

        if not self.pool_q_only:
            return (position_embeds, token_type_mat, attention_mask, cls_mask)

        self.pooling_mult *= 2
        if self.attention_type == "factorized":
            # Keep the first two positional tensors and pool the remaining ones.
            position_embeds = position_embeds[:2] + self.stride_pool(position_embeds[2:], 0)
        token_type_mat = self.stride_pool(token_type_mat, 2)
        cls_mask = self.stride_pool(cls_mask, 1)
        attention_mask = self.pool_tensor(attention_mask, mode="min")

        return (position_embeds, token_type_mat, attention_mask, cls_mask)
def _relative_shift_gather(positional_attn, context_len, shift):
    """
    Shift the relative-position scores and keep the first `context_len` entries of the last axis.

    `positional_attn` has shape batch_size x n_head x seq_len x max_rel_len; the result has shape
    batch_size x n_head x seq_len x context_len.
    """
    batch_size, n_head, seq_len, max_rel_len = shape_list(positional_attn)

    # Reinterpret the last two axes as max_rel_len x seq_len (a reshape, not a transpose).
    reinterpreted = tf.reshape(positional_attn, [batch_size, n_head, max_rel_len, seq_len])
    # Drop the first `shift` rows of that view.
    trimmed = reinterpreted[:, :, shift:, :]
    # Go back to one row per query position, now with `max_rel_len - shift` columns.
    realigned = tf.reshape(trimmed, [batch_size, n_head, seq_len, max_rel_len - shift])

    return realigned[..., :context_len]


class TFFunnelRelMultiheadAttention(keras.layers.Layer):
    """
    Multi-head attention whose score is the sum of a content term, a relative positional term and a token-type
    term.

    Args:
        config: Configuration object. This layer reads `attention_type`, `n_head`, `d_head`, `d_model`,
            `initializer_range`, `hidden_dropout`, `attention_dropout` and `layer_norm_eps` from it.
        block_index: Index used to select the positional encodings when `attention_type` is not
            `"factorized"`.
    """

    def __init__(self, config, block_index, **kwargs):
        super().__init__(**kwargs)

        self.attention_type = config.attention_type
        self.n_head = n_head = config.n_head
        self.d_head = d_head = config.d_head
        self.d_model = d_model = config.d_model
        self.initializer_range = config.initializer_range
        self.block_index = block_index

        # Separate dropout layers for the projected output and for the attention probabilities.
        self.hidden_dropout = keras.layers.Dropout(config.hidden_dropout)
        self.attention_dropout = keras.layers.Dropout(config.attention_dropout)

        initializer = get_initializer(config.initializer_range)

        # Query / key / value projections; only the query projection has no bias.
        self.q_head = keras.layers.Dense(
            n_head * d_head, use_bias=False, kernel_initializer=initializer, name="q_head"
        )
        self.k_head = keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="k_head")
        self.v_head = keras.layers.Dense(n_head * d_head, kernel_initializer=initializer, name="v_head")

        self.post_proj = keras.layers.Dense(d_model, kernel_initializer=initializer, name="post_proj")
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        # Attention scaling factor: 1 / sqrt(d_head).
        self.scale = 1.0 / (d_head**0.5)

    def build(self, input_shape=None):
        """
        Create the relative-attention weights and build the sub-layers.

        The call is idempotent: once the layer is marked as built, further calls return immediately.
        """
        # The guard has to come before the `add_weight` calls; otherwise every extra call to `build` would
        # register a new set of weights.
        if self.built:
            return
        self.built = True

        n_head, d_head, d_model = self.n_head, self.d_head, self.d_model
        initializer = get_initializer(self.initializer_range)

        self.r_w_bias = self.add_weight(
            shape=(n_head, d_head), initializer=initializer, trainable=True, name="r_w_bias"
        )
        self.r_r_bias = self.add_weight(
            shape=(n_head, d_head), initializer=initializer, trainable=True, name="r_r_bias"
        )
        self.r_kernel = self.add_weight(
            shape=(d_model, n_head, d_head), initializer=initializer, trainable=True, name="r_kernel"
        )
        self.r_s_bias = self.add_weight(
            shape=(n_head, d_head), initializer=initializer, trainable=True, name="r_s_bias"
        )
        self.seg_embed = self.add_weight(
            shape=(2, n_head, d_head), initializer=initializer, trainable=True, name="seg_embed"
        )

        # Build every sub-layer under its own name scope.
        if getattr(self, "q_head", None) is not None:
            with tf.name_scope(self.q_head.name):
                self.q_head.build([None, None, d_model])

        if getattr(self, "k_head", None) is not None:
            with tf.name_scope(self.k_head.name):
                self.k_head.build([None, None, d_model])

        if getattr(self, "v_head", None) is not None:
            with tf.name_scope(self.v_head.name):
                self.v_head.build([None, None, d_model])

        if getattr(self, "post_proj", None) is not None:
            with tf.name_scope(self.post_proj.name):
                self.post_proj.build([None, None, n_head * d_head])

        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, d_model])

    def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None):
        """Relative attention score for the positional encodings"""
        # q_head has shape batch_size x seq_len x n_head x d_head

        if self.attention_type == "factorized":
            # Notations from the paper, appendix A.2.2, final formula (https://arxiv.org/abs/2006.03236)
            # phi and pi have shape seq_len x d_model, psi and omega have shape context_len x d_model
            phi, pi, psi, omega = position_embeds

            # Shape n_head x d_head
            u = self.r_r_bias * self.scale

            # Shape d_model x n_head x d_head
            w_r = self.r_kernel

            # Shape batch_size x seq_len x n_head x d_model
            q_r_attention = tf.einsum("binh,dnh->bind", q_head + u, w_r)
            q_r_attention_1 = q_r_attention * phi[:, None]
            q_r_attention_2 = q_r_attention * pi[:, None]

            # Shape batch_size x n_head x seq_len x context_len
            positional_attn = tf.einsum("bind,jd->bnij", q_r_attention_1, psi) + tf.einsum(
                "bind,jd->bnij", q_r_attention_2, omega
            )

        else:
            # Notations from the paper, appendix A.2.1, final formula (https://arxiv.org/abs/2006.03236)
            # Grab the proper positional encoding, shape max_rel_len x d_model.
            # When the query and context lengths differ, the second encoding of this block is used with a
            # shift of 2; otherwise the first encoding is used with a shift of 1.
            if shape_list(q_head)[1] != context_len:
                shift = 2
                r = position_embeds[self.block_index][1]
            else:
                shift = 1
                r = position_embeds[self.block_index][0]

            # Shape n_head x d_head
            v = self.r_r_bias * self.scale

            # Shape d_model x n_head x d_head
            w_r = self.r_kernel

            # Shape max_rel_len x n_head x d_head
            r_head = tf.einsum("td,dnh->tnh", r, w_r)

            # Shape batch_size x n_head x seq_len x max_rel_len
            positional_attn = tf.einsum("binh,tnh->bnit", q_head + v, r_head)

            # Shape batch_size x n_head x seq_len x context_len
            positional_attn = _relative_shift_gather(positional_attn, context_len, shift)

        if cls_mask is not None:
            positional_attn *= cls_mask

        return positional_attn

    def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None):
        """Relative attention score for the token_type_ids"""
        # Without token type information this term contributes nothing to the score.
        if token_type_mat is None:
            return 0

        # token_type_mat has shape batch_size x seq_len x context_len; only the context length is needed.
        _, _, context_len = shape_list(token_type_mat)

        # q_head has shape batch_size x seq_len x n_head x d_head
        # Shape n_head x d_head
        r_s_bias = self.r_s_bias * self.scale

        # Shape batch_size x n_head x seq_len x 2
        token_type_bias = tf.einsum("bind,snd->bnis", q_head + r_s_bias, self.seg_embed)

        # Shape batch_size x n_head x seq_len x context_len
        token_type_mat = tf.tile(token_type_mat[:, None], [1, shape_list(q_head)[2], 1, 1])

        # Each half has shape batch_size x n_head x seq_len x 1
        diff_token_type, same_token_type = tf.split(token_type_bias, 2, axis=-1)

        # Shape batch_size x n_head x seq_len x context_len
        # Pick the "same" bias where token_type_mat is true and the "different" bias elsewhere.
        token_type_attn = tf.where(
            token_type_mat,
            tf.tile(same_token_type, [1, 1, 1, context_len]),
            tf.tile(diff_token_type, [1, 1, 1, context_len]),
        )

        if cls_mask is not None:
            token_type_attn *= cls_mask

        return token_type_attn

    def call(self, query, key, value, attention_inputs, output_attentions=False, training=False):
        """
        Run the attention layer.

        Returns `(output, attn_prob)` when `output_attentions` is true and `(output,)` otherwise, where
        `output` is the layer-normalized sum of `query` and the projected attention result.
        """
        # query has shape batch_size x seq_len x d_model
        # key and value have shapes batch_size x context_len x d_model
        position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs

        batch_size, seq_len, _ = shape_list(query)
        context_len = shape_list(key)[1]
        n_head, d_head = self.n_head, self.d_head

        # Shape batch_size x seq_len x n_head x d_head
        q_head = tf.reshape(self.q_head(query), [batch_size, seq_len, n_head, d_head])
        # Shapes batch_size x context_len x n_head x d_head
        k_head = tf.reshape(self.k_head(key), [batch_size, context_len, n_head, d_head])
        v_head = tf.reshape(self.v_head(value), [batch_size, context_len, n_head, d_head])

        q_head = q_head * self.scale
        # Shape n_head x d_head
        r_w_bias = self.r_w_bias * self.scale
        # Shapes batch_size x n_head x seq_len x context_len
        content_score = tf.einsum("bind,bjnd->bnij", q_head + r_w_bias, k_head)
        positional_attn = self.relative_positional_attention(position_embeds, q_head, context_len, cls_mask)
        token_type_attn = self.relative_token_type_attention(token_type_mat, q_head, cls_mask)

        # merge attention scores
        attn_score = content_score + positional_attn + token_type_attn

        # perform masking: positions whose mask value is 0 get a large negative score
        if attention_mask is not None:
            attention_mask = tf.cast(attention_mask, dtype=attn_score.dtype)
            attn_score = attn_score - (INF * (1 - attention_mask[:, None, None]))

        # attention probability
        attn_prob = stable_softmax(attn_score, axis=-1)
        attn_prob = self.attention_dropout(attn_prob, training=training)

        # attention output, shape batch_size x seq_len x n_head x d_head
        attn_vec = tf.einsum("bnij,bjnd->bind", attn_prob, v_head)

        # Shape batch_size x seq_len x d_model
        attn_out = self.post_proj(tf.reshape(attn_vec, [batch_size, seq_len, n_head * d_head]))
        attn_out = self.hidden_dropout(attn_out, training=training)

        # Residual connection followed by layer normalization.
        output = self.layer_norm(query + attn_out)
        return (output, attn_prob) if output_attentions else (output,)
class TFFunnelPositionwiseFFN(keras.layers.Layer):
    """
    Feed-forward block: dense -> activation -> dropout -> dense -> dropout, followed by a residual connection
    and layer normalization.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        # Both dense layers share the same kernel initializer.
        kernel_init = get_initializer(config.initializer_range)

        self.linear_1 = keras.layers.Dense(config.d_inner, kernel_initializer=kernel_init, name="linear_1")
        self.activation_function = get_tf_activation(config.hidden_act)
        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
        self.linear_2 = keras.layers.Dense(config.d_model, kernel_initializer=kernel_init, name="linear_2")
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")

    def call(self, hidden, training=False):
        """Apply the feed-forward block to `hidden` and return the normalized residual sum."""
        inner = self.activation_function(self.linear_1(hidden))
        inner = self.activation_dropout(inner, training=training)
        projected = self.dropout(self.linear_2(inner), training=training)
        return self.layer_norm(hidden + projected)

    def build(self, input_shape=None):
        """Build the sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # (attribute name, config field holding the size of the sub-layer's last input dimension)
        sublayer_specs = (
            ("linear_1", "d_model"),
            ("linear_2", "d_inner"),
            ("layer_norm", "d_model"),
        )
        for attr_name, width_field in sublayer_specs:
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, getattr(self.config, width_field)])


class TFFunnelLayer(keras.layers.Layer):
    """One encoder layer: relative multi-head attention followed by the position-wise feed-forward block."""

    def __init__(self, config, block_index, **kwargs):
        super().__init__(**kwargs)
        self.attention = TFFunnelRelMultiheadAttention(config, block_index, name="attention")
        self.ffn = TFFunnelPositionwiseFFN(config, name="ffn")

    def call(self, query, key, value, attention_inputs, output_attentions=False, training=False):
        """
        Run attention and then the feed-forward block.

        Returns `(output, attention_probs)` when `output_attentions` is true and `(output,)` otherwise.
        """
        attention_outputs = self.attention(
            query, key, value, attention_inputs, output_attentions=output_attentions, training=training
        )
        hidden = self.ffn(attention_outputs[0], training=training)

        if output_attentions:
            return (hidden, attention_outputs[1])
        return (hidden,)

    def build(self, input_shape=None):
        """Build both sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        for attr_name in ("attention", "ffn"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)


class TFFunnelEncoder(keras.layers.Layer):
    """
    Stack of `TFFunnelLayer` blocks.

    `config.block_sizes` gives the number of layers in each block and `config.block_repeats` how many times every
    layer of a block is applied. From the second block on, the hidden states are pooled before the block's first
    layer application, as long as the sequence is still long enough (see `call`).
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        # Flags and per-block repeat counts copied from the config; they are read on every `call`.
        self.separate_cls = config.separate_cls
        self.pool_q_only = config.pool_q_only
        self.block_repeats = config.block_repeats
        # Helper that creates the attention inputs and updates them around each pooling step.
        self.attention_structure = TFFunnelAttentionStructure(config)
        # blocks[block_index][i] is the i-th layer of block `block_index`; the layer name encodes both indices.
        self.blocks = [
            [TFFunnelLayer(config, block_index, name=f"blocks_._{block_index}_._{i}") for i in range(block_size)]
            for block_index, block_size in enumerate(config.block_sizes)
        ]

    def call(
        self,
        inputs_embeds,
        attention_mask=None,
        token_type_ids=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        training=False,
    ):
        """
        Run `inputs_embeds` through every block.

        Returns:
            A `TFBaseModelOutput` when `return_dict` is truthy; otherwise a tuple holding the last hidden state
            followed by the collected hidden states and attentions, each included only if it was requested.
        """
        # NOTE(review): the cast below is disabled. The original comment says pooling does not work on long
        # (integer) tensors, so the mask would need converting - confirm the conversion happens elsewhere.
        # attention_mask = tf.cast(attention_mask, inputs_embeds.dtype)

        attention_inputs = self.attention_structure.init_attention_inputs(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            training=training,
        )

        hidden = inputs_embeds

        # When hidden states are requested, the tuple starts with the input embeddings themselves.
        all_hidden_states = (inputs_embeds,) if output_hidden_states else None

        all_attentions = () if output_attentions else None

        for block_index, block in enumerate(self.blocks):
            # Pool only when the sequence axis (axis 1) is longer than 1 (2 if `separate_cls` is set)
            # and this is not the first block.
            pooling_flag = shape_list(hidden)[1] > (2 if self.separate_cls else 1)
            pooling_flag = pooling_flag and block_index > 0
            # Bind `pooled_hidden` unconditionally; it is overwritten below when pooling happens.
            # NOTE(review): presumably needed so both branches define the name in graph mode - confirm.
            pooled_hidden = tf.zeros(shape_list(hidden))

            if pooling_flag:
                pooled_hidden, attention_inputs = self.attention_structure.pre_attention_pooling(
                    hidden, attention_inputs
                )

            for layer_index, layer in enumerate(block):
                # Each layer is applied `block_repeats[block_index]` times in a row before moving to the next.
                for repeat_index in range(self.block_repeats[block_index]):
                    # The pooled tensor is consumed only by the very first layer application of the block.
                    do_pooling = (repeat_index == 0) and (layer_index == 0) and pooling_flag
                    if do_pooling:
                        query = pooled_hidden
                        # With `pool_q_only`, only the query is pooled; key and value keep the unpooled states.
                        key = value = hidden if self.pool_q_only else pooled_hidden
                    else:
                        query = key = value = hidden

                    layer_output = layer(
                        query, key, value, attention_inputs, output_attentions=output_attentions, training=training
                    )
                    hidden = layer_output[0]

                    # Update the attention inputs once the pooled layer application is done.
                    if do_pooling:
                        attention_inputs = self.attention_structure.post_attention_pooling(attention_inputs)

                    if output_attentions:
                        all_attentions = all_attentions + layer_output[1:]

                    if output_hidden_states:
                        all_hidden_states = all_hidden_states + (hidden,)

        # Tuple output: entries that were not requested (still None) are dropped.
        if not return_dict:
            return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)

        return TFBaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions)

    def build(self, input_shape=None):
        """Build every layer of every block once, each inside a name scope equal to the layer's name."""
        if self.built:
            return

        self.built = True

        for block in self.blocks:
            for layer in block:
                with tf.name_scope(layer.name):
                    layer.build(None)
def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False):
    """
    Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.

    Args:
        x: Tensor to upsample; axis 1 is the one that gets repeated.
        stride: Repeat count for every position on axis 1. With `stride == 1`, `x` is returned untouched.
        target_len: Length of axis 1 in the result.
        separate_cls: When true, position 0 on axis 1 is kept as-is (not repeated) and put back in front of the
            repeated remainder.
        truncate_seq: Only used with `separate_cls`; pads `stride - 1` zero positions at the end of axis 1 before
            the result is cut to length.

    Returns:
        The upsampled tensor.
    """
    if stride == 1:
        return x

    if not separate_cls:
        # Plain case: repeat everything, then cut to the requested length.
        return tf.repeat(x, repeats=stride, axis=1)[:, :target_len]

    # Set the first position aside so that it is not duplicated by the repeat.
    first_token = x[:, :1]
    repeated = tf.repeat(x[:, 1:], repeats=stride, axis=1)
    if truncate_seq:
        # Pad along axis 1 (the paddings assume a rank-3 input).
        repeated = tf.pad(repeated, [[0, 0], [0, stride - 1], [0, 0]])
    # One slot of `target_len` is reserved for the position that was set aside.
    repeated = repeated[:, : target_len - 1]
    return tf.concat([first_token, repeated], axis=1)


class TFFunnelDecoder(keras.layers.Layer):
    """
    Decoder that upsamples `final_hidden`, adds `first_block_hidden` to it, and runs the sum through
    `config.num_decoder_layers` `TFFunnelLayer`s.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.separate_cls = config.separate_cls
        self.truncate_seq = config.truncate_seq
        # Upsampling factor: a power of two that doubles for every block after the first.
        self.stride = 2 ** (len(config.block_sizes) - 1)
        self.attention_structure = TFFunnelAttentionStructure(config)
        # All decoder layers are created with block index 0.
        self.layers = [TFFunnelLayer(config, 0, name=f"layers_._{i}") for i in range(config.num_decoder_layers)]

    def call(
        self,
        final_hidden,
        first_block_hidden,
        attention_mask=None,
        token_type_ids=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        training=False,
    ):
        """
        Upsample `final_hidden` to the length of `first_block_hidden` (axis 1), add the two, and apply the layers.

        Returns:
            A `TFBaseModelOutput` when `return_dict` is truthy; otherwise a tuple with the last hidden state
            followed by the hidden states and attentions, each included only if it was requested.
        """
        target_len = shape_list(first_block_hidden)[1]
        upsampled_hidden = upsample(
            final_hidden,
            stride=self.stride,
            target_len=target_len,
            separate_cls=self.separate_cls,
            truncate_seq=self.truncate_seq,
        )

        hidden = upsampled_hidden + first_block_hidden
        # The collected hidden states start with the sum that is fed into the first layer.
        all_hidden_states = (hidden,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        attention_inputs = self.attention_structure.init_attention_inputs(
            hidden,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            training=training,
        )

        for decoder_layer in self.layers:
            # Query, key and value are all the current hidden states.
            layer_output = decoder_layer(
                hidden, hidden, hidden, attention_inputs, output_attentions=output_attentions, training=training
            )
            hidden = layer_output[0]

            if output_attentions:
                all_attentions = all_attentions + layer_output[1:]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden,)

        if return_dict:
            return TFBaseModelOutput(
                last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions
            )
        # Tuple output: entries that were not requested (still None) are dropped.
        return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None)

    def build(self, input_shape=None):
        """Build every decoder layer once, each inside a name scope equal to the layer's name."""
        if self.built:
            return
        self.built = True
        decoder_layers = getattr(self, "layers", None)
        if decoder_layers is None:
            return
        for decoder_layer in decoder_layers:
            with tf.name_scope(decoder_layer.name):
                decoder_layer.build(None)
@keras_serializable
class TFFunnelBaseLayer(keras.layers.Layer):
    """Base model without decoder"""

    config_class = FunnelConfig

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        # Output options copied from the config.
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.return_dict = config.use_return_dict

        self.embeddings = TFFunnelEmbeddings(config, name="embeddings")
        self.encoder = TFFunnelEncoder(config, name="encoder")

    def get_input_embeddings(self):
        """Return the embeddings layer."""
        return self.embeddings

    def set_input_embeddings(self, value):
        """Use `value` as the embedding weight and set the vocabulary size to its first dimension."""
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        # Not implemented yet in the library for TF 2.0 models.
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """
        Embed the inputs (unless `inputs_embeds` is given) and return the encoder's output.

        Raises:
            ValueError: If both or neither of `input_ids` and `inputs_embeds` are provided.
        """
        has_ids = input_ids is not None
        has_embeds = inputs_embeds is not None
        if has_ids and has_embeds:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if not (has_ids or has_embeds):
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # For embeddings, the trailing (feature) dimension is dropped from the shape.
        input_shape = shape_list(input_ids) if has_ids else shape_list(inputs_embeds)[:-1]

        # Missing mask / token types default to all-ones / all-zeros tensors of `input_shape`.
        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        if not has_embeds:
            inputs_embeds = self.embeddings(input_ids, training=training)

        return self.encoder(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        """Build the embeddings and the encoder once, each inside a name scope equal to its layer name."""
        if self.built:
            return
        self.built = True
        for attr_name in ("embeddings", "encoder"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)


@keras_serializable
class TFFunnelMainLayer(keras.layers.Layer):
    """Base model with decoder"""

    config_class = FunnelConfig

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        # `block_sizes[0]` is used in `call` to pick the encoder hidden state handed to the decoder.
        self.block_sizes = config.block_sizes
        # Output options copied from the config.
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.return_dict = config.use_return_dict

        self.embeddings = TFFunnelEmbeddings(config, name="embeddings")
        self.encoder = TFFunnelEncoder(config, name="encoder")
        self.decoder = TFFunnelDecoder(config, name="decoder")

    def get_input_embeddings(self):
        """Return the embeddings layer."""
        return self.embeddings

    def set_input_embeddings(self, value):
        """Use `value` as the embedding weight and set the vocabulary size to its first dimension."""
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        # Not implemented yet in the library for TF 2.0 models.
        raise NotImplementedError

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        training=False,
    ):
        """
        Embed the inputs (unless `inputs_embeds` is given), run the encoder, then the decoder.

        Returns:
            A `TFBaseModelOutput` when `return_dict` is truthy, otherwise a tuple. In both cases the last hidden
            state comes from the decoder, and the hidden states / attentions (when requested) are the encoder's
            followed by the decoder's.

        Raises:
            ValueError: If both or neither of `input_ids` and `inputs_embeds` are provided.
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            # Drop the trailing (feature) dimension of the embeddings.
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Missing mask / token types default to all-ones / all-zeros tensors of `input_shape`.
        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)

        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids, training=training)

        # Hidden states are always requested from the encoder because the decoder needs one of them,
        # regardless of what the caller asked for.
        encoder_outputs = self.encoder(
            inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=return_dict,
            training=training,
        )

        # `encoder_outputs[1]` is the encoder's hidden-state tuple (index 0 holds the input embeddings); the
        # entry at index `block_sizes[0]` is the one added to the upsampled final hidden state.
        decoder_outputs = self.decoder(
            final_hidden=encoder_outputs[0],
            first_block_hidden=encoder_outputs[1][self.block_sizes[0]],
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if not return_dict:
            # The decoder tuple only contains what was requested, so track its position with `idx`;
            # the encoder tuple always has hidden states at 1 and attentions (if any) at 2.
            idx = 0
            outputs = (decoder_outputs[0],)
            if output_hidden_states:
                idx += 1
                outputs = outputs + (encoder_outputs[1] + decoder_outputs[idx],)
            if output_attentions:
                idx += 1
                outputs = outputs + (encoder_outputs[2] + decoder_outputs[idx],)
            return outputs

        return TFBaseModelOutput(
            last_hidden_state=decoder_outputs[0],
            hidden_states=(encoder_outputs.hidden_states + decoder_outputs.hidden_states)
            if output_hidden_states
            else None,
            attentions=(encoder_outputs.attentions + decoder_outputs.attentions) if output_attentions else None,
        )

    def build(self, input_shape=None):
        """Build embeddings, encoder and decoder once, each inside a name scope equal to its layer name."""
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "decoder", None) is not None:
            with tf.name_scope(self.decoder.name):
                self.decoder.build(None)
class TFFunnelDiscriminatorPredictions(keras.layers.Layer):
    """Prediction module for the discriminator, made up of two dense layers."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        initializer = get_initializer(config.initializer_range)
        # First dense layer keeps the width at `config.d_model`.
        self.dense = keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="dense")
        self.activation_function = get_tf_activation(config.hidden_act)
        # Second dense layer has a single output unit.
        self.dense_prediction = keras.layers.Dense(1, kernel_initializer=initializer, name="dense_prediction")
        self.config = config

    def call(self, discriminator_hidden_states):
        """
        Apply dense -> activation -> single-unit dense and return the result without the trailing unit axis.

        Returns:
            The logits, with the same leading dimensions as `discriminator_hidden_states`.
        """
        hidden_states = self.dense(discriminator_hidden_states)
        hidden_states = self.activation_function(hidden_states)
        # Squeeze only the last axis (size 1, from the single-unit dense layer). Squeezing every size-1 axis
        # would also drop the batch or sequence axis whenever one of them happens to have length 1.
        logits = tf.squeeze(self.dense_prediction(hidden_states), axis=-1)
        return logits

    def build(self, input_shape=None):
        """Build both dense layers once, each inside a name scope equal to its layer name."""
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.d_model])
        # `dense_prediction` consumes the output of `dense`, whose last dimension is also `d_model`.
        if getattr(self, "dense_prediction", None) is not None:
            with tf.name_scope(self.dense_prediction.name):
                self.dense_prediction.build([None, None, self.config.d_model])


class TFFunnelMaskedLMHead(keras.layers.Layer):
    """
    Masked language modeling head: multiplies the hidden states by the transposed weight of
    `input_embeddings` and adds a trainable bias of size `config.vocab_size`.
    """

    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.hidden_size = config.hidden_size
        # Layer whose `weight` attribute is used as the output projection in `call`.
        self.input_embeddings = input_embeddings

    def build(self, input_shape):
        """Create the zero-initialized, trainable bias, then defer to the parent `build`."""
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super().build(input_shape)

    def get_output_embeddings(self):
        """Return the embeddings layer used as output projection."""
        return self.input_embeddings

    def set_output_embeddings(self, value):
        """Use `value` as the embedding weight and set the vocabulary size to its first dimension."""
        self.input_embeddings.weight = value
        self.input_embeddings.vocab_size = shape_list(value)[0]

    def get_bias(self):
        """Return the bias wrapped in a dict under the key `"bias"`."""
        return {"bias": self.bias}

    def set_bias(self, value):
        """Take the bias from `value["bias"]` and set `config.vocab_size` to its first dimension."""
        new_bias = value["bias"]
        self.bias = new_bias
        self.config.vocab_size = shape_list(new_bias)[0]

    def call(self, hidden_states, training=False):
        """
        Project `hidden_states` onto the vocabulary.

        `training` is accepted but not used here.
        """
        seq_length = shape_list(hidden_states)[1]
        # Flatten to 2D for the matmul against the embedding matrix, then restore the sequence axis.
        flat_hidden = tf.reshape(hidden_states, [-1, self.hidden_size])
        flat_logits = tf.matmul(flat_hidden, self.input_embeddings.weight, transpose_b=True)
        logits = tf.reshape(flat_logits, [-1, seq_length, self.config.vocab_size])
        return tf.nn.bias_add(logits, self.bias)


class TFFunnelClassificationHead(keras.layers.Layer):
    """Classification head: dense (`d_model` units) -> tanh -> dropout -> dense (`n_labels` units)."""

    def __init__(self, config, n_labels, **kwargs):
        super().__init__(**kwargs)
        # Both dense layers share the same kernel initializer.
        initializer = get_initializer(config.initializer_range)
        self.linear_hidden = keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="linear_hidden")
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        self.linear_out = keras.layers.Dense(n_labels, kernel_initializer=initializer, name="linear_out")
        self.config = config

    def call(self, hidden, training=False):
        """Return the output of the final dense layer for `hidden`; `training` is forwarded to the dropout."""
        projected = self.linear_hidden(hidden)
        activated = keras.activations.tanh(projected)
        dropped = self.dropout(activated, training=training)
        return self.linear_out(dropped)

    def build(self, input_shape=None):
        """Build both dense layers once, each inside a name scope equal to its layer name."""
        if self.built:
            return
        self.built = True
        # Both layers are built for inputs whose last dimension is `d_model`.
        for attr_name in ("linear_hidden", "linear_out"):
            dense_layer = getattr(self, attr_name, None)
            if dense_layer is None:
                continue
            with tf.name_scope(dense_layer.name):
                dense_layer.build([None, None, self.config.d_model])
    @staticmethod
    def convert_attention_mask(attention_mask: tf.Tensor, dtype: tf.DType = tf.float32) -> tf.Tensor:
        """
        Declared to convert a 2D Tensor to a boolean mask with shape [batch_size, 1, 1, sequence_length].

        NOTE(review): only the signature and this docstring are present. There is no implementation, so calling
        this method returns `None` rather than a tensor. Confirm whether the body was lost or whether this stub
        should be removed.

        Args:
            attention_mask (:obj:`tf.Tensor`): The attention mask.
            dtype (:obj:`tf.DType`, `optional`, defaults to :obj:`tf.float32`):
                The datatype of the resulting mask tensor.

        Returns:
            :obj:`tf.Tensor`: The boolean mask tensor (intended; see the note above).
        """
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`


    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!


    </Tip>


    Parameters:
        config ([`XxxConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
这是一个长字符串，用于文档化函数参数说明。
详细说明了模型输入的各个参数及其形状和含义。
"""

@add_start_docstrings(
    """
    基础的Funnel Transformer模型，输出原始隐藏状态，没有上采样头（也称为解码器）或任何特定任务的头部。
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelBaseModel(TFFunnelPreTrainedModel):
    """
    Model wrapper around `TFFunnelBaseLayer`; every call is forwarded to that layer.
    """

    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        # The wrapped layer is named "funnel"; that name is also the name scope entered in `build`.
        self.funnel = TFFunnelBaseLayer(config, name="funnel")

    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small-base",
        output_type=TFBaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFBaseModelOutput]:
        # Pure pass-through: every argument goes to the wrapped layer under the same name.
        outputs = self.funnel(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs

    def serving_output(self, output):
        # Re-wrap the three fields in a fresh TFBaseModelOutput. `hidden_states` and `attentions` are passed
        # through without tf.convert_to_tensor because their entries have different shapes.
        last_hidden_state = output.last_hidden_state
        return TFBaseModelOutput(
            last_hidden_state=last_hidden_state,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )

    def build(self, input_shape=None):
        # Build the wrapped layer once, inside a name scope equal to its name.
        if self.built:
            return
        self.built = True
        funnel_layer = getattr(self, "funnel", None)
        if funnel_layer is None:
            return
        with tf.name_scope(funnel_layer.name):
            funnel_layer.build(None)
@add_start_docstrings(
    """
    The bare Funnel Transformer Model transformer outputting raw hidden-states without any specific head on top.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelModel(TFFunnelPreTrainedModel):
    """
    Model wrapper around `TFFunnelMainLayer`; every call is forwarded to that layer.

    Args:
        config (FunnelConfig): The model configuration instance.

    Attributes:
        funnel (TFFunnelMainLayer): The wrapped main layer.

    """

    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        # The wrapped layer is named "funnel"; that name is also the name scope entered in `build`.
        self.funnel = TFFunnelMainLayer(config, name="funnel")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small",
        output_type=TFBaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFBaseModelOutput]:
        """
        Forward all arguments, unchanged and by name, to the wrapped `TFFunnelMainLayer`.

        Args:
            input_ids (TFModelInputType | None): Input token IDs.
            attention_mask (np.ndarray | tf.Tensor | None): Attention mask.
            token_type_ids (np.ndarray | tf.Tensor | None): Token type IDs.
            inputs_embeds (np.ndarray | tf.Tensor | None): Pre-computed input embeddings.
            output_attentions (Optional[bool]): Whether to return attentions.
            output_hidden_states (Optional[bool]): Whether to return hidden states.
            return_dict (Optional[bool]): Whether to return an output object instead of a tuple.
            training (bool): Training-mode flag.

        Returns:
            Union[Tuple[tf.Tensor], TFBaseModelOutput]: Whatever the wrapped layer returns.

        """
        outputs = self.funnel(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs

    def serving_output(self, output):
        """
        Re-wrap `output` in a fresh `TFBaseModelOutput`.

        Args:
            output: Object exposing `last_hidden_state`, `hidden_states` and `attentions`.

        Returns:
            TFBaseModelOutput: A new output object holding those three fields as-is.

        """
        # hidden_states and attentions are passed through without any tensor conversion.
        last_hidden_state = output.last_hidden_state
        return TFBaseModelOutput(
            last_hidden_state=last_hidden_state,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )

    def build(self, input_shape=None):
        """
        Build the wrapped layer once, inside a name scope equal to its name.

        Args:
            input_shape: Not used; the wrapped layer is built with `None`.

        """
        if self.built:
            return
        self.built = True
        funnel_layer = getattr(self, "funnel", None)
        if funnel_layer is None:
            return
        with tf.name_scope(funnel_layer.name):
            funnel_layer.build(None)


@add_start_docstrings(
    """
    Funnel model with a binary classification head on top as used during pretraining for identifying generated tokens.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForPreTraining(TFFunnelPreTrainedModel):
    """
    Funnel Transformer model for pretraining with a binary classification head.

    Args:
        config (FunnelConfig): The model configuration class instance.

    Attributes:
        funnel (TFFunnelMainLayer): The main layer of the Funnel Transformer.
        discriminator_predictions (TFFunnelDiscriminatorPredictions): Head applied to the backbone's sequence output.

    """

    def __init__(self, config: FunnelConfig, **kwargs) -> None:
        super().__init__(config, **kwargs)

        # Backbone and discriminator head; the `name` values are the layers' Keras names.
        self.funnel = TFFunnelMainLayer(config, name="funnel")
        self.discriminator_predictions = TFFunnelDiscriminatorPredictions(config, name="discriminator_predictions")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TFFunnelForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
        **kwargs,
    ) -> Union[Tuple[tf.Tensor], TFFunnelForPreTrainingOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, TFFunnelForPreTraining

        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
        >>> model = TFFunnelForPreTraining.from_pretrained("funnel-transformer/small")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
        >>> logits = model(inputs).logits
        ```"""
        # NOTE: the bare "Returns:" line above is the placeholder that `replace_return_docstrings` fills in,
        # so no hand-written return description is kept under it. Extra `**kwargs` are accepted but unused here.

        # Run the backbone; the first six arguments are passed positionally in the backbone's parameter order.
        discriminator_hidden_states = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Element 0 of the backbone output is the sequence output fed to the head.
        discriminator_sequence_output = discriminator_hidden_states[0]
        logits = self.discriminator_predictions(discriminator_sequence_output)

        # Tuple form: logits followed by whatever else the backbone returned after its first element.
        if not return_dict:
            return (logits,) + discriminator_hidden_states[1:]

        return TFFunnelForPreTrainingOutput(
            logits=logits,
            hidden_states=discriminator_hidden_states.hidden_states,
            attentions=discriminator_hidden_states.attentions,
        )

    def serving_output(self, output):
        """Return a new `TFFunnelForPreTrainingOutput` carrying `output`'s logits, hidden states and attentions."""
        # hidden_states and attentions are forwarded without conversion to a single tensor.
        return TFFunnelForPreTrainingOutput(
            logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions
        )

    def build(self, input_shape=None):
        """Build the `funnel` and `discriminator_predictions` sub-layers once, each under its own name scope."""
        if self.built:
            return
        # Set the flag before building so that a repeated call is a no-op.
        self.built = True
        if getattr(self, "funnel", None) is not None:
            with tf.name_scope(self.funnel.name):
                self.funnel.build(None)
        if getattr(self, "discriminator_predictions", None) is not None:
            with tf.name_scope(self.discriminator_predictions.name):
                self.discriminator_predictions.build(None)
@add_start_docstrings("""Funnel Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING)
class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss):
    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)

        # Backbone layer, registered under the Keras name "funnel".
        self.funnel = TFFunnelMainLayer(config, name="funnel")
        # LM head; it is handed the backbone's embeddings layer at construction time.
        self.lm_head = TFFunnelMaskedLMHead(config, self.funnel.embeddings, name="lm_head")

    def get_lm_head(self) -> TFFunnelMaskedLMHead:
        """Return the language-modeling head layer."""
        return self.lm_head

    def get_prefix_bias_name(self) -> str:
        """Return "<model name>/<lm_head name>". Deprecated: emits a `FutureWarning` on every call."""
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        return "/".join((self.name, self.lm_head.name))

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small",
        output_type=TFMaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFMaskedLMOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        # Forward pass through the backbone.
        backbone_outputs = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Element 0 of the backbone output goes through the LM head to produce the prediction scores.
        prediction_scores = self.lm_head(backbone_outputs[0], training=training)

        # The loss is only computed when labels were supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, prediction_scores)

        if return_dict:
            return TFMaskedLMOutput(
                loss=loss,
                logits=prediction_scores,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple form: optional loss, then scores, then the remaining backbone outputs.
        tuple_output = (prediction_scores,) + backbone_outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
        """Return a new `TFMaskedLMOutput` with `output`'s logits, hidden states and attentions (no loss)."""
        # hidden_states and attentions are forwarded without conversion to a single tensor.
        return TFMaskedLMOutput(
            logits=output.logits,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )

    def build(self, input_shape=None):
        """Build the `funnel` and `lm_head` sub-layers once, each inside its own name scope."""
        if self.built:
            return
        # Set the flag before building so that a repeated call is a no-op.
        self.built = True

        for attribute in ("funnel", "lm_head"):
            sublayer = getattr(self, attribute, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
@add_start_docstrings(
    """
    Funnel Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForSequenceClassification(TFFunnelPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # Base (TFFunnelBaseLayer) backbone, followed by a head producing `config.num_labels` outputs.
        self.funnel = TFFunnelBaseLayer(config, name="funnel")
        self.classifier = TFFunnelClassificationHead(config, config.num_labels, name="classifier")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small-base",
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFSequenceClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Forward pass through the backbone.
        backbone_outputs = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Pooling: keep index 0 along the second axis of the last hidden state.
        pooled_output = backbone_outputs[0][:, 0]
        logits = self.classifier(pooled_output, training=training)

        # The loss is only computed when labels were supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, logits)

        if return_dict:
            return TFSequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple form: optional loss, then logits, then the remaining backbone outputs.
        tuple_output = (logits,) + backbone_outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
        """Return a new `TFSequenceClassifierOutput` with `output`'s logits, hidden states and attentions."""
        # hidden_states and attentions are forwarded without conversion to a single tensor.
        return TFSequenceClassifierOutput(
            logits=output.logits,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )

    def build(self, input_shape=None):
        """Build the `funnel` and `classifier` sub-layers once, each inside its own name scope."""
        if self.built:
            return
        # Set the flag before building so that a repeated call is a no-op.
        self.built = True
        for attribute in ("funnel", "classifier"):
            sublayer = getattr(self, attribute, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
@add_start_docstrings(
    """
    Funnel Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForMultipleChoice(TFFunnelPreTrainedModel, TFMultipleChoiceLoss):
    """
    Funnel model with a multiple-choice classification head on top, e.g. for RocStories/SWAG tasks.

    Inherits from `TFFunnelPreTrainedModel` and `TFMultipleChoiceLoss`.
    """

    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        """
        Create the backbone and the single-output classification head.

        Args:
            config (FunnelConfig): Configuration object of the Funnel model.
            *inputs: Positional arguments forwarded to the parent constructor.
            **kwargs: Keyword arguments forwarded to the parent constructor.
        """
        super().__init__(config, *inputs, **kwargs)

        # Base (TFFunnelBaseLayer) backbone, registered under the Keras name "funnel".
        self.funnel = TFFunnelBaseLayer(config, name="funnel")
        # The head emits one value per flattened (example, choice) row, hence the output size of 1.
        self.classifier = TFFunnelClassificationHead(config, 1, name="classifier")

    @property
    def dummy_inputs(self):
        """
        Dummy inputs for this model.

        Returns:
            dict: A single key, "input_ids", mapped to an int32 tensor of ones with shape (3, 3, 4).
        """
        return {"input_ids": tf.ones((3, 3, 4), dtype=tf.int32)}

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small-base",
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
        **kwargs,
    ) -> Union[Tuple[tf.Tensor], TFMultipleChoiceModelOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices - 1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # Extra `**kwargs` are accepted for interface compatibility but are not used by this method.

        # Read the number of choices (axis 1) and the sequence length (axis 2) from whichever input was given.
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

        # Merge the leading axes so every (example, choice) pair becomes one row for the backbone.
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_inputs_embeds = (
            tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )

        # Forward pass through the backbone on the flattened inputs.
        outputs = self.funnel(
            flat_input_ids,
            attention_mask=flat_attention_mask,
            token_type_ids=flat_token_type_ids,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Pooling: keep index 0 along the second axis of the last hidden state.
        last_hidden_state = outputs[0]
        pooled_output = last_hidden_state[:, 0]

        logits = self.classifier(pooled_output, training=training)

        # Regroup the per-row scores into one row of `num_choices` scores per example.
        reshaped_logits = tf.reshape(logits, (-1, num_choices))

        # The loss is only computed when labels were supplied.
        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)

        # Tuple form: optional loss, then logits, then the remaining backbone outputs.
        if not return_dict:
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoiceModelOutput:
        """Return a new `TFMultipleChoiceModelOutput` with `output`'s logits, hidden states and attentions."""
        return TFMultipleChoiceModelOutput(
            logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions
        )

    def build(self, input_shape=None):
        """Build the `funnel` and `classifier` sub-layers once, each inside its own name scope."""
        if self.built:
            return
        # Set the flag before building so that a repeated call is a no-op.
        self.built = True

        if getattr(self, "funnel", None) is not None:
            with tf.name_scope(self.funnel.name):
                self.funnel.build(None)

        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build(None)
@add_start_docstrings(
    """
    Funnel Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForTokenClassification(TFFunnelPreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # Backbone layer, registered under the Keras name "funnel".
        self.funnel = TFFunnelMainLayer(config, name="funnel")
        # Dropout applied to the sequence output before classification.
        self.dropout = keras.layers.Dropout(config.hidden_dropout)
        # Dense projection from hidden states to `config.num_labels` logits.
        self.classifier = keras.layers.Dense(
            config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        # Kept on the instance because `build` reads `config.hidden_size`.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small",
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFTokenClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # Forward pass through the backbone.
        backbone_outputs = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Element 0 of the backbone output goes through dropout, then the dense classifier.
        dropped_sequence_output = self.dropout(backbone_outputs[0], training=training)
        logits = self.classifier(dropped_sequence_output)

        # The loss is only computed when labels were supplied.
        if labels is None:
            loss = None
        else:
            loss = self.hf_compute_loss(labels, logits)

        if return_dict:
            return TFTokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple form: optional loss, then logits, then the remaining backbone outputs.
        tuple_output = (logits,) + backbone_outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput:
        """Return a new `TFTokenClassifierOutput` with `output`'s logits, hidden states and attentions."""
        # hidden_states and attentions are forwarded without conversion to a single tensor.
        return TFTokenClassifierOutput(
            logits=output.logits,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )

    def build(self, input_shape=None):
        """Build the `funnel` backbone and the `classifier` dense layer once, each inside its own name scope."""
        if self.built:
            return
        # Set the flag before building so that a repeated call is a no-op.
        self.built = True

        backbone = getattr(self, "funnel", None)
        if backbone is not None:
            with tf.name_scope(backbone.name):
                backbone.build(None)

        head = getattr(self, "classifier", None)
        if head is not None:
            with tf.name_scope(head.name):
                # The dense layer is built for inputs whose last dimension is the configured hidden size.
                head.build([None, None, self.config.hidden_size])
# The decorator prepends the model description and the shared start docstring to the class docstring.
@add_start_docstrings(
    """
    Funnel Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    FUNNEL_START_DOCSTRING,
)
class TFFunnelForQuestionAnswering(TFFunnelPreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # Backbone layer, registered under the Keras name "funnel".
        self.funnel = TFFunnelMainLayer(config, name="funnel")

        # Dense projection from hidden states to `config.num_labels` outputs (split into start/end in `call`).
        self.qa_outputs = keras.layers.Dense(
            config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="qa_outputs",
        )

        # Kept on the instance because `build` reads `config.hidden_size`.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="funnel-transformer/small",
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFQuestionAnsweringModelOutput]:
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # Forward pass through the backbone.
        backbone_outputs = self.funnel(
            input_ids,
            attention_mask,
            token_type_ids,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # Project element 0 of the backbone output, split the last axis in two, and drop that axis from each half.
        logits = self.qa_outputs(backbone_outputs[0])
        start_logits, end_logits = [tf.squeeze(half, axis=-1) for half in tf.split(logits, 2, axis=-1)]

        # The loss requires both position tensors; otherwise it stays None.
        if start_positions is None or end_positions is None:
            loss = None
        else:
            labels = {"start_position": start_positions, "end_position": end_positions}
            loss = self.hf_compute_loss(labels, (start_logits, end_logits))

        if return_dict:
            return TFQuestionAnsweringModelOutput(
                loss=loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple form: optional loss, then both logits tensors, then the remaining backbone outputs.
        tuple_output = (start_logits, end_logits) + backbone_outputs[1:]
        if loss is None:
            return tuple_output
        return (loss,) + tuple_output

    def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
        """Return a new `TFQuestionAnsweringModelOutput` with `output`'s logits, hidden states and attentions."""
        # hidden_states and attentions are forwarded without conversion to a single tensor.
        return TFQuestionAnsweringModelOutput(
            start_logits=output.start_logits,
            end_logits=output.end_logits,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )

    def build(self, input_shape=None):
        """Build the `funnel` backbone and the `qa_outputs` dense layer once, each inside its own name scope."""
        if self.built:
            return
        # Set the flag before building so that a repeated call is a no-op.
        self.built = True

        backbone = getattr(self, "funnel", None)
        if backbone is not None:
            with tf.name_scope(backbone.name):
                backbone.build(None)

        head = getattr(self, "qa_outputs", None)
        if head is not None:
            with tf.name_scope(head.name):
                # The dense layer is built for inputs whose last dimension is the configured hidden size.
                head.build([None, None, self.config.hidden_size])

`.\models\funnel\tokenization_funnel.py`

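# The source file is encoded as UTF-8.
# Copyright notice: the code belongs to the HuggingFace Inc. team and is released under the Apache License, Version 2.0.
# `load_vocab`, defined further down, loads a vocabulary file from a given path and returns it as an ordered dictionary.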
import collections
import os
import unicodedata
from typing import List, Optional, Tuple

# 从 tokenization_utils 模块中导入需要用到的函数和类
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging

# Module-level logger, obtained from the project's logging helper and named after this module.
logger = logging.get_logger(__name__)

# Maps the "vocab_file" slot to the file name the vocabulary is stored under.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Checkpoint suffixes: every size comes in a plain and a "-base" variant, listed size by size.
_model_names = [
    f"{size}{variant}"
    for size in ("small", "medium", "intermediate", "large", "xlarge")
    for variant in ("", "-base")
]

# Download URL of the vocabulary file for each "funnel-transformer/<name>" checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        f"funnel-transformer/{model}": f"https://huggingface.co/funnel-transformer/{model}/resolve/main/vocab.txt"
        for model in _model_names
    }
}

# Every checkpoint is registered with a positional-embedding size of 512.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(
    (f"funnel-transformer/{model}" for model in _model_names), 512
)

# Every checkpoint defaults to lower-casing; each key gets its own dict instance.
PRETRAINED_INIT_CONFIGURATION = {f"funnel-transformer/{model}": {"do_lower_case": True} for model in _model_names}

# Mirrors transformers.models.bert.tokenization_bert.load_vocab: reads a vocabulary file into an ordered dict.
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    with open(vocab_file, "r", encoding="utf-8") as handle:
        lines = handle.readlines()
    # Each line (minus its trailing "\n") is a token and its line index is the id; a repeated token keeps
    # its first position in the ordering but ends up with the index of its last occurrence.
    return collections.OrderedDict((line.rstrip("\n"), position) for position, line in enumerate(lines))
# 从transformers.models.bert.tokenization_bert.whitespace_tokenize复制过来的函数
def whitespace_tokenize(text):
    """
    Run basic whitespace cleaning and splitting on a piece of text.

    Args:
        text: The string to split.

    Returns:
        List of whitespace-separated tokens; an empty list when `text` is empty or only whitespace.
    """
    stripped = text.strip()
    # Blank input is handled explicitly and yields an empty token list.
    return stripped.split() if stripped else []


class FunnelTokenizer(PreTrainedTokenizer):
    r"""
    Construct a Funnel Transformer tokenizer. Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to do basic tokenization before WordPiece.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"<sep>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"<cls>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sentence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sentence token.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    """

    # File names, download locations and default init kwargs for the pretrained checkpoints.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Token type id given to the leading classifier position (see `create_token_type_ids_from_sequences`).
    cls_token_type_id: int = 2

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        """
        Load the vocabulary, build the basic and WordPiece sub-tokenizers, then initialize the base class.

        Raises:
            ValueError: If `vocab_file` is not an existing file.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = FunnelTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # NOTE(review): the vocab and sub-tokenizers are built before `super().__init__` runs; presumably the base
        # initializer relies on them, so keep this order.
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping (id -> token), in the same order as the vocabulary.
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            bos_token=bos_token,
            eos_token=eos_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    @property
    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
    def do_lower_case(self):
        """Lower-casing flag of the basic tokenizer (which only exists when `do_basic_tokenize=True`)."""
        return self.basic_tokenizer.do_lower_case

    @property
    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens are not counted)."""
        return len(self.vocab)

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
    def get_vocab(self):
        """Return the base vocabulary merged with the added-token encoder as a plain `dict`."""
        return dict(self.vocab, **self.added_tokens_encoder)

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
    def _tokenize(self, text, split_special_tokens=False):
        """
        Split `text` into WordPiece tokens.

        When basic tokenization is enabled the text is first split by the basic tokenizer; tokens listed in its
        `never_split` set are kept whole, every other token is passed through the WordPiece tokenizer.
        """
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                # If the token is part of the never_split set, keep it untouched
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Unknown tokens fall back to the id of `unk_token`.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Unknown ids fall back to `unk_token`.
        return self.ids_to_tokens.get(index, self.unk_token)

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Join with spaces, then glue WordPiece continuations ("##...") back onto the preceding piece.
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # Inputs that already contain special tokens are handled by the base class.
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Layout mirrors `build_inputs_with_special_tokens`: cls, A, sep[, B, sep].
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
        Transformer sequence pair mask has the following format:

        ```
        2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (the `cls_token_type_id`
        followed by 0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # The classifier position gets its own type id; each segment's count includes its trailing separator.
        if token_ids_1 is None:
            return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
        return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary to disk, one token per line, ordered by token index.

        Args:
            save_directory (`str`):
                Existing directory to write the vocabulary file into. If it is not an existing directory, the value is
                used as the output file path itself.
            filename_prefix (`str`, *optional*):
                Prefix prepended (followed by `-`) to the output file name.

        Returns:
            `Tuple[str]`: One-element tuple holding the path of the written file.
        """
        # Next index expected while walking the vocabulary; used to detect gaps.
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            # Not an existing directory: treat `save_directory` as the target file path.
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory

        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    # Resynchronize so the warning is emitted once per gap, not once per token.
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return (vocab_file,)
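# NOTE(review): the original comment here is truncated ("Copied from transformers.models.b..."). The methods below
# presumably belong to `BasicTokenizer`, whose `class` header and `__init__` are missing from this excerpt.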
    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # Tokens protected from lower-casing, accent stripping and punctuation splitting: the instance-level set,
        # extended by the per-call list when one is given.
        protected = self.never_split
        if never_split:
            protected = protected.union(set(never_split))

        cleaned = self._clean_text(text)
        # Added on November 1st, 2018 for the multilingual and Chinese models. It is also applied to the English
        # models, even though they were not trained on any Chinese data.
        if self.tokenize_chinese_chars:
            cleaned = self._tokenize_chinese_chars(cleaned)
        # NFC-normalize so that different encodings of the same character compare equal.
        normalized = unicodedata.normalize("NFC", cleaned)

        pieces = []
        for word in whitespace_tokenize(normalized):
            if word not in protected:
                if self.do_lower_case:
                    word = word.lower()
                    # When lower-casing, accents are stripped unless `strip_accents` is explicitly False.
                    strip = self.strip_accents is not False
                else:
                    # Without lower-casing, accents are stripped only when `strip_accents` is truthy.
                    strip = bool(self.strip_accents)
                if strip:
                    word = self._run_strip_accents(word)
            pieces.extend(self._run_split_on_punc(word, protected))

        # Re-split on whitespace after joining the pieces.
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # 对文本进行 Unicode 规范化，确保各种形式的重音符号都能被正确处理
        text = unicodedata.normalize("NFD", text)
        output = []
        # 遍历文本中的每个字符
        for char in text:
            # 获取字符的 Unicode 分类
            cat = unicodedata.category(char)
            # 如果字符是重音符号，跳过
            if cat == "Mn":
                continue
            # 否则将字符加入到输出列表中
            output.append(char)
        # 将字符列表组合成字符串作为输出
        return "".join(output)
    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        # 如果不需要根据标点符号拆分，或者指定的文本在不拆分列表中，直接返回文本列表
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        # 将文本转换为字符列表
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            # 如果当前字符是标点符号，将其作为新的列表项添加到输出中
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                # 如果不是标点符号，检查是否应该开始新单词
                if start_new_word:
                    output.append([])
                start_new_word = False
                # 将字符添加到当前列表项中
                output[-1].append(char)
            i += 1

        # 将每个列表项中的字符连接成字符串，形成最终的拆分结果
        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            # 如果字符是中文字符，将其两侧添加空格后添加到输出中
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        # 将列表转换为字符串并返回
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # 检查给定的码点是否是CJK字符的码点范围
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            # 如果字符是无效字符或控制字符，直接跳过
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            # 如果是空白字符，替换为单个空格，否则保留字符
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        # 将列表转换为字符串并返回
        return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        """
        Args:
            vocab: Container of known word pieces; only membership tests (`in`) are performed on it.
            unk_token: Token emitted for a word that cannot be segmented.
            max_input_chars_per_word: Words longer than this many characters map directly to `unk_token`.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        pieces = []
        for word in whitespace_tokenize(text):
            # Overlong words are not segmented at all.
            if len(word) > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue

            word_pieces = []
            unmatched = False
            begin = 0
            while begin < len(word):
                match = None
                # Try the longest remaining substring first and shrink it from the right.
                for stop in range(len(word), begin, -1):
                    candidate = word[begin:stop]
                    if begin > 0:
                        # Pieces that do not start the word carry the continuation prefix.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        match = candidate
                        break
                if match is None:
                    # No piece matches at this position: the whole word becomes unknown.
                    unmatched = True
                    break
                word_pieces.append(match)
                begin = stop

            if unmatched:
                pieces.append(self.unk_token)
            else:
                pieces.extend(word_pieces)
        return pieces

`.\models\funnel\tokenization_funnel_fast.py`

# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for Funnel Transformer."""

import json
from typing import List, Optional, Tuple

from tokenizers import normalizers  # 导入tokenizers模块中的normalizers

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # 导入预训练分词器类
from ...utils import logging  # 导入日志工具
from .tokenization_funnel import FunnelTokenizer  # 导入FunnelTokenizer类

logger = logging.get_logger(__name__)  # module-level logger

# File names of the vocabulary and of the serialized tokenizer
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# Checkpoint name suffixes; the module-level dict comprehensions prefix each with "funnel-transformer/"
_model_names = [
    "small",
    "small-base",
    "medium",
    "medium-base",
    "intermediate",
    "intermediate-base",
    "large",
    "large-base",
    "xlarge",
    "xlarge-base",
]

# Download locations of the pretrained files, keyed first by file kind ("vocab_file" / "tokenizer_file") and then by
# checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "funnel-transformer/small": "https://huggingface.co/funnel-transformer/small/resolve/main/vocab.txt",
        "funnel-transformer/small-base": "https://huggingface.co/funnel-transformer/small-base/resolve/main/vocab.txt",
        "funnel-transformer/medium": "https://huggingface.co/funnel-transformer/medium/resolve/main/vocab.txt",
        "funnel-transformer/medium-base": (
            "https://huggingface.co/funnel-transformer/medium-base/resolve/main/vocab.txt"
        ),
        "funnel-transformer/intermediate": (
            "https://huggingface.co/funnel-transformer/intermediate/resolve/main/vocab.txt"
        ),
        "funnel-transformer/intermediate-base": (
            "https://huggingface.co/funnel-transformer/intermediate-base/resolve/main/vocab.txt"
        ),
        "funnel-transformer/large": "https://huggingface.co/funnel-transformer/large/resolve/main/vocab.txt",
        "funnel-transformer/large-base": "https://huggingface.co/funnel-transformer/large-base/resolve/main/vocab.txt",
        "funnel-transformer/xlarge": "https://huggingface.co/funnel-transformer/xlarge/resolve/main/vocab.txt",
        "funnel-transformer/xlarge-base": (
            "https://huggingface.co/funnel-transformer/xlarge-base/resolve/main/vocab.txt"
        ),
    },
    # One tokenizer.json download link per checkpoint.
    "tokenizer_file": {
        "funnel-transformer/small": "https://huggingface.co/funnel-transformer/small/resolve/main/tokenizer.json",
        "funnel-transformer/small-base": (
            "https://huggingface.co/funnel-transformer/small-base/resolve/main/tokenizer.json"
        ),
        "funnel-transformer/medium": "https://huggingface.co/funnel-transformer/medium/resolve/main/tokenizer.json",
        "funnel-transformer/medium-base": (
            "https://huggingface.co/funnel-transformer/medium-base/resolve/main/tokenizer.json"
        ),
        "funnel-transformer/intermediate": (
            "https://huggingface.co/funnel-transformer/intermediate/resolve/main/tokenizer.json"
        ),
        "funnel-transformer/intermediate-base": (
            "https://huggingface.co/funnel-transformer/intermediate-base/resolve/main/tokenizer.json"
        ),
        "funnel-transformer/large": "https://huggingface.co/funnel-transformer/large/resolve/main/tokenizer.json",
        "funnel-transformer/large-base": (
            "https://huggingface.co/funnel-transformer/large-base/resolve/main/tokenizer.json"
        ),
        "funnel-transformer/xlarge": "https://huggingface.co/funnel-transformer/xlarge/resolve/main/tokenizer.json",
        "funnel-transformer/xlarge-base": (
            "https://huggingface.co/funnel-transformer/xlarge-base/resolve/main/tokenizer.json"
        ),
    },
}
# Positional-embedding size per checkpoint: every "funnel-transformer/{name}" key maps to 512.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {f"funnel-transformer/{name}": 512 for name in _model_names}

# Default init kwargs per checkpoint: every "funnel-transformer/{name}" key maps to {"do_lower_case": True}.
PRETRAINED_INIT_CONFIGURATION = {f"funnel-transformer/{name}": {"do_lower_case": True} for name in _model_names}


class FunnelTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            File containing the vocabulary.
        tokenizer_file (`str`, *optional*):
            Serialized tokenizer file, forwarded to the base class.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        sep_token (`str`, *optional*, defaults to `"<sep>"`):
            The separator token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        cls_token (`str`, *optional*, defaults to `"<cls>"`):
            The classifier token.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sentence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sentence token.
        clean_text (`bool`, *optional*, defaults to `True`):
            Forwarded unchanged to the base class.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents.
        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
            Forwarded unchanged to the base class.
    """

    # File names, download locations and default init kwargs for the pretrained checkpoints.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # Slow tokenizer counterpart of this class.
    slow_tokenizer_class = FunnelTokenizer
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Token type id given to the leading classifier position.
    cls_token_type_id: int = 2

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        bos_token="<s>",
        eos_token="</s>",
        clean_text=True,
        tokenize_chinese_chars=True,
        strip_accents=None,
        wordpieces_prefix="##",
        **kwargs,
    ):
        """Initialize the base class, then align the backend normalizer with the requested options."""
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            bos_token=bos_token,
            eos_token=eos_token,
            clean_text=clean_text,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            wordpieces_prefix=wordpieces_prefix,
            **kwargs,
        )

        # Normalizer options requested by the caller, keyed by the names used in the normalizer state.
        requested = {
            "lowercase": do_lower_case,
            "strip_accents": strip_accents,
            "handle_chinese_chars": tokenize_chinese_chars,
        }
        state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        # Rebuild the normalizer only when a stored option differs from the requested one; options missing from
        # the state count as matching.
        if any(state.get(option, wanted) != wanted for option, wanted in requested.items()):
            normalizer_cls = getattr(normalizers, state.pop("type"))
            state.update(requested)
            self.backend_tokenizer.normalizer = normalizer_cls(**state)

        self.do_lower_case = do_lower_case

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A Funnel sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        first_segment = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        if token_ids_1 is None:
            return first_segment
        # The second sequence is appended with its own trailing separator.
        return first_segment + token_ids_1 + [self.sep_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
        Transformer sequence pair mask has the following format:

        ```
        2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (the `cls_token_type_id`
        followed by 0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        # The classifier position gets its own type id; each segment's count includes its trailing separator.
        type_ids = [self.cls_token_type_id] * len(cls) + [0] * len(token_ids_0 + sep)
        if token_ids_1 is not None:
            type_ids += [1] * len(token_ids_1 + sep)
        return type_ids

    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Save the backend tokenizer model into `save_directory` and return the written file names as a tuple."""
        return tuple(self._tokenizer.model.save(save_directory, name=filename_prefix))

`.\models\funnel\__init__.py`

# 导入类型检查标记
from typing import TYPE_CHECKING

# 导入工具函数和异常类
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Map each submodule name to the public names it exports. Entries that need an optional
# backend are added below only when that backend is available.
_import_structure = {
    "configuration_funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig"],
    "convert_funnel_original_tf_checkpoint_to_pytorch": [],
    "tokenization_funnel": ["FunnelTokenizer"],
}

# Register the fast tokenizer only when the `tokenizers` library is available.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # `tokenizers` is available: expose `FunnelTokenizerFast`.
    _import_structure["tokenization_funnel_fast"] = ["FunnelTokenizerFast"]

# Register the PyTorch models only when PyTorch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # PyTorch is available: expose the names from `modeling_funnel`.
    _import_structure["modeling_funnel"] = [
        "FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST",
        "FunnelBaseModel",
        "FunnelForMaskedLM",
        "FunnelForMultipleChoice",
        "FunnelForPreTraining",
        "FunnelForQuestionAnswering",
        "FunnelForSequenceClassification",
        "FunnelForTokenClassification",
        "FunnelModel",
        "FunnelPreTrainedModel",
        "load_tf_weights_in_funnel",
    ]

# Register the TensorFlow models only when TensorFlow is available.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # TensorFlow is available: expose the names from `modeling_tf_funnel`.
    _import_structure["modeling_tf_funnel"] = [
        "TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFFunnelBaseModel",
        "TFFunnelForMaskedLM",
        "TFFunnelForMultipleChoice",
        "TFFunnelForPreTraining",
        "TFFunnelForQuestionAnswering",
        "TFFunnelForSequenceClassification",
        "TFFunnelForTokenClassification",
        "TFFunnelModel",
        "TFFunnelPreTrainedModel",
    ]

# When `TYPE_CHECKING` is true, import the real symbols eagerly; otherwise replace this module
# with a `_LazyModule` built from `_import_structure`.
if TYPE_CHECKING:
    from .configuration_funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig
    from .tokenization_funnel import FunnelTokenizer

    # Fast tokenizer: only when the `tokenizers` library is available.
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_funnel_fast import FunnelTokenizerFast

    # PyTorch models: only when PyTorch is available.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # This import must live in the `else` branch so it is skipped when PyTorch is missing,
        # mirroring the guarded `_import_structure["modeling_funnel"]` registration.
        from .modeling_funnel import (
            FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST,
            FunnelBaseModel,
            FunnelForMaskedLM,
            FunnelForMultipleChoice,
            FunnelForPreTraining,
            FunnelForQuestionAnswering,
            FunnelForSequenceClassification,
            FunnelForTokenClassification,
            FunnelModel,
            FunnelPreTrainedModel,
            load_tf_weights_in_funnel,
        )

    # TensorFlow models: only when TensorFlow is available.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_funnel import (
            TF_FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFFunnelBaseModel,
            TFFunnelForMaskedLM,
            TFFunnelForMultipleChoice,
            TFFunnelForPreTraining,
            TFFunnelForQuestionAnswering,
            TFFunnelForSequenceClassification,
            TFFunnelForTokenClassification,
            TFFunnelModel,
            TFFunnelPreTrainedModel,
        )
else:
    import sys

    # Swap this module's entry in `sys.modules` for a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\fuyu\configuration_fuyu.py`

# coding=utf-8
# Copyright 2023 Adept AI and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fuyu model configuration"""

# 导入所需的模块和类
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING

# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Maps each pretrained checkpoint name to the URL of its `config.json`.
FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "adept/fuyu-8b": "https://huggingface.co/adept/fuyu-8b/resolve/main/config.json",
}

# 定义 FuyuConfig 类，继承自 PretrainedConfig 类
class FuyuConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate an
    Fuyu model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [adept/fuyu-8b](https://huggingface.co/adept/fuyu-8b).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Keyword arguments used to build the text sub-configuration. Its `"model_type"` entry selects the class
            looked up in `CONFIG_MAPPING` (falling back to `"persimmon"` when absent). When `None`, the dictionary
            is assembled from the other arguments of this constructor.
        rope_scaling (`dict`, *optional*):
            When set, must be a two-field dictionary with a `type` of `"linear"` or `"dynamic"` and a float
            `factor` strictly greater than 1; anything else raises a `ValueError`.

    ```
    >>> from transformers import FuyuConfig

    >>> # Initializing a Fuyu fuyu-7b style configuration
    >>> configuration = FuyuConfig()
    ```
    """

    # Identifier of this model family.
    model_type = "fuyu"
    # Output keys that are ignored at inference time.
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=262144,
        hidden_size=4096,
        intermediate_size=16384,
        num_hidden_layers=36,
        num_attention_heads=64,
        hidden_act="relu2",
        max_position_embeddings=16384,
        image_size=300,
        patch_size=30,
        num_channels=3,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=25000.0,
        rope_scaling=None,
        qk_layernorm=True,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        partial_rotary_factor=0.5,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        text_config=None,
        **kwargs,
    ):
        # Without an explicit text configuration, derive one from this constructor's own arguments.
        if text_config is None:
            text_config = {
                "vocab_size": vocab_size,
                "max_position_embeddings": max_position_embeddings,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "num_hidden_layers": num_hidden_layers,
                "num_attention_heads": num_attention_heads,
                "hidden_act": hidden_act,
                "initializer_range": initializer_range,
                "layer_norm_eps": layer_norm_eps,
                "use_cache": use_cache,
                "rope_theta": rope_theta,
                "rope_scaling": rope_scaling,
                "qk_layernorm": qk_layernorm,
                "hidden_dropout": hidden_dropout,
                "attention_dropout": attention_dropout,
                "partial_rotary_factor": partial_rotary_factor,
                "pad_token_id": pad_token_id,
                "bos_token_id": bos_token_id,
                "eos_token_id": eos_token_id,
                "tie_word_embeddings": tie_word_embeddings,
            }
            logger.info("text_config is None. initializing the text model with default values.")

        # Instantiate the text sub-configuration with the class registered for its model type.
        text_model_type = text_config.get("model_type", "persimmon")
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.qk_layernorm = qk_layernorm
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.partial_rotary_factor = partial_rotary_factor

        # Reject a malformed `rope_scaling` before handing over to the parent constructor.
        self._rope_scaling_validation()

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: If `self.rope_scaling` is set but is not a two-entry dictionary, if its `type` is not
                `"linear"` or `"dynamic"`, or if its `factor` is not a float strictly greater than 1.
        """
        # Nothing to validate when RoPE scaling is not configured.
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )

        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)

        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )

        # The factor must be a `float` instance; an integer such as 2 is rejected by this check.
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")

`.\models\fuyu\convert_fuyu_model_weights_to_hf.py`

# 引入命令行参数解析库
import argparse
# 引入操作系统相关功能的库
import os
# 引入系统相关的库
import sys
# 引入警告处理的库
import warnings

# 引入用于扁平化字典操作的库
import flatdict
# 引入PyTorch深度学习框架库
import torch

# 从transformers库中引入FuyuConfig、FuyuForCausalLM和LlamaTokenizer
from transformers import FuyuConfig, FuyuForCausalLM, LlamaTokenizer

# Prefer the fast tokenizer; fall back to the slow `LlamaTokenizer` when it cannot be imported.
try:
    from transformers import LlamaTokenizerFast

    tokenizer_class = LlamaTokenizerFast
except ImportError as e:
    # `warnings.warn` expects a string or a `Warning` instance. Passing the raw `ImportError`
    # can raise `TypeError` when a message-based warning filter is active, so pass its text.
    warnings.warn(str(e))
    warnings.warn(
        "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
    )
    tokenizer_class = LlamaTokenizer

# 多行注释，提供了代码示例的使用说明和模型加载方法

# Substring renames applied to checkpoint keys: each left-hand fragment found in a key is replaced
# by the right-hand fragment. `rename_state_dict` applies them in this order to the progressively
# rewritten key, so the order of the entries matters.
KEYS_TO_MODIFY_MAPPING = {
    "self_attention": "self_attn",
    "language_model.encoder": "language_model.model",
    "word_embeddings_for_head": "language_model.lm_head",
    "language_model.embedding.word_embeddings": "language_model.model.embed_tokens",
    "vit_encoder.linear_encoder": "vision_embed_tokens",
}

# Keys dropped from the converted state dict. `rename_state_dict` compares the full (renamed) key
# against this set, i.e. it is an exact match, not a substring match.
KEYS_TO_REMOVE = {
    "rotary_emb.inv_freq",
    "image_patch_projection",
    "image_patch_projection.weight",
    "image_patch_projection.bias",
}


# 定义一个函数，用于重命名给定state_dict的键
def rename_state_dict(state_dict):
    """
    Return a new dict with the keys of `state_dict` rewritten for the converted model.

    Every fragment listed in `KEYS_TO_MODIFY_MAPPING` is replaced, in mapping order, inside each key.
    Entries whose resulting key is a member of `KEYS_TO_REMOVE` are left out. Values are carried
    over untouched.
    """
    renamed = {}
    for source_name, tensor in state_dict.items():
        target_name = source_name
        # `str.replace` is a no-op when the fragment is absent, so no membership check is needed.
        for old_fragment, new_fragment in KEYS_TO_MODIFY_MAPPING.items():
            target_name = target_name.replace(old_fragment, new_fragment)
        if target_name not in KEYS_TO_REMOVE:
            renamed[target_name] = tensor
    return renamed


    # 返回模型的状态字典
    # 这行代码将函数的执行结果返回给调用者，通常用于将函数内部计算得到的结果传递出去
# 定义一个函数用于将 Fuyu 模型的检查点转换为 PyTorch 格式
def convert_fuyu_checkpoint(pytorch_dump_folder_path, ada_lib_path, pt_model_path, safe_serialization=False):
    """
    Load an original Fuyu `.pt` checkpoint, remap its keys and save it as a `FuyuForCausalLM` model.

    Args:
        pytorch_dump_folder_path: Directory that receives the saved model and its configuration.
        ada_lib_path: Path inserted at the front of `sys.path` before the checkpoint is loaded.
        pt_model_path: Path of the checkpoint file passed to `torch.load`.
        safe_serialization: Forwarded to `save_pretrained` of the model.
    """
    # Must happen before `torch.load` so the original code base is importable while loading.
    sys.path.insert(0, ada_lib_path)

    checkpoint = torch.load(pt_model_path, map_location="cpu")
    # Flatten the nested "model" entry into dot-separated keys, then rename them.
    converted_weights = rename_state_dict(flatdict.FlatDict(checkpoint["model"], "."))

    config = FuyuConfig()
    hf_model = FuyuForCausalLM(config).to(torch.bfloat16)
    hf_model.load_state_dict(converted_weights)

    hf_model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization)
    config.save_pretrained(pytorch_dump_folder_path)


# 主函数，用于解析命令行参数并调用转换函数
def main():
    """
    Command-line entry point: parse the arguments, convert the Fuyu checkpoint and save the tokenizer.

    The converted model and the tokenizer are both written to `--output_dir`.
    """

    def _str_to_bool(value):
        """
        Parse a command-line boolean.

        `type=bool` would treat every non-empty string (including "False") as `True`, so the
        accepted spellings are interpreted explicitly here.
        """
        normalized = value.strip().lower()
        if normalized in {"1", "true", "t", "yes", "y", "on"}:
            return True
        if normalized in {"", "0", "false", "f", "no", "n", "off"}:
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser()
    # NOTE(review): the help text mentions `tokenizer.model`, but the code below reads
    # `adept_vocab.model` from this directory. Confirm which file name is intended.
    parser.add_argument(
        "--input_dir",
        help="Location of Fuyu weights, which contains tokenizer.model and model folders",
    )
    parser.add_argument(
        "--pt_model_path",
        help="Location of Fuyu `model_optim_rng.pt`",
    )
    parser.add_argument(
        "--output_dir",
        help="Location to write HF model and tokenizer",
    )
    parser.add_argument(
        "--ada_lib_path",
        help="Location of original source code from adept to deserialize .pt checkpoint",
    )
    parser.add_argument(
        "--safe_serialization",
        type=_str_to_bool,
        help="Whether or not to save using `safetensors`.",
    )
    args = parser.parse_args()

    # SentencePiece model file used to build the tokenizer.
    spm_path = os.path.join(args.input_dir, "adept_vocab.model")

    convert_fuyu_checkpoint(
        pytorch_dump_folder_path=args.output_dir,
        pt_model_path=args.pt_model_path,
        safe_serialization=args.safe_serialization,
        ada_lib_path=args.ada_lib_path,
    )

    # The same token string is used as both the beginning- and end-of-sequence token.
    tokenizer = tokenizer_class(spm_path, bos_token="|ENDOFTEXT|", eos_token="|ENDOFTEXT|")
    tokenizer.save_pretrained(args.output_dir)


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()

`.\models\fuyu\image_processing_fuyu.py`

# coding=utf-8
# 设置编码格式为 UTF-8，确保可以处理包含非英文字符的文本文件

# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
# 版权声明，标明代码的版权归 HuggingFace Inc. 团队所有，保留所有权利

# Licensed under the Apache License, Version 2.0 (the "License");
# 引入 Apache 许可证 2.0 版本

# you may not use this file except in compliance with the License.
# 在符合许可证规定的情况下，才可以使用本文件

# You may obtain a copy of the License at
# 您可以在以下网址获取许可证的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非法律要求或书面同意，否则依据许可证提供的软件将按"原样"分发，不附带任何形式的保证或条件

# See the License for the specific language governing permissions and
# limitations under the License.
# 请查阅许可证以了解权限和限制

"""Image processor class for Fuyu."""
# 文件描述：Fuyu 的图像处理类

import math
# 导入 math 库，用于数学运算

from typing import Dict, List, Optional, Union
# 导入类型提示所需的模块

import numpy as np
# 导入 NumPy 库，用于处理数组

from ...image_processing_utils import BaseImageProcessor, BatchFeature
# 导入本地的图像处理工具类和批处理特征类

from ...image_transforms import (
    pad,
    resize,
    to_channel_dimension_format,
)
# 从本地图像变换模块中导入 pad, resize, to_channel_dimension_format 函数

from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    is_valid_image,
    make_list_of_images,
    to_numpy_array,
    validate_preprocess_arguments,
)
# 从本地图像工具模块中导入各种图像处理函数和工具函数

from ...utils import (
    TensorType,
    is_torch_available,
    is_torch_device,
    is_torch_dtype,
    logging,
    requires_backends,
)
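# Import general utilities and the PyTorch-related helpers (`is_torch_available`, `is_torch_device`, `is_torch_dtype`) from the local utils module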

if is_torch_available():
    import torch
# Import torch only when PyTorch is available

logger = logging.get_logger(__name__)
# 获取当前模块的日志记录器

def make_list_of_list_of_images(
    images: Union[List[List[ImageInput]], List[ImageInput], ImageInput],
) -> List[List[ImageInput]]:
    """
    Normalize `images` into a list of lists of images.

    - A single valid image is wrapped as `[[image]]`.
    - A list whose entries are all lists is returned unchanged.
    - Any other list has each entry passed through `make_list_of_images`.

    Raises:
        ValueError: If `images` is neither a valid image nor a list.
    """
    # A lone image becomes one outer entry holding one image.
    if is_valid_image(images):
        return [[images]]

    if not isinstance(images, list):
        raise ValueError("images must be a list of list of images or a list of images or an image.")

    # Already nested: hand the input back as is.
    if all(isinstance(entry, list) for entry in images):
        return images

    # Flat list: expand each entry into its own list of images.
    return [make_list_of_images(entry) for entry in images]

class FuyuBatchFeature(BatchFeature):
    """
    BatchFeature class for Fuyu image processor and processor.

    The outputs dictionary from the processors contains a mix of tensors and lists of tensors.
    """

    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
        """
        Convert the inner content to tensors.

        Values may be a single element, a list of elements, or a list of lists of elements; each
        element is converted individually and the container shape is preserved.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                `None`, no modification is done.

        Returns:
            The same instance, with its values converted in place.

        Raises:
            ValueError: If an element cannot be converted to a tensor.
        """
        if tensor_type is None:
            return self

        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type=tensor_type)

        def _convert_tensor(elem):
            # Elements that are already tensors are passed through untouched.
            if is_tensor(elem):
                return elem
            return as_tensor(elem)

        def _safe_convert_tensor(elem, key):
            try:
                return _convert_tensor(elem)
            except Exception as err:
                # Catch `Exception` only, so KeyboardInterrupt/SystemExit still propagate, and
                # chain the original error so the root cause stays visible.
                if key == "overflowing_values":
                    raise ValueError(
                        "Unable to create tensor returning overflowing values of different lengths. "
                    ) from err
                raise ValueError(
                    "Unable to create tensor, you should probably activate padding "
                    "with 'padding=True' to have batched tensors with the same length."
                ) from err

        for key, value in self.items():
            # `value and ...` guards the `value[0]` lookup against an empty list.
            if isinstance(value, list) and value and isinstance(value[0], list):
                # List of lists: convert every inner element.
                self[key] = [[_safe_convert_tensor(elem, key) for elem in elems] for elems in value]
            elif isinstance(value, list):
                # Flat (possibly empty) list: convert every element.
                self[key] = [_safe_convert_tensor(elem, key) for elem in value]
            else:
                # Single element.
                self[key] = _safe_convert_tensor(value, key)
        return self

    def to(self, *args, **kwargs) -> "BatchFeature":
        """
        Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
        different `dtypes` and sending the `BatchFeature` to a different `device`.

        Args:
            args (`Tuple`):
                Will be passed to the `to(...)` function of the tensors.
            kwargs (`Dict`, *optional*):
                Will be passed to the `to(...)` function of the tensors.

        Returns:
            [`BatchFeature`]: The same instance after modification.

        Raises:
            ValueError: If the first positional argument is neither a torch dtype nor something usable as a
                device (a string, a torch device or an integer).
        """
        requires_backends(self, ["torch"])
        import torch  # noqa

        new_data = {}
        device = kwargs.get("device")

        # Without an explicit `device=` keyword, the first positional argument may carry the device.
        if device is None and len(args) > 0:
            arg = args[0]
            if is_torch_dtype(arg):
                # The first argument is a dtype: no device was requested.
                pass
            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
                device = arg
            else:
                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")

        def _to(elem):
            # Only floating-point tensors receive the full `to(*args, **kwargs)` call (which may cast
            # the dtype); other tensors are only moved to the device, so integer tensors keep their dtype.
            if torch.is_floating_point(elem):
                return elem.to(*args, **kwargs)
            if device is not None:
                return elem.to(device=device)

            return elem

        for k, v in self.items():
            # `v and ...` guards the `v[0]` lookup against an empty list.
            if isinstance(v, list) and v and isinstance(v[0], list):
                # List of lists of tensors.
                new_data[k] = [[_to(elem) for elem in elems] for elems in v]
            elif isinstance(v, list):
                # Flat (possibly empty) list of tensors.
                new_data[k] = [_to(elem) for elem in v]
            else:
                new_data[k] = _to(v)
        self.data = new_data
        return self
# 继承自 BaseImageProcessor 类的 FuyuImageProcessor 类，用于处理 FuyuForCausalLM 主体之前的图像处理部分。
class FuyuImageProcessor(BaseImageProcessor):
    """
    This class should handle the image processing part before the main FuyuForCausalLM. In particular, it should
    handle:

    - Processing Images:
        Taking a batch of images as input. If the images are variable-sized, it resizes them based on the desired patch
        dimensions. The image output is always img_h, img_w of (1080, 1920)

        Then, it patches up these images using the patchify_image function.

    - Creating Image Input IDs:
        For each patch, a placeholder ID is given to identify where these patches belong in a token sequence. For
        variable-sized images, each line of patches is terminated with a newline ID.

    - Image Patch Indices:
        For each image patch, the code maintains an index where these patches should be inserted in a token stream.


    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image to `size`.
        size (`Dict[str, int]`, *optional*, defaults to `{"height": 1080, "width": 1920}`):
            Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image to `size`.
        padding_value (`float`, *optional*, defaults to 1.0):
            The value to pad the image with.
        padding_mode (`str`, *optional*, defaults to `"constant"`):
            The padding mode to use when padding the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float`, *optional*, defaults to 0.5):
            The mean to use when normalizing the image.
        image_std (`float`, *optional*, defaults to 0.5):
            The standard deviation to use when normalizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to `1 / 255`):
            The factor to use when rescaling the image.
        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 30, "width": 30}`):
            Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
    """

    # 定义模型输入的名称列表，包括图像、图像输入 ID、图像补丁等信息
    model_input_names = [
        "images",
        "image_input_ids",
        "image_patches",
        "image_patch_indices_per_batch",
        "image_patch_indices_per_subsequence",
    ]
    # 初始化函数，用于设置图像处理器的参数
    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_pad: bool = True,
        padding_value: float = 1.0,
        padding_mode: str = "constant",
        do_normalize: bool = True,
        image_mean: Union[float, List[float]] = 0.5,
        image_std: Union[float, List[float]] = 0.5,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        patch_size: Optional[Dict[str, int]] = None,
        **kwargs,
    ):
        """
        Store the default preprocessing settings on the instance.

        `size` falls back to `{"height": 1080, "width": 1920}` and `patch_size` to
        `{"height": 30, "width": 30}` when they are not given. Every other argument is stored unchanged
        under an attribute of the same name. Extra keyword arguments are forwarded to the parent
        constructor.
        """
        super().__init__(**kwargs)

        # Fill in the dictionary defaults here so the signature never carries a mutable default.
        if size is None:
            size = {"height": 1080, "width": 1920}
        if patch_size is None:
            patch_size = {"height": 30, "width": 30}

        # Resizing
        self.do_resize = do_resize
        self.size = size
        self.resample = resample

        # Padding
        self.do_pad = do_pad
        self.padding_value = padding_value
        self.padding_mode = padding_mode

        # Normalization
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std

        # Rescaling
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor

        # Patching
        self.patch_size = patch_size

        # Names of the keyword arguments treated as valid preprocessing options.
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_pad",
            "padding_value",
            "padding_mode",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_rescale",
            "rescale_factor",
            "patch_size",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]
    def resize_image(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Shrink an image so that it fits inside `(size["height"], size["width"])`, keeping its aspect ratio.

        An image that already fits inside the target box is returned untouched; this method never
        enlarges an image.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` giving the maximum output size.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter forwarded to `resize`.
            data_format (`ChannelDimension` or `str`, *optional*):
                Channel dimension format of the output image, forwarded to `resize`.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                Channel dimension format of the input image, forwarded to `get_image_size` and `resize`.
            **kwargs:
                Extra keyword arguments forwarded to `resize`.

        Returns:
            `np.ndarray`: The input image itself when it already fits, otherwise the resized image.
        """
        current_height, current_width = get_image_size(image, input_data_format)
        max_height = size["height"]
        max_width = size["width"]

        # Nothing to do when the image already fits inside the target box.
        already_fits = current_width <= max_width and current_height <= max_height
        if already_fits:
            return image

        # Use the tighter of the two ratios so both dimensions end up within bounds.
        ratio = min(max_height / current_height, max_width / current_width)
        resized_shape = (int(current_height * ratio), int(current_width * ratio))

        return resize(
            image=image,
            size=resized_shape,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
    ) -> np.ndarray:
        """
        Pad an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to pad.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            data_format (`ChannelDimension` or `str`, *optional*):
                The data format of the output image. If unset, the same format as the input image is used.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # 获取输入图像的高度和宽度
        image_height, image_width = get_image_size(image, input_data_format)
        # 获取目标填充后的高度和宽度
        target_height, target_width = size["height"], size["width"]
        # 计算上、左、下、右填充的像素数
        padding_top = 0
        padding_left = 0
        padding_bottom = target_height - image_height
        padding_right = target_width - image_width
        # 对图像进行填充操作
        padded_image = pad(
            image,
            padding=((padding_top, padding_bottom), (padding_left, padding_right)),
            mode=mode,  # 填充模式，如常量填充、边缘复制等
            constant_values=constant_values,  # 常量填充的值
            data_format=data_format,  # 输出图像的数据格式
            input_data_format=input_data_format,  # 输入图像的通道维度格式
        )
        # 返回填充后的图像
        return padded_image

    def preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        do_pad: Optional[bool] = None,
        padding_value: Optional[float] = None,
        padding_mode: Optional[str] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[float] = None,
        image_std: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        patch_size: Optional[Dict[str, int]] = None,
        data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        return_tensors: Optional[TensorType] = None,
    def get_num_patches(
        self, image_height: int, image_width: int, patch_size: Optional[Dict[str, int]] = None
    ) -> int:
        """
        Calculate number of patches required to encode an image.

        Args:
            image_height (`int`):
                Height of the image.
            image_width (`int`):
                Width of the image.
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.

        Returns:
            `int`: Number of patches along the height multiplied by the number along the width.

        Raises:
            ValueError: If `image_height` or `image_width` is not a multiple of the matching patch dimension.
        """
        patch_size = patch_size if patch_size is not None else self.patch_size
        # Read the dimensions from the resolved `patch_size`, so an explicit argument is honoured
        # instead of being silently replaced by the instance default.
        patch_height, patch_width = patch_size["height"], patch_size["width"]

        if image_height % patch_height != 0:
            raise ValueError(f"{image_height=} must be divisible by {patch_height}")
        if image_width % patch_width != 0:
            raise ValueError(f"{image_width=} must be divisible by {patch_width}")

        num_patches_per_dim_h = image_height // patch_height
        num_patches_per_dim_w = image_width // patch_width
        return num_patches_per_dim_h * num_patches_per_dim_w

    def patchify_image(self, image: "torch.Tensor", patch_size: Optional[Dict[str, int]] = None) -> "torch.Tensor":
        """
        Cut a batch of images into non-overlapping patches and flatten each patch.

        Args:
            image (`torch.Tensor`):
                Image to convert. Shape: [batch, channels, height, width]
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.

        Returns:
            `torch.Tensor`: Tensor of shape `[batch, num_patches, channels * patch_height * patch_width]`.
        """
        requires_backends(self, ["torch"])
        if patch_size is None:
            patch_size = self.patch_size
        patch_h = patch_size["height"]
        patch_w = patch_size["width"]

        num_images, num_channels, _, _ = image.shape

        # Window over height (dim 2) and then width (dim 3); step == window size, so patches do not overlap.
        windows = image.unfold(2, patch_h, patch_h).unfold(3, patch_w, patch_w).contiguous()

        # Merge the two patch-grid axes, move channels to the end, then flatten every patch to one vector.
        per_patch = windows.view(num_images, num_channels, -1, patch_h, patch_w).permute(0, 2, 3, 4, 1)
        return per_patch.reshape(num_images, -1, num_channels * patch_h * patch_w)

    def preprocess_with_tokenizer_info(
        self,
        image_input: "torch.Tensor",
        image_present: "torch.Tensor",
        image_unpadded_h: "torch.Tensor",
        image_unpadded_w: "torch.Tensor",
        image_placeholder_id: int,
        image_newline_id: int,
        variable_sized: bool,
        patch_size: Optional[Dict[str, int]] = None,
    ):
        """
        Preprocess image tensors along with tokenizer information.

        NOTE(review): in this copy of the file the body is only a `pass` placeholder, so the method
        does nothing and returns `None`. The argument descriptions below are kept from the existing
        docstring and cannot be checked against an implementation here.

        Args:
            image_input (`torch.Tensor`):
                Input image tensor.
            image_present (`torch.Tensor`):
                Present image tensor.
            image_unpadded_h (`torch.Tensor`):
                Unpadded image height tensor.
            image_unpadded_w (`torch.Tensor`):
                Unpadded image width tensor.
            image_placeholder_id (`int`):
                Placeholder ID for the image.
            image_newline_id (`int`):
                Newline ID for the image.
            variable_sized (`bool`):
                Whether the image size varies.
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
        """
        # Placeholder: the implementation is not present in this copy of the file.
        pass

`.\models\fuyu\modeling_fuyu.py`

    """
    The bare Fuyu Model outputting raw hidden-states without any specific head on top.

    This model inherits from `PreTrainedModel`. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch `torch.nn.Module` subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.
    """
    # 定义一个字符串，描述 Fuyu 模型及其用途，包括语言建模头部，用于基于图像补丁和文本进行因果语言建模。
    "Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.",
    # 引用 FUYU_START_DOCSTRING，可能是一个预定义的文档字符串或标记。
    FUYU_START_DOCSTRING,
)
class FuyuForCausalLM(FuyuPreTrainedModel):
    # FuyuForCausalLM 类，继承自 FuyuPreTrainedModel 类
    def __init__(self, config: FuyuConfig):
        """Build the text decoder and the linear projection for flattened image patches from `config`."""
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        # The text backbone is created from the nested text configuration.
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)

        # Each flattened patch (patch_size * patch_size * num_channels values) is projected to hidden_size.
        flattened_patch_dim = config.patch_size * config.patch_size * config.num_channels
        self.vision_embed_tokens = nn.Linear(flattened_patch_dim, config.hidden_size)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return whatever the wrapped language model reports as its input embeddings."""
        text_model = self.language_model
        return text_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Forward `value` to the wrapped language model's `set_input_embeddings`."""
        text_model = self.language_model
        text_model.set_input_embeddings(value)

    def gather_continuous_embeddings(
        self,
        word_embeddings: torch.Tensor,
        continuous_embeddings: List[torch.Tensor],
        image_patch_input_indices: torch.Tensor,
    ) -> torch.Tensor:
        """Scatter per-sample continuous embeddings into a copy of the word embeddings.

        For each batch element, every position whose entry in `image_patch_input_indices` is
        non-negative is overwritten with the row of that sample's continuous embeddings selected by
        that entry. Different batch elements can have different numbers of continuous embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings. It is cloned, not modified in place.
            continuous_embeddings (`List[torch.FloatTensor]`):
                One tensor per batch element, indexed along its first dimension by the non-negative
                entries of `image_patch_input_indices` for that element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor; negative entries mark
                positions that keep their word embedding.

        Returns:
            `torch.Tensor`: The cloned embeddings with the selected positions replaced.

        Raises:
            ValueError: If the list length differs from the batch size, or if a sample has more
                non-negative indices than continuous embeddings.
        """
        batch_size = word_embeddings.shape[0]
        if batch_size != len(continuous_embeddings):
            raise ValueError(
                f"Batch sizes must match! Got {len(continuous_embeddings)=} and {word_embeddings.shape[0]=}"
            )

        output_embeddings = word_embeddings.clone()
        for batch_idx in range(batch_size):
            patch_slots = image_patch_input_indices[batch_idx]
            # Sequence positions that should receive an image-patch embedding.
            target_positions = torch.nonzero(patch_slots >= 0, as_tuple=True)[0]
            # Which rows of this sample's continuous embeddings go to those positions.
            src_indices = patch_slots[target_positions]

            sample_embeddings = continuous_embeddings[batch_idx]
            # Only the "more indices than embeddings" case is rejected; having spare embeddings is allowed.
            if src_indices.shape[0] > sample_embeddings.shape[0]:
                raise ValueError(
                    f"Number of continuous embeddings {continuous_embeddings[batch_idx].shape=} does not match "
                    f"number of continuous token ids {src_indices.shape=} in batch element {batch_idx}."
                )

            output_embeddings[batch_idx, target_positions] = sample_embeddings[src_indices]

        return output_embeddings
    # 定义一个方法用于模型的前向传播
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Assemble the dictionary of model inputs for one decoding step.

        NOTE(review): despite its name, this body never runs the model; it only builds and returns
        an input dictionary, which looks like the body of `prepare_inputs_for_generation` -- confirm
        against upstream. `labels`, `output_attentions`, `output_hidden_states` and `return_dict`
        are accepted but not used here.

        Args:
            input_ids (`torch.LongTensor`): Token ids; only the last column is kept when
                `past_key_values` is truthy.
            image_patches (`torch.Tensor`): Image patches, passed through only when
                `past_key_values` is `None`.
            image_patches_indices (`torch.Tensor`): Patch indices, passed through only when
                `past_key_values` is `None`.
            attention_mask (`torch.Tensor`, *optional*): Used to derive `position_ids` when those
                are not given.
            position_ids (`torch.LongTensor`, *optional*): Explicit position ids; used as given.
            past_key_values (*optional*): Cached state from earlier steps.
            inputs_embeds (`torch.FloatTensor`, *optional*): Used instead of `input_ids` only when
                `past_key_values` is `None`.
            use_cache (`bool`, *optional*): Passed through unchanged.

        Returns:
            `dict`: Holds either `"inputs_embeds"` or `"input_ids"`, plus `"position_ids"`,
            `"past_key_values"`, `"use_cache"`, `"attention_mask"`, `"image_patches_indices"` and
            `"image_patches"`.
        """
        # With a cache present, only the newest token needs to be fed.
        if past_key_values:
            input_ids = input_ids[:, -1:]

        # `position_ids` and `use_cache` are named parameters of this signature; the body used to look
        # them up in an undefined `kwargs`, which raised NameError and ignored explicit values.
        if attention_mask is not None and position_ids is None:
            # Create position ids on the fly for batch generation.
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # `inputs_embeds` is only used on the first generation step.
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        # Image inputs are only passed on the first step (no cache yet); afterwards they are None.
        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "image_patches_indices": image_patches_indices if past_key_values is None else None,
                "image_patches": image_patches if past_key_values is None else None,
            }
        )
        return model_inputs

`.\models\fuyu\processing_fuyu.py`

# 指定 Python 源文件的编码格式为 UTF-8
# 版权声明，此代码版权归 HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证 2.0 版本，除非符合许可证的要求，否则不得使用此文件
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则依据许可证分发的软件是基于“原样”提供的，
# 不附带任何明示或暗示的保证或条件
# 请参阅许可证以了解特定语言的权限和限制
"""
GIT 的图像/文本处理器类
"""
import re
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

# 导入通用处理函数
from ...processing_utils import ProcessorMixin
# 导入标记化工具的基类
from ...tokenization_utils_base import PaddingStrategy, TruncationStrategy
# 导入实用函数
from ...utils import TensorType, is_torch_available, logging, requires_backends

# 如果 Torch 可用，则导入相关模块
if is_torch_available():
    from .image_processing_fuyu import FuyuBatchFeature

# 获取日志记录器
logger = logging.get_logger(__name__)

# 如果 Torch 可用，则导入 Torch 模块
if is_torch_available():
    import torch

# Human-readable tags that mark bounding boxes and points inside a prompt string.
TEXT_REPR_BBOX_OPEN = "<box>"
TEXT_REPR_BBOX_CLOSE = "</box>"
TEXT_REPR_POINT_OPEN = "<point>"
TEXT_REPR_POINT_CLOSE = "</point>"

# Token strings that replace the tags above before tokenization, plus the beginning-of-answer marker.
TOKEN_BBOX_OPEN_STRING = "<0x00>"  # <bbox>
TOKEN_BBOX_CLOSE_STRING = "<0x01>"  # </bbox>
TOKEN_POINT_OPEN_STRING = "<0x02>"  # <point>
TOKEN_POINT_CLOSE_STRING = "<0x03>"  # </point>
BEGINNING_OF_ANSWER_STRING = "<0x04>"  # <boa>

def full_unpacked_stream_to_tensor(
    all_bi_tokens_to_place: List[int],
    full_unpacked_stream: List["torch.Tensor"],
    fill_value: int,
    batch_size: int,
    new_seq_len: int,
    offset: int,
) -> "torch.Tensor":
    """Pad an unpacked token stream (one tensor per batch item) into a single dense tensor.

    For batch item `bi`, `all_bi_tokens_to_place[bi]` tokens are copied from its stream, starting at
    `offset`, into the start of row `bi`. The rest of the row keeps `fill_value`. The result has
    shape `batch_size x new_seq_len` and takes its dtype and device from the first stream.
    """
    assert len(all_bi_tokens_to_place) == batch_size
    assert len(full_unpacked_stream) == batch_size

    # The first stream decides dtype and device of the padded output.
    reference = full_unpacked_stream[0]
    padded = torch.full(
        [batch_size, new_seq_len],
        fill_value=fill_value,
        dtype=reference.dtype,
        device=reference.device,
    )

    for row in range(batch_size):
        count = all_bi_tokens_to_place[row]
        window = full_unpacked_stream[row][offset : offset + count]
        padded[row, :count] = window

    return padded

def construct_full_unpacked_stream(
    num_real_text_tokens: Union[List[List[int]], "torch.Tensor"],
    input_stream: "torch.Tensor",
    image_tokens: List[List["torch.Tensor"]],
    batch_size: int,
    num_sub_sequences: int,
) -> List["torch.Tensor"]:
    """Prefix each batch item's first text subsequence with its image tokens and drop the padding.

    Only subsequence 0 of every batch item is used. The image tokens are concatenated in front of
    the text tokens, and the result is cut to the image-token count plus
    `num_real_text_tokens[batch_index][0]`. `num_sub_sequences` is accepted but not used.

    Returns:
        A list with one 1-D stream per batch item; lengths may differ between items.
    """
    streams_per_batch = []

    for batch_index in range(batch_size):
        # TODO Remove this logic in a subsequent release since subsequences are not supported.
        image_part = image_tokens[batch_index][0]
        text_part = input_stream[batch_index, 0]
        combined = torch.cat([image_part, text_part], dim=0)

        # Keep the image tokens plus only the real (non-padding) text tokens.
        keep = image_part.shape[0] + num_real_text_tokens[batch_index][0]
        per_subsequence = [combined[:keep]]

        streams_per_batch.append(torch.cat(per_subsequence, dim=0))

    return streams_per_batch
def _replace_string_repr_with_token_tags(prompt: str) -> str:
    """Swap the textual point / box tags in `prompt` for their token strings."""
    substitutions = (
        (TEXT_REPR_POINT_OPEN, TOKEN_POINT_OPEN_STRING),
        (TEXT_REPR_POINT_CLOSE, TOKEN_POINT_CLOSE_STRING),
        (TEXT_REPR_BBOX_OPEN, TOKEN_BBOX_OPEN_STRING),
        (TEXT_REPR_BBOX_CLOSE, TOKEN_BBOX_CLOSE_STRING),
    )
    for text_tag, token_tag in substitutions:
        prompt = prompt.replace(text_tag, token_tag)
    return prompt


def _segment_prompt_into_text_token_conversions(prompt: str) -> List:
    """
    Split a prompt on the box / point token strings into `(text, is_inside_tag)` pairs.

    Empty pieces and the tag strings themselves are dropped. The flag is `True` when the piece
    directly follows an opening box or point token.
    """
    all_tags = [
        TOKEN_BBOX_OPEN_STRING,
        TOKEN_BBOX_CLOSE_STRING,
        TOKEN_POINT_OPEN_STRING,
        TOKEN_POINT_CLOSE_STRING,
    ]
    opening_tags = [TOKEN_BBOX_OPEN_STRING, TOKEN_POINT_OPEN_STRING]

    # The capturing group makes `split` keep the tags, so each piece can look at its predecessor.
    tag_pattern = re.compile(
        f"({TOKEN_BBOX_OPEN_STRING}|{TOKEN_BBOX_CLOSE_STRING}|{TOKEN_POINT_OPEN_STRING}|{TOKEN_POINT_CLOSE_STRING})"
    )
    pieces = tag_pattern.split(prompt)

    segments: List = []
    for position, piece in enumerate(pieces):
        if not piece or piece in all_tags:
            continue
        follows_opening_tag = position > 1 and pieces[position - 1] in opening_tags
        segments.append((piece, follows_opening_tag))
    return segments


def _transform_coordinates_and_tokenize(prompt: str, scale_factor: float, tokenizer) -> List[int]:
    """
    Tokenize a prompt, rescaling any coordinates found inside box / point tags.

    The prompt is processed as follows:
    - <box> <point> and </box> </point> are replaced by their token strings
    - text between an opening and closing tag is parsed as coordinates and scaled by `scale_factor`
    - all remaining text is tokenized normally, without special tokens

    Bounding boxes and points MUST be in the following format: <box>y1, x1, y2, x2</box> <point>x, y</point> The spaces
    and punctuation added above are NOT optional.
    """
    tagged_prompt = _replace_string_repr_with_token_tags(prompt)

    token_ids: List[int] = []
    for segment, inside_tag in _segment_prompt_into_text_token_conversions(tagged_prompt):
        if inside_tag:
            # Coordinates: scale them and map them (plus the surrounding tags) to vocabulary ids.
            token_ids.extend(_transform_within_tags(segment, scale_factor, tokenizer))
            continue
        # Plain text: regular tokenization.
        token_ids.extend(tokenizer(segment, add_special_tokens=False).input_ids)
    return token_ids
def _transform_within_tags(text: str, scale_factor: float, tokenizer) -> List[int]:
    """
    Turn the comma-separated coordinates found between tags into vocabulary ids.

    Given the inside of <box>1, 2, 3, 4</box> | <point>1, 2</point>, the numbers are scaled by
    `scale_factor`, each one is looked up in `tokenizer.vocab`, and the result is wrapped in the
    matching opening and closing tag ids. Two numbers are treated as a point; anything else gets the
    box tags, and a count other than 2 or 4 raises `ValueError`.
    """
    pieces = text.split(",")

    # Two values mean a point; any other count uses the bounding-box tags.
    is_point = len(pieces) == 2
    open_key = TOKEN_POINT_OPEN_STRING if is_point else TOKEN_BBOX_OPEN_STRING
    close_key = TOKEN_POINT_CLOSE_STRING if is_point else TOKEN_BBOX_CLOSE_STRING
    open_token = tokenizer.vocab[open_key]
    close_token = tokenizer.vocab[close_key]

    # Strip whitespace around each number and parse it.
    num_ints = [float(piece.strip()) for piece in pieces]

    if len(num_ints) == 2:
        scaled = scale_point_to_transformed_image(x=num_ints[0], y=num_ints[1], scale_factor=scale_factor)
    elif len(num_ints) == 4:
        scaled = scale_bbox_to_transformed_image(
            top=num_ints[0],
            left=num_ints[1],
            bottom=num_ints[2],
            right=num_ints[3],
            scale_factor=scale_factor,
        )
    else:
        raise ValueError(f"Invalid number of ints: {len(num_ints)}")

    # Every scaled coordinate must exist in the vocabulary under its decimal string form.
    coordinate_tokens = [tokenizer.vocab[str(value)] for value in scaled]
    return [open_token, *coordinate_tokens, close_token]


def _tokenize_prompts_with_image_and_batch(
    tokenizer,
    prompts: List[List[str]],
    scale_factors: Optional[List[List["torch.Tensor"]]],
    max_tokens_to_generate: int,
    max_position_embeddings: int,
    add_BOS: bool,
    add_beginning_of_answer_token: bool,
) -> Tuple["torch.Tensor", "torch.Tensor"]:
    """
    Given a set of prompts and number of tokens to generate:
    - tokenize prompts
    - set the sequence length to be the max of length of prompts plus the number of tokens we would like to generate
    - pad all the sequences to this length so we can convert them into a 3D tensor.
    """

    # 如果存在缩放因子，同时转换坐标并进行标记化
    if scale_factors is not None:
        transformed_prompt_tokens = []
        for prompt_seq, scale_factor_seq in zip(prompts, scale_factors):
            transformed_prompt_tokens.append(
                [
                    _transform_coordinates_and_tokenize(prompt, scale_factor.item(), tokenizer)
                    for prompt, scale_factor in zip(prompt_seq, scale_factor_seq)
                ]
            )
    else:
        # 否则，仅对提示进行标记化
        transformed_prompt_tokens = [[tokenizer.tokenize(prompt) for prompt in prompt_seq] for prompt_seq in prompts]

    prompts_tokens = transformed_prompt_tokens

    if add_BOS:
        # 如果需要添加起始标记，获取起始标记的词汇表索引
        bos_token = tokenizer.vocab["<s>"]
    # 如果不需要在答案开头添加特定的开始标记，则使用文本生成器的结束标记作为开始标记
    else:
        bos_token = tokenizer.vocab["|ENDOFTEXT|"]

    # 将每个提示序列的每个子序列的开头加上开始标记，并形成三重嵌套列表
    prompts_tokens = [[[bos_token] + x for x in prompt_seq] for prompt_seq in prompts_tokens]

    # 如果需要在答案开头添加开始标记，则将其加入到每个提示序列的最后一个子序列中
    if add_beginning_of_answer_token:
        boa = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING]
        # 只将开始答案标记添加到最后一个子序列中，因为这是将要生成的部分
        for token_seq in prompts_tokens:
            token_seq[-1].append(boa)

    # 现在我们有了一个嵌套列表的列表，每个子列表代表不同长度的序列
    # 我们希望扩展这些列表以：
    #   - 包含需要生成的标记
    #   - 使所有序列长度相等
    # 获取提示序列的长度
    prompts_length = [[len(x) for x in prompts_tokens_seq] for prompts_tokens_seq in prompts_tokens]

    # 获取最大的提示序列长度
    max_prompt_len: int = np.max(prompts_length)

    # 每个样本的长度，为最大提示长度加上最大可生成的标记数，但不超过最大位置嵌入长度
    samples_length = min(max_prompt_len + max_tokens_to_generate, max_position_embeddings)

    # 如果提示长度加上最大可生成的标记数超过了最大位置嵌入长度，则发出警告并生成尽可能多的标记
    if max_prompt_len + max_tokens_to_generate > max_position_embeddings:
        logger.warning(
            f"Max subsequence prompt length of {max_prompt_len} + max tokens to generate {max_tokens_to_generate}",
            f"exceeds context length of {max_position_embeddings}. Will generate as many tokens as possible.",
        )

    # 现在更新嵌套列表，使其所有子列表长度相等：samples_length
    for prompt_tokens_seq, prompts_length_seq in zip(prompts_tokens, prompts_length):
        for prompt_tokens, prompt_length in zip(prompt_tokens_seq, prompts_length_seq):
            if len(prompt_tokens) > samples_length:
                raise ValueError("Length of subsequence prompt exceeds sequence length.")
            padding_size = samples_length - prompt_length
            # 添加结束文本标记来填充使子序列长度达到 samples_length
            prompt_tokens.extend([tokenizer.vocab["|ENDOFTEXT|"]] * padding_size)

    # 现在我们有了结构化的格式，可以将其转换为张量
    prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.int64)
    prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.int64)

    # 返回处理后的张量
    return prompts_tokens_tensor, prompts_length_tensor
# Scale coordinates by `scale_h` (height axis, judging by the name) and round to int32.
def original_to_transformed_h_coords(original_coords, scale_h):
    """Multiply `original_coords` by `scale_h`, round with `np.round`, and cast to `np.int32`."""
    scaled = original_coords * scale_h
    rounded = np.round(scaled)
    return rounded.astype(np.int32)

def original_to_transformed_w_coords(original_coords, scale_w):
    """
    Scale "w" coordinates of the original image into the transformed image.

    Args:
        original_coords: Coordinates in the original image. The product with `scale_w` must expose
            `.astype` once rounded (callers in this file pass a NumPy array).
        scale_w: Multiplicative factor applied to the coordinates (presumably the width scale — confirm
            against the caller).

    Returns:
        The scaled coordinates, rounded with `np.round` and cast to `np.int32`.
    """
    scaled_coords = original_coords * scale_w
    return np.round(scaled_coords).astype(np.int32)

def scale_point_to_transformed_image(x: float, y: float, scale_factor: float) -> List[int]:
    """
    Scale a single point into the coordinate system of the transformed image.

    Both coordinates are halved before `scale_factor` is applied.

    Args:
        x: Coordinate routed through `original_to_transformed_w_coords`.
        y: Coordinate routed through `original_to_transformed_h_coords`.
        scale_factor: Factor handed to both conversion helpers.

    Returns:
        `[x_scaled, y_scaled]`, each taken from the first element of the helper's result.
    """
    halved_x = np.array([x / 2])
    halved_y = np.array([y / 2])
    scaled_x = original_to_transformed_w_coords(halved_x, scale_factor)
    scaled_y = original_to_transformed_h_coords(halved_y, scale_factor)
    return [scaled_x[0], scaled_y[0]]

def scale_bbox_to_transformed_image(
    top: float, left: float, bottom: float, right: float, scale_factor: float
) -> List[int]:
    """
    Scale the four edges of a bounding box into the coordinate system of the transformed image.

    Every edge is halved before `scale_factor` is applied.

    Args:
        top: Top edge of the box.
        left: Left edge of the box.
        bottom: Bottom edge of the box.
        right: Right edge of the box.
        scale_factor: Factor handed to the conversion helpers.

    Returns:
        `[top_scaled, left_scaled, bottom_scaled, right_scaled]`, each taken from the first element of the
        helper's result.
    """
    # NOTE(review): top/bottom go through the "w" helper and left/right through the "h" helper. The pairing
    # is kept exactly as it was; confirm whether it is intentional.
    edge_conversions = (
        (top, original_to_transformed_w_coords),
        (left, original_to_transformed_h_coords),
        (bottom, original_to_transformed_w_coords),
        (right, original_to_transformed_h_coords),
    )
    return [convert(np.array([edge / 2]), scale_factor)[0] for edge, convert in edge_conversions]

class FuyuProcessor(ProcessorMixin):
    r"""
    Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor.

    [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. See the
    [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information.

    Args:
        image_processor ([`FuyuImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`]):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "FuyuImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
        """
        Store the wrapped components and the fixed defaults used by this processor.

        Args:
            image_processor: Image processor, also forwarded to the base-class initializer.
            tokenizer: Tokenizer, also forwarded to the base-class initializer.
        """
        super().__init__(image_processor=image_processor, tokenizer=tokenizer)

        # Wrapped components.
        self.tokenizer = tokenizer
        self.image_processor = image_processor

        # Fill values used when padding.
        self.dummy_image_index = -1
        self.pad_token_id = 0

        # Generation limits.
        # TODO Can't derive max_position_embeddings from model files: where to set it?
        self.max_position_embeddings = 16384
        self.max_tokens_to_generate = 10
    # Left-pad the input sequences and their attention masks to a common length
    def _left_pad_inputs_with_attention_mask(self, model_inputs: List[Dict], return_attention_mask: bool):
        """
        Left-pad every entry of `model_inputs` to a common length and batch the results.

        Args:
            model_inputs: One dict per sample. Each dict is walked key by key: `"input_ids"` and
                `"image_patches"` are handled specifically, and every other key is treated as
                `"image_patches_indices"`. Padded tensors are indexed with `shape[0]`/`shape[1]` and joined
                along dim 1, so they are assumed to be 2-D — TODO confirm against the caller.
            return_attention_mask: Whether the collected attention masks are concatenated into one tensor.

        Returns:
            A dict with the keys `"input_ids"`, `"image_patches"`, `"image_patches_indices"` and
            `"attention_mask"`. `"input_ids"` and `"image_patches_indices"` are tensors concatenated along
            dim 0. `"image_patches"` stays a list of the untouched inputs. `"attention_mask"` is a
            concatenated tensor when `return_attention_mask` is true and a list of per-sample masks otherwise.
        """
        widest_input_ids = max(sample["input_ids"].shape[1] for sample in model_inputs)
        widest_patch_indices = max(sample["image_patches_indices"].shape[1] for sample in model_inputs)

        def left_pad(values, target_width, fill_value):
            # Prepend `fill_value` columns so that `values` reaches `target_width` along dim 1.
            missing = target_width - values.shape[1]
            filler = torch.full((values.shape[0], missing), fill_value, dtype=torch.long)
            return torch.cat([filler, values], dim=1)

        collected = {"input_ids": [], "image_patches": [], "image_patches_indices": [], "attention_mask": []}

        for sample in model_inputs:
            for name, values in sample.items():
                if name == "image_patches":
                    # Image patches are collected as-is, without padding.
                    collected[name].append(values)
                    continue

                if name != "input_ids":
                    # Any remaining key is handled as image patch indices, padded with the dummy index.
                    collected[name].append(left_pad(values, widest_patch_indices, self.dummy_image_index))
                    continue

                collected[name].append(left_pad(values, widest_input_ids, self.pad_token_id))
                # Mask of the same padded width: 0 over the padding columns, 1 over the original tokens.
                collected["attention_mask"].append(left_pad(torch.ones_like(values), widest_input_ids, 0))

        # Only these keys are merged into a single tensor along the batch dimension.
        keys_to_concatenate = ["input_ids", "image_patches_indices"]
        if return_attention_mask:
            keys_to_concatenate.append("attention_mask")
        for name in keys_to_concatenate:
            collected[name] = torch.cat(collected[name], dim=0)

        return collected
        ):
        # 创建一个包含单个值为1的张量，用于表示图像是否存在的标志
        image_present = torch.ones(1, 1, 1)
        # 使用图像处理器预处理图像数据，并结合标记信息进行预处理
        model_image_input = self.image_processor.preprocess_with_tokenizer_info(
            image_input=tensor_batch_images,
            image_present=image_present,
            image_unpadded_h=image_unpadded_heights,
            image_unpadded_w=image_unpadded_widths,
            image_placeholder_id=image_placeholder_id,
            image_newline_id=image_newline_id,
            variable_sized=True,
        )
        # FIXME max_tokens_to_generate 被嵌入到此处理器的调用中。FIXME 是用来指示待修复的问题或改进的注释。
        # 使用给定的tokenizer对提示语进行标记化处理，包括图像和批处理信息
        prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
            tokenizer=self.tokenizer,
            prompts=prompts,
            scale_factors=scale_factors,
            max_tokens_to_generate=self.max_tokens_to_generate,
            max_position_embeddings=self.max_position_embeddings,
            add_BOS=True,
            add_beginning_of_answer_token=True,
        )
        # 构建完整的解包流，包括图像输入的标记化和文本提示标记
        image_padded_unpacked_tokens = construct_full_unpacked_stream(
            num_real_text_tokens=prompts_length,
            input_stream=prompt_tokens,
            image_tokens=model_image_input["image_input_ids"],
            batch_size=1,
            num_sub_sequences=self.subsequence_length,
        )
        # 构建图像补丁索引的输入
        unpacked_image_patch_indices_per_batch = construct_full_unpacked_stream(
            num_real_text_tokens=prompts_length,
            input_stream=torch.full_like(prompt_tokens, -1),
            image_tokens=model_image_input["image_patch_indices_per_batch"],
            batch_size=1,
            num_sub_sequences=self.subsequence_length,
        )
        # 计算最长提示长度
        max_prompt_length = max(x.shape[-1] for x in image_padded_unpacked_tokens)
        # 计算批处理中的最大序列长度
        max_seq_len_batch = min(max_prompt_length + self.max_tokens_to_generate, self.max_position_embeddings)
        # 确定要放置的标记数量
        tokens
    def __call__(
        self,
        text=None,
        images=None,
        add_special_tokens: bool = True,
        return_attention_mask: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_token_type_ids: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ):
        """
        Forward every argument, unchanged, to the `__call__` of the wrapped tokenizer and return its result.

        Please refer to the documentation of the tokenizer's `PreTrainedTokenizer.__call__` for the meaning of
        the individual arguments.
        """
        # Named arguments are gathered first; extra keyword arguments are appended untouched.
        forwarded_arguments = {
            "text": text,
            "images": images,
            "add_special_tokens": add_special_tokens,
            "return_attention_mask": return_attention_mask,
            "padding": padding,
            "truncation": truncation,
            "max_length": max_length,
            "stride": stride,
            "pad_to_multiple_of": pad_to_multiple_of,
            "return_overflowing_tokens": return_overflowing_tokens,
            "return_special_tokens_mask": return_special_tokens_mask,
            "return_offsets_mapping": return_offsets_mapping,
            "return_token_type_ids": return_token_type_ids,
            "return_length": return_length,
            "verbose": verbose,
            "return_tensors": return_tensors,
        }
        return self.tokenizer.__call__(**forwarded_arguments, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
        Forward all positional and keyword arguments to the wrapped tokenizer's `batch_decode` and return
        its result. Please refer to the docstring of `PreTrainedTokenizer.batch_decode` for more information.
        """
        tokenizer_batch_decode = self.tokenizer.batch_decode
        return tokenizer_batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Forward all positional and keyword arguments to the wrapped tokenizer's `decode` and return its
        result. Please refer to the docstring of `PreTrainedTokenizer.decode` for more information.
        """
        tokenizer_decode = self.tokenizer.decode
        return tokenizer_decode(*args, **kwargs)

`.\models\fuyu\__init__.py`

# 版权声明和版权许可声明，指明版权归属和使用许可
# Copyright 2023 AdeptAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 引入类型检查模块中的TYPE_CHECKING类型
from typing import TYPE_CHECKING

# 引入相关的自定义工具函数和类
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# Import structure handed to `_LazyModule` below: maps each submodule name to the public names it provides.
# The configuration entry is always registered.
_import_structure = {
    "configuration_fuyu": ["FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP", "FuyuConfig"],
}

# Register the image processor and the processor only when `is_vision_available()` is true.
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Vision support is available: expose the image-processing and processing submodules.
    _import_structure["image_processing_fuyu"] = ["FuyuImageProcessor"]
    _import_structure["processing_fuyu"] = ["FuyuProcessor"]

# Register the modeling classes only when `is_torch_available()` is true.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Torch is available: expose the modeling submodule.
    _import_structure["modeling_fuyu"] = [
        "FuyuForCausalLM",
        "FuyuPreTrainedModel",
    ]

# Under static type checking, perform the real imports so that the names resolve.
if TYPE_CHECKING:
    # The configuration names are imported unconditionally.
    from .configuration_fuyu import FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP, FuyuConfig

    # Same vision guard as above; the imports are skipped when vision support is missing.
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Vision support is available: import the image processor and the processor.
        from .image_processing_fuyu import FuyuImageProcessor
        from .processing_fuyu import FuyuProcessor

    # Same torch guard as above; the imports are skipped when torch is missing.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Torch is available: import the modeling classes.
        from .modeling_fuyu import (
            FuyuForCausalLM,
            FuyuPreTrainedModel,
        )

# At runtime (not type checking), swap this module for a lazy proxy.
else:
    import sys

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\gemma\configuration_gemma.py`

# coding=utf-8
# 声明版权和许可信息

# 导入所需的模块和函数
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Archive map for pretrained Gemma configurations; initialized empty.
GEMMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}

class GemmaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma-7B.

    e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every model hyper-parameter accepted by `__init__` is stored as an attribute of the same name.
    `pad_token_id`, `bos_token_id`, `eos_token_id`, `tie_word_embeddings` and any extra keyword arguments are
    forwarded to `PretrainedConfig.__init__`.

    ```python
    >>> from transformers import GemmaModel, GemmaConfig

    >>> # Initializing a Gemma gemma-7b style configuration
    >>> configuration = GemmaConfig()

    >>> # Initializing a model from the gemma-7b style configuration
    >>> model = GemmaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # Model type identifier.
    model_type = "gemma"

    # Keys to ignore at inference time.
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=3072,
        intermediate_size=24576,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        head_dim=256,
        hidden_act="gelu_pytorch_tanh",
        hidden_activation=None,
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Store each model hyper-parameter exactly once.
        self.vocab_size = vocab_size  # vocabulary size
        self.max_position_embeddings = max_position_embeddings  # maximum position-embedding length
        self.hidden_size = hidden_size  # hidden size
        self.intermediate_size = intermediate_size  # intermediate layer size
        self.num_hidden_layers = num_hidden_layers  # number of hidden layers
        self.num_attention_heads = num_attention_heads  # number of attention heads
        self.head_dim = head_dim  # attention head dimension
        self.num_key_value_heads = num_key_value_heads  # number of key/value heads
        self.hidden_act = hidden_act  # hidden-layer activation function
        self.hidden_activation = hidden_activation  # hidden-layer activation function (alternate setting)
        self.initializer_range = initializer_range  # initializer range
        self.rms_norm_eps = rms_norm_eps  # epsilon of the RMS normalization
        self.use_cache = use_cache  # whether to use the cache
        self.rope_theta = rope_theta  # RoPE parameter
        self.attention_bias = attention_bias  # attention bias
        self.attention_dropout = attention_dropout  # attention dropout rate

        # Initialize the base class a single time. The hyper-parameters above are already set on the
        # instance, so only the special-token ids, the embedding-tying flag and the extra keyword
        # arguments need to be forwarded.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

Transformers-源码解析-五十一-

Transformers 源码解析（五十一）

.\models\funnel\modeling_tf_funnel.py

.\models\funnel\tokenization_funnel.py

.\models\funnel\tokenization_funnel_fast.py

.\models\funnel\__init__.py

.\models\fuyu\configuration_fuyu.py

.\models\fuyu\convert_fuyu_model_weights_to_hf.py

.\models\fuyu\image_processing_fuyu.py

.\models\fuyu\modeling_fuyu.py

.\models\fuyu\processing_fuyu.py

.\models\fuyu\__init__.py

.\models\gemma\configuration_gemma.py

`.\models\funnel\modeling_tf_funnel.py`

`.\models\funnel\tokenization_funnel.py`

`.\models\funnel\tokenization_funnel_fast.py`

`.\models\funnel\__init__.py`

`.\models\fuyu\configuration_fuyu.py`

`.\models\fuyu\convert_fuyu_model_weights_to_hf.py`

`.\models\fuyu\image_processing_fuyu.py`

`.\models\fuyu\modeling_fuyu.py`

`.\models\fuyu\processing_fuyu.py`

`.\models\fuyu\__init__.py`

`.\models\gemma\configuration_gemma.py`