Transformers 源码解析（九十四）

`.\models\rembert\modeling_tf_rembert.py`

# 设置编码格式为 UTF-8
# 版权声明及许可信息
# 
# 根据 Apache 许可证 2.0 版本使用此文件
# 除非符合许可证的条款，否则不得使用此文件
# 您可以在以下网址获取许可证副本：
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# 除非适用法律要求或书面同意，否则本软件按"原样"分发，
# 不附带任何明示或暗示的担保或条件
# 请参阅许可证了解特定语言下的权限和限制

""" TF 2.0 RemBERT model."""

# 导入必要的库和模块
from __future__ import annotations  # 用于支持类型注释的反向兼容性

import math  # 导入数学库
from typing import Dict, Optional, Tuple, Union  # 导入类型定义

import numpy as np  # 导入 numpy 库
import tensorflow as tf  # 导入 tensorflow 库

# 导入模块中的各类输出定义
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
    TFBaseModelOutputWithPastAndCrossAttentions,
    TFBaseModelOutputWithPoolingAndCrossAttentions,
    TFCausalLMOutputWithCrossAttentions,
    TFMaskedLMOutput,
    TFMultipleChoiceModelOutput,
    TFQuestionAnsweringModelOutput,
    TFSequenceClassifierOutput,
    TFTokenClassifierOutput,
)
# 导入模块中的各类实用函数和损失函数
from ...modeling_tf_utils import (
    TFCausalLanguageModelingLoss,
    TFMaskedLanguageModelingLoss,
    TFModelInputType,
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
    TFQuestionAnsweringLoss,
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
# 导入模块中的各类实用函数
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
# 导入通用的实用函数
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
# 导入 RemBERT 的配置类
from .configuration_rembert import RemBertConfig

# Module-level logger obtained from the project's logging helper.
logger = logging.get_logger(__name__)

# Name of the configuration class, kept as a string for documentation purposes
# (presumably consumed by the docstring decorators imported above; the usage is
# outside this view).
_CONFIG_FOR_DOC = "RemBertConfig"

# Checkpoint identifiers listed for the TF RemBERT model.
TF_REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/rembert",
    # See all RemBERT models at https://huggingface.co/models?filter=rembert
]

# TFRemBertEmbeddings 类定义，用于构建来自单词、位置和标记类型嵌入的嵌入向量
class TFRemBertEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config: RemBertConfig, **kwargs):
        """Store the embedding hyper-parameters and create the LayerNorm/dropout sub-layers.

        Args:
            config: Model configuration. `input_embedding_size`, `max_position_embeddings`,
                `initializer_range`, `layer_norm_eps` and `hidden_dropout_prob` are read here;
                `vocab_size` and `type_vocab_size` are read later in `build`.
            **kwargs: Forwarded to `keras.layers.Layer`.
        """
        super().__init__(**kwargs)

        self.config = config
        self.input_embedding_size = config.input_embedding_size
        self.max_position_embeddings = config.max_position_embeddings
        self.initializer_range = config.initializer_range
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def build(self, input_shape=None):
        """Create the three embedding tables and build the LayerNorm sub-layer.

        Each table is created inside its own name scope. The token-type and position
        tables are both registered under the weight name "embeddings", so the scopes
        are what keep their variable names distinct.
        """
        with tf.name_scope("word_embeddings"):
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.input_embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("token_type_embeddings"):
            self.token_type_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.config.type_vocab_size, self.input_embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("position_embeddings"):
            self.position_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.max_position_embeddings, self.input_embedding_size],
                initializer=get_initializer(self.initializer_range),
            )

        # Sub-layers are built at most once.
        if self.built:
            return
        self.built = True

        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.input_embedding_size])

    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        token_type_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
        past_key_values_length=0,
        training: bool = False,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Args:
            input_ids: Token indices looked up in the word-embedding table. When given,
                they take precedence over `inputs_embeds`.
            position_ids: Indices into the position table. Defaults to a range starting at
                `past_key_values_length` and spanning the second axis of the input.
            token_type_ids: Indices into the token-type table. Defaults to all zeros.
            inputs_embeds: Pre-computed word embeddings, used only when `input_ids` is None.
            past_key_values_length: Offset applied to the default position ids.
            training: Forwarded to the dropout layer.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        # At least one of the two input forms must be provided.
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        # Drop the trailing embedding axis; what remains is the shape of the index tensors.
        input_shape = shape_list(inputs_embeds)[:-1]

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)

        if position_ids is None:
            # Shape (1, seq_len); broadcasts against the other embeddings when summed.
            position_ids = tf.expand_dims(
                tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0
            )

        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
        final_embeddings = inputs_embeds + position_embeds + token_type_embeds
        final_embeddings = self.LayerNorm(inputs=final_embeddings)
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)

        return final_embeddings
# 从 transformers.models.bert.modeling_tf_bert.TFBertSelfAttention 复制代码并修改为使用 RemBert
class TFRemBertSelfAttention(keras.layers.Layer):
    """Multi-head scaled dot-product attention (self- or cross-attention)."""

    def __init__(self, config: RemBertConfig, **kwargs):
        """Create the query/key/value projections and the attention dropout.

        Raises:
            ValueError: If `config.hidden_size` is not a multiple of
                `config.num_attention_heads`.
        """
        super().__init__(**kwargs)

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
                f"of attention heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

        self.is_decoder = config.is_decoder
        self.config = config

    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        """Split the last axis into heads and move the head axis in front of the sequence axis."""
        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))

        # Transpose from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(tensor, perm=[0, 2, 1, 3])

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor,
        encoder_attention_mask: tf.Tensor,
        past_key_value: Tuple[tf.Tensor],
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """Compute attention over `hidden_states` (or over `encoder_hidden_states` when given).

        Args:
            hidden_states: Input from which the queries are projected.
            attention_mask: Additive mask applied to the raw attention scores; may be None.
            head_mask: Multiplicative mask applied to the attention probabilities; may be None.
            encoder_hidden_states: When not None the layer acts as cross-attention and keys/values
                are projected from this tensor instead of `hidden_states`.
            encoder_attention_mask: Mask used in place of `attention_mask` for cross-attention.
            past_key_value: Cached `(key, value)` pair from earlier steps, or None.
            output_attentions: Whether to also return the attention probabilities.
            training: Forwarded to the dropout layer.

        Returns:
            A tuple whose first element is the attention output, followed by the attention
            probabilities when `output_attentions` is set, followed by the `(key, value)`
            cache when the layer is configured as a decoder.
        """
        batch_size = shape_list(hidden_states)[0]
        mixed_query_layer = self.query(inputs=hidden_states)

        # Keys and values come from the encoder when this is used as cross-attention;
        # in that case the encoder's mask replaces the self-attention mask.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # Cross-attention keys/values do not change between steps: reuse the cache.
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
            value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            # Self-attention with a cache: append the new keys/values along the sequence axis.
            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
            key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
            value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
        else:
            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)

        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)

        if self.is_decoder:
            # Returned as the last output so the caller can feed it back on the next step.
            past_key_value = (key_layer, value_layer)

        # Raw scores: [batch_size, num_attention_heads, query_length, key_length]
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
        attention_scores = tf.divide(attention_scores, dk)

        if attention_mask is not None:
            attention_scores = tf.add(attention_scores, attention_mask)

        attention_probs = stable_softmax(logits=attention_scores, axis=-1)

        # Dropout on the probabilities drops whole tokens to attend to.
        attention_probs = self.dropout(inputs=attention_probs, training=training)

        if head_mask is not None:
            attention_probs = tf.multiply(attention_probs, head_mask)

        attention_output = tf.matmul(attention_probs, value_layer)
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])

        # Merge the heads back: [batch_size, query_length, all_head_size]
        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs

    def build(self, input_shape=None):
        """Build the query/key/value projections once, each under its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.hidden_size])
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.config.hidden_size])
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RemBert
class TFRemBertSelfOutput(keras.layers.Layer):
    """Project the attention output, apply dropout, then add the residual and normalize."""

    def __init__(self, config: RemBertConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        # Linear projection back to `hidden_size`.
        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        # Normalization applied after the residual addition.
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dense(inputs=hidden_states)
        projected = self.dropout(inputs=projected, training=training)
        return self.LayerNorm(inputs=projected + input_tensor)

    def build(self, input_shape=None):
        """Build the dense and LayerNorm sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # Both sub-layers take inputs whose last axis is `hidden_size`.
        for attr_name in ("dense", "LayerNorm"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->RemBert
class TFRemBertAttention(keras.layers.Layer):
    """Attention sub-block: the attention layer followed by its residual output layer."""

    def __init__(self, config: RemBertConfig, **kwargs):
        super().__init__(**kwargs)

        # The Keras names ("self", "output") differ from the attribute names.
        self.self_attention = TFRemBertSelfAttention(config, name="self")
        self.dense_output = TFRemBertSelfOutput(config, name="output")

    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError

    def call(
        self,
        input_tensor: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor,
        encoder_attention_mask: tf.Tensor,
        past_key_value: Tuple[tf.Tensor],
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """Run attention on `input_tensor` and pass the result through the output layer.

        Returns:
            A tuple whose first element is the processed attention output; any further
            elements produced by the attention layer are appended unchanged.
        """
        attn_results = self.self_attention(
            hidden_states=input_tensor,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            training=training,
        )
        # `input_tensor` doubles as the residual added inside the output layer.
        projected = self.dense_output(hidden_states=attn_results[0], input_tensor=input_tensor, training=training)

        # Everything after the first element is forwarded untouched.
        return (projected,) + attn_results[1:]

    def build(self, input_shape=None):
        """Build both sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        for attr_name in ("self_attention", "dense_output"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is not None:
                with tf.name_scope(sublayer.name):
                    sublayer.build(None)
# 从 transformers.models.bert.modeling_tf_bert.TFBertIntermediate 复制并将 Bert 替换为 RemBert
class TFRemBertIntermediate(keras.layers.Layer):
    """First half of the feed-forward block: dense projection to `intermediate_size` plus activation."""

    def __init__(self, config: RemBertConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        self.dense = keras.layers.Dense(
            units=config.intermediate_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )

        # `hidden_act` is either the name of an activation or the callable itself.
        activation = config.hidden_act
        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Apply the dense projection followed by the configured activation."""
        return self.intermediate_act_fn(self.dense(inputs=hidden_states))

    def build(self, input_shape=None):
        """Build the dense sub-layer once, under its own name scope."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        # The projection consumes tensors whose last axis is `hidden_size`.
        with tf.name_scope(dense.name):
            dense.build([None, None, self.config.hidden_size])


# 从 transformers.models.bert.modeling_tf_bert.TFBertOutput 复制并将 Bert 替换为 RemBert
class TFRemBertOutput(keras.layers.Layer):
    """Second half of the feed-forward block: project back to `hidden_size`, add residual, normalize."""

    def __init__(self, config: RemBertConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="dense",
        )
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`."""
        projected = self.dense(inputs=hidden_states)
        projected = self.dropout(inputs=projected, training=training)
        return self.LayerNorm(inputs=projected + input_tensor)

    def build(self, input_shape=None):
        """Build the dense and LayerNorm sub-layers once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        # The dense layer consumes `intermediate_size` features; LayerNorm sees `hidden_size`.
        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.intermediate_size])

        layer_norm = getattr(self, "LayerNorm", None)
        if layer_norm is not None:
            with tf.name_scope(layer_norm.name):
                layer_norm.build([None, None, self.config.hidden_size])


# 从 transformers.models.bert.modeling_tf_bert.TFBertLayer 复制并将 Bert 替换为 RemBert
class TFRemBertLayer(keras.layers.Layer):
    """One transformer block: self-attention, optional cross-attention, then feed-forward."""

    def __init__(self, config: RemBertConfig, **kwargs):
        """Create the attention and feed-forward sub-layers.

        Raises:
            ValueError: If `config.add_cross_attention` is set while `config.is_decoder` is not.
        """
        super().__init__(**kwargs)

        self.attention = TFRemBertAttention(config, name="attention")
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            # The cross-attention sub-layer exists only when explicitly requested.
            self.crossattention = TFRemBertAttention(config, name="crossattention")
        self.intermediate = TFRemBertIntermediate(config, name="intermediate")
        self.bert_output = TFRemBertOutput(config, name="output")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor | None,
        encoder_attention_mask: tf.Tensor | None,
        past_key_value: Tuple[tf.Tensor] | None,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """Run the block on `hidden_states`.

        Args:
            hidden_states: Input to the block.
            attention_mask: Mask forwarded to the attention sub-layers.
            head_mask: Head mask forwarded to the attention sub-layers.
            encoder_hidden_states: Encoder output; when given on a decoder layer, cross-attention runs.
            encoder_attention_mask: Mask for the encoder positions, forwarded to cross-attention.
            past_key_value: Cached keys/values. The first two entries belong to self-attention,
                the last two to cross-attention. May be None.
            output_attentions: Forwarded to the attention sub-layers.
            training: Forwarded to sub-layers that use dropout.

        Returns:
            A tuple starting with the layer output, followed by whatever extra outputs the
            attention sub-layers produced, and, for decoder layers, ending with the
            present key/value cache.

        Raises:
            ValueError: If `encoder_hidden_states` is passed to a decoder layer that was
                created without cross-attention.
        """
        # The decoder's self-attention cache occupies positions 1 and 2 of the cache tuple.
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            input_tensor=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            past_key_value=self_attn_past_key_value,
            output_attentions=output_attentions,
            training=training,
        )
        attention_output = self_attention_outputs[0]

        if self.is_decoder:
            # For a decoder the last element is the self-attention cache, not an attention map.
            outputs = self_attention_outputs[1:-1]
            present_key_value = self_attention_outputs[-1]
        else:
            # Keeps the self-attention extras, if any were produced.
            outputs = self_attention_outputs[1:]

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # The cross-attention cache occupies positions 3 and 4 of the cache tuple.
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            cross_attention_outputs = self.crossattention(
                input_tensor=attention_output,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
                training=training,
            )
            attention_output = cross_attention_outputs[0]
            # Append the cross-attention extras, excluding its trailing cache entry.
            outputs = outputs + cross_attention_outputs[1:-1]

            # Extend the present cache with the cross-attention keys/values (positions 3 and 4).
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        intermediate_output = self.intermediate(hidden_states=attention_output)
        # `attention_output` is the residual added inside the output sub-layer.
        layer_output = self.bert_output(
            hidden_states=intermediate_output, input_tensor=attention_output, training=training
        )
        outputs = (layer_output,) + outputs

        # Decoder layers return their key/value cache as the final element.
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def build(self, input_shape=None):
        """Build every sub-layer once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)

        if getattr(self, "intermediate", None) is not None:
            with tf.name_scope(self.intermediate.name):
                self.intermediate.build(None)

        if getattr(self, "bert_output", None) is not None:
            with tf.name_scope(self.bert_output.name):
                self.bert_output.build(None)

        # Present only when the layer was created with cross-attention.
        if getattr(self, "crossattention", None) is not None:
            with tf.name_scope(self.crossattention.name):
                self.crossattention.build(None)
class TFRemBertEncoder(keras.layers.Layer):
    """Stack of `TFRemBertLayer` blocks preceded by a projection of the input embeddings."""

    def __init__(self, config: RemBertConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        # Maps the incoming embeddings to `hidden_size` before the first block.
        self.embedding_hidden_mapping_in = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="embedding_hidden_mapping_in",
        )
        # One block per hidden layer, named "layer_._<index>".
        self.layer = [TFRemBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor,
        encoder_attention_mask: tf.Tensor,
        past_key_values: Tuple[Tuple[tf.Tensor]],
        use_cache: bool,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
        """Project the input and run it through every block in order.

        Returns:
            When `return_dict` is falsy, a tuple of the non-None values among the last hidden
            state, all hidden states, all attentions and all cross-attentions. Otherwise a
            `TFBaseModelOutputWithPastAndCrossAttentions` that additionally carries the cache.
        """
        hidden_states = self.embedding_hidden_mapping_in(inputs=hidden_states)

        # Each accumulator stays None unless the corresponding output was requested.
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        next_decoder_cache = () if use_cache else None

        for index, block in enumerate(self.layer):
            # Record the state that enters this block.
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_past = None if past_key_values is None else past_key_values[index]

            block_outputs = block(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask[index],
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=layer_past,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = block_outputs[0]

            # The cache, when requested, is taken from the last element of the block output.
            if use_cache:
                next_decoder_cache = next_decoder_cache + (block_outputs[-1],)

            if output_attentions:
                all_attentions = all_attentions + (block_outputs[1],)
                if self.config.add_cross_attention and encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (block_outputs[2],)

        # Record the state leaving the final block.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_dict:
            return TFBaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=next_decoder_cache,
                hidden_states=all_hidden_states,
                attentions=all_attentions,
                cross_attentions=all_cross_attentions,
            )

        candidates = [hidden_states, all_hidden_states, all_attentions, all_cross_attentions]
        return tuple(item for item in candidates if item is not None)

    def build(self, input_shape=None):
        """Build the input projection and every block once, each under its own name scope."""
        if self.built:
            return
        self.built = True

        mapping = getattr(self, "embedding_hidden_mapping_in", None)
        if mapping is not None:
            # The projection consumes tensors whose last axis is `input_embedding_size`.
            with tf.name_scope(mapping.name):
                mapping.build([None, None, self.config.input_embedding_size])

        blocks = getattr(self, "layer", None)
        if blocks is not None:
            for block in blocks:
                with tf.name_scope(block.name):
                    block.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->RemBert
class TFRemBertPooler(keras.layers.Layer):
    """Pool a sequence by passing its first position through a tanh-activated dense layer."""

    def __init__(self, config: RemBertConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="tanh",
            name="dense",
        )

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Return the dense projection of the slice at index 0 of the second axis."""
        return self.dense(inputs=hidden_states[:, 0])

    def build(self, input_shape=None):
        """Build the dense sub-layer once, under its own name scope."""
        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is None:
            return
        with tf.name_scope(dense.name):
            dense.build([None, None, self.config.hidden_size])


class TFRemBertLMPredictionHead(keras.layers.Layer):
    """Language-modeling head: dense + activation + LayerNorm, then projection onto the vocabulary."""

    def __init__(self, config: RemBertConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.initializer_range = config.initializer_range
        self.output_embedding_size = config.output_embedding_size

        # Projects the incoming states to `output_embedding_size`.
        self.dense = keras.layers.Dense(
            config.output_embedding_size,
            kernel_initializer=get_initializer(self.initializer_range),
            name="dense",
        )

        # `hidden_act` is either the name of an activation or the callable itself.
        activation = config.hidden_act
        self.activation = get_tf_activation(activation) if isinstance(activation, str) else activation

        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")

    def build(self, input_shape=None):
        """Create the decoder weight and bias, then build the dense and LayerNorm sub-layers."""
        # The decoder variables are created before the `built` guard below.
        self.decoder = self.add_weight(
            name="decoder/weight",
            shape=[self.config.vocab_size, self.output_embedding_size],
            initializer=get_initializer(self.initializer_range),
        )
        self.decoder_bias = self.add_weight(
            shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
        )

        if self.built:
            return
        self.built = True

        dense = getattr(self, "dense", None)
        if dense is not None:
            with tf.name_scope(dense.name):
                dense.build([None, None, self.config.hidden_size])

        # LayerNorm runs on the flattened 2-D tensor produced in `call`.
        layer_norm = getattr(self, "LayerNorm", None)
        if layer_norm is not None:
            with tf.name_scope(layer_norm.name):
                layer_norm.build([None, self.config.output_embedding_size])

    def get_output_embeddings(self) -> keras.layers.Layer:
        """Return this layer itself as the holder of the output embeddings."""
        return self

    def set_output_embeddings(self, value):
        """Replace the decoder weight and record its first dimension on it as `vocab_size`."""
        vocab_size = shape_list(value)[0]
        self.decoder = value
        self.decoder.vocab_size = vocab_size

    def get_bias(self) -> Dict[str, tf.Variable]:
        """Return the decoder bias in a dict keyed by "decoder_bias"."""
        return {"decoder_bias": self.decoder_bias}

    def set_bias(self, value: tf.Variable):
        """Replace the decoder bias from `value["decoder_bias"]` and update `config.vocab_size`."""
        new_bias = value["decoder_bias"]
        self.decoder_bias = new_bias
        self.config.vocab_size = shape_list(new_bias)[0]

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        """Compute vocabulary scores for every position of `hidden_states`."""
        transformed = self.activation(self.dense(inputs=hidden_states))

        # Remember the sequence length so the flattened scores can be folded back.
        seq_length = shape_list(tensor=transformed)[1]

        flat = tf.reshape(tensor=transformed, shape=[-1, self.output_embedding_size])
        flat = self.LayerNorm(flat)

        # The decoder is stored as [vocab_size, output_embedding_size], hence the transpose.
        scores = tf.matmul(a=flat, b=self.decoder, transpose_b=True)
        scores = tf.reshape(tensor=scores, shape=[-1, seq_length, self.config.vocab_size])
        return tf.nn.bias_add(value=scores, bias=self.decoder_bias)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->RemBert
class TFRemBertMLMHead(keras.layers.Layer):
    """Thin wrapper exposing the LM prediction head under the name "predictions"."""

    def __init__(self, config: RemBertConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)

        self.predictions = TFRemBertLMPredictionHead(config, input_embeddings, name="predictions")

    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
        """Return the prediction scores computed by the wrapped head."""
        return self.predictions(hidden_states=sequence_output)

    def build(self, input_shape=None):
        """Build the wrapped prediction head once, under its own name scope."""
        if self.built:
            return
        self.built = True

        head = getattr(self, "predictions", None)
        if head is None:
            return
        with tf.name_scope(head.name):
            head.build(None)


@keras_serializable
class TFRemBertMainLayer(keras.layers.Layer):
    """Main RemBERT layer holding the embeddings, the encoder and an optional pooler."""

    config_class = RemBertConfig

    def __init__(self, config: RemBertConfig, add_pooling_layer: bool = True, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        self.is_decoder = config.is_decoder

        # Sub-layers are created in a fixed order with explicit names.
        self.embeddings = TFRemBertEmbeddings(config, name="embeddings")
        self.encoder = TFRemBertEncoder(config, name="encoder")
        if add_pooling_layer:
            self.pooler = TFRemBertPooler(config, name="pooler")
        else:
            self.pooler = None

    def get_input_embeddings(self) -> keras.layers.Layer:
        """Return the embeddings layer."""
        return self.embeddings

    def set_input_embeddings(self, value: tf.Variable):
        """Replace the embedding weight and record its first dimension as the vocabulary size."""
        embeddings = self.embeddings
        embeddings.weight = value
        embeddings.vocab_size = shape_list(value)[0]

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    @unpack_inputs
    # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ):
        # NOTE(review): the forward-pass body is not included in this excerpt; this method is a
        # placeholder and returns None as written.
        pass

    def build(self, input_shape=None):
        """Build each existing sub-layer once, each inside a name scope carrying its own name."""
        if self.built:
            return
        self.built = True

        # Same order as construction: embeddings, encoder, then the pooler (skipped when None).
        for attr_name in ("embeddings", "encoder", "pooler"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
# 添加类 TFRemBertPreTrainedModel，继承自 TFPreTrainedModel，用于处理权重初始化、预训练模型下载和加载的抽象类
class TFRemBertPreTrainedModel(TFPreTrainedModel):
    """
    Abstract base class for the RemBERT TF models: handles weights initialization and provides a simple interface
    for downloading and loading pretrained models.
    """

    # Prefix naming the base model; the model classes in this file store their main layer as `rembert`.
    base_model_prefix = "rembert"
    # Configuration class associated with this model family.
    config_class = RemBertConfig


# Shared class-level documentation text; passed to `add_start_docstrings` on the model classes in this file.
REMBERT_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`RemBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Documentation text for the forward-pass inputs. NOTE(review): the content is empty in this excerpt,
# so the `.format(...)` calls applied to it elsewhere in the file have no placeholders to fill.
REMBERT_INPUTS_DOCSTRING = r"""
"""

# 使用 add_start_docstrings 装饰器为 TFRemBertModel 添加文档字符串
@add_start_docstrings(
    "The bare RemBERT Model transformer outputing raw hidden-states without any specific head on top.",
    REMBERT_START_DOCSTRING,
)
class TFRemBertModel(TFRemBertPreTrainedModel):
    # Bare model: wraps the main layer and forwards every argument to it unchanged.

    def __init__(self, config: RemBertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # The main layer is registered under the name "rembert".
        self.rembert = TFRemBertMainLayer(config, name="rembert")

    # The three decorators below had been replaced by comments describing them; they are restored so
    # this `call` is decorated like the `call` methods of the other model classes in this file.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="google/rembert",
        output_type=TFBaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
        r"""
        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`). Set to `False` during training, `True` during generation
        """
        # Delegate to the main layer; its result is returned as-is.
        outputs = self.rembert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs

    def build(self, input_shape=None):
        """Build the main layer once, inside a name scope carrying its layer name."""
        if self.built:
            return
        self.built = True
        if getattr(self, "rembert", None) is not None:
            with tf.name_scope(self.rembert.name):
                self.rembert.build(None)
# RemBERT main layer (without pooler) topped with a masked-language-modeling head.
@add_start_docstrings("""RemBERT Model with a `language modeling` head on top.""", REMBERT_START_DOCSTRING)
class TFRemBertForMaskedLM(TFRemBertPreTrainedModel, TFMaskedLanguageModelingLoss):
    def __init__(self, config: RemBertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # A decoder configuration only triggers a warning; construction continues regardless.
        if config.is_decoder:
            logger.warning(
                "If you want to use `TFRemBertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        # Main layer without a pooler; the MLM head is given the main layer's embeddings.
        self.rembert = TFRemBertMainLayer(config, name="rembert", add_pooling_layer=False)
        self.mlm = TFRemBertMLMHead(config, input_embeddings=self.rembert.embeddings, name="mlm___cls")

    def get_lm_head(self) -> keras.layers.Layer:
        """Return the prediction layer held by the MLM head."""
        return self.mlm.predictions

    @unpack_inputs
    @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="google/rembert",
        output_type=TFMaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        outputs = self.rembert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first output element is fed to the MLM head.
        sequence_output = outputs[0]
        prediction_scores = self.mlm(sequence_output=sequence_output, training=training)

        # A loss is computed only when labels were supplied.
        loss = None
        if labels is not None:
            loss = self.hf_compute_loss(labels=labels, logits=prediction_scores)

        if not return_dict:
            # Tuple form: scores followed by the outputs from index 2 on, with the loss prepended if present.
            output = (prediction_scores,) + outputs[2:]
            return output if loss is None else (loss,) + output

        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the main layer and then the MLM head once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        for attr_name in ("rembert", "mlm"):
            sublayer = getattr(self, attr_name, None)
            if sublayer is None:
                continue
            with tf.name_scope(sublayer.name):
                sublayer.build(None)
@add_start_docstrings(
    """RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING
)
class TFRemBertForCausalLM(TFRemBertPreTrainedModel, TFCausalLanguageModelingLoss):
    def __init__(self, config: RemBertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # A non-decoder configuration only triggers a warning; construction continues regardless.
        if not config.is_decoder:
            logger.warning("If you want to use `TFRemBertForCausalLM` as a standalone, add `is_decoder=True.`")

        # Main layer without a pooler; the LM head is given the main layer's embeddings.
        self.rembert = TFRemBertMainLayer(config, name="rembert", add_pooling_layer=False)
        self.mlm = TFRemBertMLMHead(config, input_embeddings=self.rembert.embeddings, name="mlm___cls")

    def get_lm_head(self) -> keras.layers.Layer:
        """Return the prediction layer held by the LM head."""
        return self.mlm.predictions

    # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        """Assemble the keyword arguments passed to the model for a generation step.

        Returns:
            Dict with the keys `"input_ids"`, `"attention_mask"` and `"past_key_values"`.
        """
        input_shape = input_ids.shape
        # Without a mask, use an all-ones mask shaped like `input_ids`.
        if attention_mask is None:
            attention_mask = tf.ones(input_shape)

        # When past key/values are given, only the last input id of each row is kept.
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}

    @unpack_inputs
    @add_code_sample_docstrings(
        checkpoint="google/rembert",
        output_type=TFCausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
        r"""
        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`). Set to `False` during training, `True` during generation
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
            config.vocab_size - 1]`.
        """
        # The forward pass had been lost (the body of `build` sat here instead, so `call` returned None).
        # It is restored along the lines of the masked-LM model in this file, plus the causal label shift.
        outputs = self.rembert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]
        logits = self.mlm(sequence_output=sequence_output, training=training)
        loss = None

        if labels is not None:
            # Position t predicts token t + 1: drop the last logit and the first label.
            shifted_logits = logits[:, :-1]
            labels = labels[:, 1:]
            loss = self.hf_compute_loss(labels=labels, logits=shifted_logits)

        if not return_dict:
            # Same tuple layout as the masked-LM head: logits, then the outputs from index 2 on.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFCausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def build(self, input_shape=None):
        """Build the main layer and then the LM head once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "rembert", None) is not None:
            with tf.name_scope(self.rembert.name):
                self.rembert.build(None)
        if getattr(self, "mlm", None) is not None:
            with tf.name_scope(self.mlm.name):
                self.mlm.build(None)

# The decorator call below was missing the comma after its first argument and its closing
# parenthesis, and was followed by a bare prose line; all three made the file unparsable.
@add_start_docstrings(
    """
    RemBERT Model transformer with a sequence classification/regression head on top e.g., for GLUE tasks.
    """,
    REMBERT_START_DOCSTRING,
)
class TFRemBertForSequenceClassification(TFRemBertPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: RemBertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels

        # Main layer with its default pooling layer; `call` reads the pooled output.
        self.rembert = TFRemBertMainLayer(config, name="rembert")
        # Dropout applied to the pooled output before classification.
        self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
        # Dense classifier with one unit per label.
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )
        # Kept so `build` can read `hidden_size`.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="google/rembert",
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.rembert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The second output element is the pooled representation fed to the classifier.
        pooled_output = outputs[1]
        pooled_output = self.dropout(inputs=pooled_output, training=training)
        logits = self.classifier(inputs=pooled_output)
        # A loss is computed only when labels were supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            # Tuple form: logits followed by the outputs from index 2 on, with the loss prepended if present.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the main layer and the classifier once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "rembert", None) is not None:
            with tf.name_scope(self.rembert.name):
                self.rembert.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # The classifier's last input dimension is the configured hidden size.
                self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
    """
    RemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    REMBERT_START_DOCSTRING,
)
class TFRemBertForMultipleChoice(TFRemBertPreTrainedModel, TFMultipleChoiceLoss):
    def __init__(self, config: RemBertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # Main layer with its default pooling layer; `call` reads the pooled output.
        self.rembert = TFRemBertMainLayer(config, name="rembert")
        self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
        # A single output unit: one score per flattened (example, choice) row.
        self.classifier = keras.layers.Dense(
            units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        # Kept so `build` can read `hidden_size`.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="google/rembert",
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
        """

        # Dimensions 1 and 2 of whichever input is present give the choice count and sequence length.
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
            seq_length = shape_list(input_ids)[2]
        else:
            num_choices = shape_list(inputs_embeds)[1]
            seq_length = shape_list(inputs_embeds)[2]

        # Merge the leading dimensions of every provided input so each row has length `seq_length`;
        # inputs that are None stay None.
        flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = (
            tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
        )
        flat_token_type_ids = (
            tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
        )
        flat_position_ids = (
            tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None
        )
        # Embeddings keep their trailing dimension (index 3) while the leading ones are merged.
        flat_inputs_embeds = (
            tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
            if inputs_embeds is not None
            else None
        )

        outputs = self.rembert(
            input_ids=flat_input_ids,
            attention_mask=flat_attention_mask,
            token_type_ids=flat_token_type_ids,
            position_ids=flat_position_ids,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # The second output element is the pooled representation fed to the classifier.
        pooled_output = outputs[1]
        pooled_output = self.dropout(inputs=pooled_output, training=training)
        logits = self.classifier(inputs=pooled_output)
        # Regroup the per-row scores into rows of `num_choices` scores.
        reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
        # A loss is computed only when labels were supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)

        if not return_dict:
            # Tuple form: logits followed by the outputs from index 2 on, with the loss prepended if present.
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    # The `def` line of this method had been lost, leaving its body at class level
    # (`return` outside a function, `self` undefined); the method is restored here.
    def build(self, input_shape=None):
        """Build the main layer and the classifier once, each inside its own name scope."""
        if self.built:
            return
        self.built = True

        if getattr(self, "rembert", None) is not None:
            with tf.name_scope(self.rembert.name):
                self.rembert.build(None)

        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # The classifier's last input dimension is the configured hidden size.
                self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
    """
    RemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    REMBERT_START_DOCSTRING,
)
class TFRemBertForTokenClassification(TFRemBertPreTrainedModel, TFTokenClassificationLoss):
    def __init__(self, config: RemBertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels

        # Main layer without a pooler; `call` classifies the first output element directly.
        self.rembert = TFRemBertMainLayer(config, name="rembert", add_pooling_layer=False)
        # Dropout applied before the classifier, using the hidden dropout probability.
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        # Dense classifier with one unit per label.
        self.classifier = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        # Kept so `build` can read `hidden_size`.
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="google/rembert",
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        outputs = self.rembert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # The first output element is classified position by position.
        sequence_output = outputs[0]
        sequence_output = self.dropout(inputs=sequence_output, training=training)
        logits = self.classifier(inputs=sequence_output)
        # A loss is computed only when labels were supplied.
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            # Start at index 2, like every other head in this file (including the poolerless LM
            # heads): index 1 is the pooled-output slot and does not belong in this head's tuple.
            # This previously read `outputs[1:]`.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the main layer and the classifier once, each inside its own name scope."""
        if self.built:
            return
        self.built = True
        if getattr(self, "rembert", None) is not None:
            with tf.name_scope(self.rembert.name):
                self.rembert.build(None)
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                # The classifier's last input dimension is the configured hidden size.
                self.classifier.build([None, None, self.config.hidden_size])
"""
RemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
    """
    RemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    REMBERT_START_DOCSTRING,
)
class TFRemBertForQuestionAnswering(TFRemBertPreTrainedModel, TFQuestionAnsweringLoss):
    def __init__(self, config: RemBertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # Number of output units of the QA projection below.
        self.num_labels = config.num_labels

        # RemBERT encoder, created without its pooling layer: only `outputs[0]` is consumed in `call`.
        self.rembert = TFRemBertMainLayer(config, add_pooling_layer=False, name="rembert")
        # Dense projection producing `config.num_labels` values per token.
        self.qa_outputs = keras.layers.Dense(
            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        # Kept so that `build` can read `hidden_size`.
        self.config = config

    # NOTE: the decorators below extend this method's docstring, so the docstring itself only documents the
    # label arguments; the remaining arguments are simply forwarded to the RemBERT main layer.
    @unpack_inputs
    @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="google/rembert",
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
        r"""
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # Run the RemBERT encoder; every argument is forwarded unchanged.
        outputs = self.rembert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # First element of the encoder output is the per-token sequence output.
        sequence_output = outputs[0]
        logits = self.qa_outputs(inputs=sequence_output)
        # Split the last axis into two equal halves: start logits and end logits.
        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
        # Drop the trailing axis of each half.
        # NOTE(review): this assumes `config.num_labels == 2`, so that each half has a trailing size of 1 — confirm.
        start_logits = tf.squeeze(input=start_logits, axis=-1)
        end_logits = tf.squeeze(input=end_logits, axis=-1)
        loss = None

        # The loss is only computed when BOTH label tensors are supplied.
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))

        if not return_dict:
            # Tuple output: the two logits followed by the encoder outputs from index 2 onwards.
            # NOTE(review): index 1 is presumably the (unused) pooled-output slot — verify against TFRemBertMainLayer.
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        """Build the RemBERT main layer and the QA head once; `input_shape` is not read."""
        if self.built:
            return
        self.built = True
        # Build the RemBERT main layer under a name scope taken from its own name.
        if getattr(self, "rembert", None) is not None:
            with tf.name_scope(self.rembert.name):
                self.rembert.build(None)
        # Build the QA head for inputs whose last dimension is `config.hidden_size`.
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                self.qa_outputs.build([None, None, self.config.hidden_size])

`.\models\rembert\tokenization_rembert.py`

# coding=utf-8
# 声明编码方式为UTF-8，确保支持各种字符集

# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
# 版权声明：HuggingFace 团队和 HuggingFace 公司保留所有权利。

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 根据 Apache License, Version 2.0 许可协议授权，详见上述链接

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 除非有适用法律要求或书面同意，本软件按"原样"分发，不附带任何明示或暗示的担保或条件。
# 请参阅许可协议了解更多信息。

"""Tokenization classes for RemBERT."""
# 用于 RemBERT 的分词类

import os
from shutil import copyfile
from typing import List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging

# 导入所需模块和类

logger = logging.get_logger(__name__)

# Module-level logger.

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.model"}

# File name used for the vocabulary file: the SentencePiece model is stored as `sentencepiece.model`.

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "google/rembert": "https://huggingface.co/google/rembert/resolve/main/sentencepiece.model",
    },
}

# Download URL of the SentencePiece model for the `google/rembert` checkpoint.

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "google/rembert": 256,
}

# Per-checkpoint size (256 for `google/rembert`); the tokenizer class exposes it as `max_model_input_sizes`.

class RemBertTokenizer(PreTrainedTokenizer):
    """
    Construct a RemBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Stored on the tokenizer and forwarded to the base class; `_tokenize` in this class does not consult it.
        remove_space (`bool`, *optional*, defaults to `True`):
            Stored on the tokenizer and forwarded to the base class; `_tokenize` in this class does not consult it.
        keep_accents (`bool`, *optional*, defaults to `True`):
            Stored on the tokenizer and forwarded to the base class; `_tokenize` in this class does not consult it.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    # Module-level tables exposed as class attributes.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        remove_space=True,
        keep_accents=True,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # A plain-string mask token is wrapped in an AddedToken with lstrip=True / rstrip=False;
        # an already-constructed token object is passed through untouched.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        # NOTE(review): the SentencePiece model is loaded BEFORE super().__init__(), presumably because the base
        # initializer may already query the vocabulary — keep this order unless confirmed otherwise.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

        super().__init__(
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Number of pieces in the loaded SentencePiece model."""
        return len(self.sp_model)

    def get_vocab(self):
        """Return a token -> id dict covering ids `0..vocab_size-1`, updated with `added_tokens_encoder`."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        """Return a copy of the instance dict with `sp_model` blanked out; `__setstate__` reloads it."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Restore the instance dict and reload the SentencePiece model from `self.vocab_file`."""
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text, sample=False):
        """Tokenize a string into SentencePiece pieces (`sample` is accepted but not used)."""
        pieces = self.sp_model.EncodeAsPieces(text)
        return pieces

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Decode a sequence of pieces back into a single string with the SentencePiece model."""
        out_string = self.sp_model.decode_pieces(tokens)
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating and adding special tokens. A
        RemBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Return a mask marking special tokens with 1 and sequence tokens with 0, laid out like the output of
        `build_inputs_with_special_tokens`.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether `token_ids_0` is already formatted with special tokens.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: If `already_has_special_tokens` is `True` and `token_ids_1` is also given.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        # [CLS] A [SEP]
        mask = [1] + [0] * len(token_ids_0) + [1]
        if token_ids_1 is not None:
            # ... B [SEP]
            mask += [0] * len(token_ids_1) + [1]
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # The first segment covers `[CLS] A [SEP]`, so its length includes both special tokens.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # The second segment covers `B [SEP]`.
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the SentencePiece model into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory to write into. If it is not a directory, an error is logged and `None` is
                returned.
            filename_prefix (`str`, *optional*):
                When given, the file is named `<prefix>-sentencepiece.model` instead of `sentencepiece.model`.

        Returns:
            `Tuple[str]`: One-element tuple with the path of the vocabulary file.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            # Source file exists somewhere else: copy it.
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # Source file is gone: dump the in-memory model instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

`.\models\rembert\tokenization_rembert_fast.py`

# coding=utf-8
# 声明文件编码格式为UTF-8

# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
# 版权声明，版权属于Google AI、Google Brain和HuggingFace Inc.团队。

# Licensed under the Apache License, Version 2.0 (the "License");
# 以Apache License 2.0版本授权许可，详细信息可查阅License链接

# you may not use this file except in compliance with the License.
# 除非遵守许可证，否则不得使用此文件。

# You may obtain a copy of the License at
# 可在上述链接获取许可证的副本。

#     http://www.apache.org/licenses/LICENSE-2.0
#     许可证的URL地址

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and
# limitations under the License.
# 查阅许可证获取更多具体的权限和限制信息。

""" Tokenization classes for RemBERT model."""
# 注释：RemBERT模型的分词类

import os
# 导入标准库os中的功能
from shutil import copyfile
# 从shutil模块中导入copyfile函数
from typing import List, Optional, Tuple
# 导入typing模块中的List、Optional和Tuple类型

from ...tokenization_utils import AddedToken
# 从上级目录中的tokenization_utils模块导入AddedToken类
from ...tokenization_utils_fast import PreTrainedTokenizerFast
# 从上级目录中的tokenization_utils_fast模块导入PreTrainedTokenizerFast类
from ...utils import is_sentencepiece_available, logging
# 从上级目录中的utils模块导入is_sentencepiece_available和logging功能

if is_sentencepiece_available():
    # sentencepiece is installed:
    from .tokenization_rembert import RemBertTokenizer
    # the slow tokenizer class can be imported from the sibling module.
else:
    # sentencepiece is not installed:
    RemBertTokenizer = None
    # fall back to None so that the name is always defined.

logger = logging.get_logger(__name__)
# Module-level logger.
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.model", "tokenizer_file": "tokenizer.json"}
# File names used for the SentencePiece vocabulary file and the tokenizer file.

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "google/rembert": "https://huggingface.co/google/rembert/resolve/main/sentencepiece.model",
    },
    "tokenizer_file": {
        "google/rembert": "https://huggingface.co/google/rembert/resolve/main/tokenizer.json",
    },
}
# Download URLs of the vocabulary file and the tokenizer file for the `google/rembert` checkpoint.

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "google/rembert": 256,
}
# Per-checkpoint size (256 for `google/rembert`); the tokenizer class exposes it as `max_model_input_sizes`.

SPIECE_UNDERLINE = "▁"
# The SentencePiece underline character.

class RemBertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" RemBert tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods

    Args:
        vocab_file (`str`, *optional*):
            Path to the SentencePiece file (generally has a *.spm* extension) that contains the vocabulary necessary
            to instantiate a tokenizer.
        tokenizer_file (`str`, *optional*):
            Forwarded unchanged to [`PreTrainedTokenizerFast`].
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        remove_space (`bool`, *optional*, defaults to `True`):
            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (`bool`, *optional*, defaults to `False`):
            Whether or not to keep accents when tokenizing.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pretraining. When building a sequence using special
            tokens, the token actually used is the `cls_token`.
        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token. When building a sequence using special tokens, the token actually used is the
            `sep_token`.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token, standing in for a token that is not in the vocabulary.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification. It is the first token of the
            sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    # Module-level tables exposed as class attributes, plus the matching slow tokenizer class
    # (`None` when sentencepiece is not installed).
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = RemBertTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # A plain-string mask token is wrapped in an AddedToken with lstrip=True / rstrip=False;
        # an already-constructed token object is passed through untouched.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        # Keep the configuration on the instance; `vocab_file` is read by `can_save_slow_tokenizer` and
        # `save_vocabulary`.
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """`True` only when `vocab_file` is set and points to an existing file."""
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A RemBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Return a mask marking special tokens with 1 and sequence tokens with 0, laid out like the output of
        `build_inputs_with_special_tokens`.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs for the first sequence.
            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether `token_ids_0` is already formatted with special tokens.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: If `already_has_special_tokens` is `True` and `token_ids_1` is also given.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        # [CLS] A [SEP]
        mask = [1] + [0] * len(token_ids_0) + [1]
        if token_ids_1 is not None:
            # ... B [SEP]
            mask += [0] * len(token_ids_1) + [1]
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # The first segment covers `[CLS] A [SEP]`, so its length includes both special tokens.
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # The second segment covers `B [SEP]`.
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the SentencePiece vocabulary file into `save_directory`.

        Args:
            save_directory (`str`):
                Existing directory to write into. If it is not a directory, an error is logged and `None` is
                returned.
            filename_prefix (`str`, *optional*):
                When given, the file is named `<prefix>-sentencepiece.model` instead of `sentencepiece.model`.

        Returns:
            `Tuple[str]`: One-element tuple with the path of the vocabulary file.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Skip the copy when the source already is the destination.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

`.\models\rembert\__init__.py`

# 引入类型检查模块，用于检查类型相关的导入
from typing import TYPE_CHECKING

# 从工具模块中导入所需内容：可选依赖不可用异常、延迟加载模块、判断是否有句子分词模块可用、判断是否有 TensorFlow 模块可用、判断是否有 Tokenizers 模块可用、判断是否有 PyTorch 模块可用
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_sentencepiece_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Maps each submodule name to the public names it exports; handed to `_LazyModule` at the bottom of this file.
# Every entry added below must be mirrored by an import in the `TYPE_CHECKING` branch.
_import_structure = {
    "configuration_rembert": ["REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RemBertConfig", "RemBertOnnxConfig"]
}

# Probe for sentencepiece; the exception is raised and caught locally only to select the branch.
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass  # sentencepiece is missing: register nothing and carry on

# sentencepiece is available: register RemBertTokenizer
else:
    _import_structure["tokenization_rembert"] = ["RemBertTokenizer"]

# Probe for the tokenizers library.
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass  # tokenizers is missing: register nothing and carry on

# tokenizers is available: register RemBertTokenizerFast
else:
    _import_structure["tokenization_rembert_fast"] = ["RemBertTokenizerFast"]

# Probe for PyTorch.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass  # PyTorch is missing: register nothing and carry on

# PyTorch is available: register the PyTorch RemBERT names
else:
    _import_structure["modeling_rembert"] = [
        "REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "RemBertForCausalLM",
        "RemBertForMaskedLM",
        "RemBertForMultipleChoice",
        "RemBertForQuestionAnswering",
        "RemBertForSequenceClassification",
        "RemBertForTokenClassification",
        "RemBertLayer",
        "RemBertModel",
        "RemBertPreTrainedModel",
        "load_tf_weights_in_rembert",
    ]

# Probe for TensorFlow.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass  # TensorFlow is missing: register nothing and carry on

# TensorFlow is available: register the TensorFlow RemBERT names
else:
    _import_structure["modeling_tf_rembert"] = [
        "TF_REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFRemBertForCausalLM",
        "TFRemBertForMaskedLM",
        "TFRemBertForMultipleChoice",
        "TFRemBertForQuestionAnswering",
        "TFRemBertForSequenceClassification",
        "TFRemBertForTokenClassification",
        "TFRemBertLayer",
        "TFRemBertModel",
        "TFRemBertPreTrainedModel",
    ]

# Under static type checking, perform the real imports so that checkers see the actual names.
if TYPE_CHECKING:
    from .configuration_rembert import REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RemBertConfig, RemBertOnnxConfig

    # Same sentencepiece probe as above.
    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass  # sentencepiece is missing: skip the import

    # sentencepiece is available: import RemBertTokenizer
    else:
        from .tokenization_rembert import RemBertTokenizer
    # Same tokenizers probe as above.
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    # tokenizers is missing: skip the import
    except OptionalDependencyNotAvailable:
        pass
    else:
        # tokenizers is available: import RemBertTokenizerFast
        from .tokenization_rembert_fast import RemBertTokenizerFast

    # Same PyTorch probe as above.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    # PyTorch is missing: skip the import
    except OptionalDependencyNotAvailable:
        pass
    else:
        # PyTorch is available: import the PyTorch RemBERT names
        from .modeling_rembert import (
            REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            RemBertForCausalLM,
            RemBertForMaskedLM,
            RemBertForMultipleChoice,
            RemBertForQuestionAnswering,
            RemBertForSequenceClassification,
            RemBertForTokenClassification,
            RemBertLayer,
            RemBertModel,
            RemBertPreTrainedModel,
            load_tf_weights_in_rembert,
        )

    # Same TensorFlow probe as above.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    # TensorFlow is missing: skip the import
    except OptionalDependencyNotAvailable:
        pass
    else:
        # TensorFlow is available: import the TensorFlow RemBERT names
        from .modeling_tf_rembert import (
            TF_REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFRemBertForCausalLM,
            TFRemBertForMaskedLM,
            TFRemBertForMultipleChoice,
            TFRemBertForQuestionAnswering,
            TFRemBertForSequenceClassification,
            TFRemBertForTokenClassification,
            TFRemBertLayer,
            TFRemBertModel,
            TFRemBertPreTrainedModel,
        )
else:
    # At runtime nothing is imported here.
    import sys

    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\resnet\configuration_resnet.py`

# coding=utf-8
# 文件编码声明，指定使用 UTF-8 编码格式
# Copyright 2022 Microsoft Research, Inc. and The HuggingFace Inc. team. All rights reserved.
# 版权声明，版权归 Microsoft Research, Inc. 和 HuggingFace Inc. 团队所有
#
# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 许可协议授权，除非符合许可协议要求，否则不得使用此文件
# you may not use this file except in compliance with the License.
# 未经许可，不得使用此文件
# You may obtain a copy of the License at
# 可以在以下链接获取许可协议的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非适用法律要求或书面同意，否则按“原样”分发本软件，不附带任何明示或暗示的担保或条件
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 不提供任何类型的担保或条件
# See the License for the specific language governing permissions and
# limitations under the License.
# 查看许可协议，了解权限和限制
""" ResNet model configuration"""
# ResNet 模型配置说明文档

from collections import OrderedDict
# 导入 OrderedDict 类，用于创建有序字典
from typing import Mapping
# 导入 Mapping 类型，用于类型提示

from packaging import version
# 导入 version 模块，用于版本管理

from ...configuration_utils import PretrainedConfig
# 从上级目录中导入 PretrainedConfig 类
from ...onnx import OnnxConfig
# 从上级目录中导入 OnnxConfig 类
from ...utils import logging
# 从上级目录中导入 logging 模块
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
# 从上级目录中导入 BackboneConfigMixin 类和 get_aligned_output_features_output_indices 函数

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Maps a pretrained checkpoint name to the URL of its `config.json`.
RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/resnet-50": "https://huggingface.co/microsoft/resnet-50/blob/main/config.json",
}

class ResNetConfig(BackboneConfigMixin, PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ResNetModel`]. It is used to instantiate an
    ResNet model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the ResNet
    [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels of the input images.
        embedding_size (`int`, *optional*, defaults to 64):
            Number of output channels of the stem (embedding) convolution.
        hidden_sizes (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
            Number of output channels of each stage.
        depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 3]`):
            Number of residual layers in each stage.
        layer_type (`str`, *optional*, defaults to `"bottleneck"`):
            Residual layer flavour; must be one of `layer_types` (`"basic"` or `"bottleneck"`).
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            Name of the activation function used by the layers.
        downsample_in_first_stage (`bool`, *optional*, defaults to `False`):
            If `True`, the first stage uses a stride of 2 instead of 1.
        downsample_in_bottleneck (`bool`, *optional*, defaults to `False`):
            Stored as-is on the config. NOTE(review): presumably selects where a bottleneck layer applies its
            stride; confirm against the modeling code that reads it.
        out_features (*optional*):
            Forwarded to `get_aligned_output_features_output_indices` together with `stage_names`.
        out_indices (*optional*):
            Forwarded to `get_aligned_output_features_output_indices` together with `stage_names`.

    Raises:
        ValueError: If `layer_type` is not one of `layer_types`.
    """

    model_type = "resnet"
    # Residual layer flavours accepted for `layer_type`.
    layer_types = ["basic", "bottleneck"]

    def __init__(
        self,
        num_channels=3,
        embedding_size=64,
        hidden_sizes=None,
        depths=None,
        layer_type="bottleneck",
        hidden_act="relu",
        downsample_in_first_stage=False,
        downsample_in_bottleneck=False,
        out_features=None,
        out_indices=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if layer_type not in self.layer_types:
            raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}")
        # Build the default lists per call: list defaults in the signature would be shared by every
        # config created without these arguments, so mutating one config would silently change the others.
        if hidden_sizes is None:
            hidden_sizes = [256, 512, 1024, 2048]
        if depths is None:
            depths = [3, 4, 6, 3]
        self.num_channels = num_channels
        self.embedding_size = embedding_size
        self.hidden_sizes = hidden_sizes
        self.depths = depths
        self.layer_type = layer_type
        self.hidden_act = hidden_act
        self.downsample_in_first_stage = downsample_in_first_stage
        self.downsample_in_bottleneck = downsample_in_bottleneck
        # "stem" followed by "stage1" ... "stageN", one entry per element of `depths`.
        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
        )
class ResNetOnnxConfig(OnnxConfig):
    """ONNX export configuration for ResNet, built on top of `OnnxConfig`."""

    # Minimum torch version declared for this export config, parsed from "1.11".
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return an ordered mapping from input name to its `{axis index: axis name}` mapping."""
        axes_by_input = OrderedDict()
        axes_by_input["pixel_values"] = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        return axes_by_input

    @property
    def atol_for_validation(self) -> float:
        """Return the absolute tolerance value `1e-3`."""
        tolerance = 1e-3
        return tolerance

`.\models\resnet\convert_resnet_to_pytorch.py`

# 设置编码格式为 UTF-8

# 版权声明，这段代码由 HuggingFace Inc. 团队所有，遵循 Apache License, Version 2.0

# 导入必要的库和模块
import argparse  # 用于解析命令行参数
import json  # 处理 JSON 格式数据
from dataclasses import dataclass, field  # 用于创建数据类，支持默认字段
from functools import partial  # 创建偏函数
from pathlib import Path  # 处理文件和目录路径
from typing import List  # 定义类型提示

import timm  # 导入 timm 库，用于模型加载
import torch  # PyTorch 库
import torch.nn as nn  # PyTorch 神经网络模块
from huggingface_hub import hf_hub_download  # 从 Hugging Face Hub 下载模型
from torch import Tensor  # PyTorch 张量类型

# 从 transformers 库中导入必要的模块和函数
from transformers import AutoImageProcessor, ResNetConfig, ResNetForImageClassification
from transformers.utils import logging  # 导入 logging 模块

# Set the transformers logging verbosity to INFO for this script.
logging.set_verbosity_info()

# Logger used by this script.
logger = logging.get_logger()


@dataclass
class Tracker:
    """
    Record the leaf operations executed during one forward pass of `module`.

    Calling the tracker registers a forward hook on every sub-module of `module`, runs `module(x)` once and
    appends to `traced`, in execution order, every module that has no sub-modules or is an `nn.Conv2d` /
    `nn.BatchNorm2d`. All hooks are removed again before the call returns, even if the forward pass raises.
    """

    module: nn.Module  # module whose forward pass is traced
    traced: List[nn.Module] = field(default_factory=list)  # recorded modules, in call order
    handles: list = field(default_factory=list)  # handles of the hooks that are currently registered

    def _forward_hook(self, m, inputs: Tensor, outputs: Tensor):
        """Forward hook: record `m` when it counts as a leaf operation."""
        # `m.modules()` includes `m` itself, so a length of 1 means `m` has no sub-modules.
        has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, (nn.Conv2d, nn.BatchNorm2d))
        if has_not_submodules:
            self.traced.append(m)

    def __call__(self, x: Tensor):
        """
        Run `self.module(x)` with the tracing hooks installed.

        Args:
            x: Input passed unchanged to `self.module`.

        Returns:
            `self`, so that `Tracker(module)(x).parametrized` can be chained.
        """
        for m in self.module.modules():
            self.handles.append(m.register_forward_hook(self._forward_hook))
        try:
            self.module(x)
        finally:
            # Always detach the hooks so a failing forward pass does not leave them on the module.
            for handle in self.handles:
                handle.remove()
            # Drop the stale handles so a later call does not try to remove them again.
            self.handles.clear()
        return self

    @property
    def parametrized(self):
        """Return the traced modules whose `state_dict()` is not empty."""
        return [m for m in self.traced if len(m.state_dict()) > 0]


@dataclass
class ModuleTransfer:
    """
    Copy weights from `src` into `dest` by pairing up the operations traced in each module.

    Both modules are traced with `Tracker` on the same input; the traced modules that have a non-empty state
    dict are matched positionally and each source state dict is loaded into its destination counterpart.
    """

    src: nn.Module  # module the weights are read from
    dest: nn.Module  # module the weights are loaded into
    verbose: int = 0  # when equal to 1, print every transferred pair
    src_skip: List = field(default_factory=list)  # exact types dropped from the source trace
    dest_skip: List = field(default_factory=list)  # exact types dropped from the destination trace

    def __call__(self, x: Tensor):
        """
        Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the
        hood we tracked all the operations in both modules.

        Raises:
            Exception: If, after applying the skip lists, the two traces differ in length.
        """
        # The destination is traced before the source, as in the original ordering of the forward passes.
        dest_ops = Tracker(self.dest)(x).parametrized
        src_ops = Tracker(self.src)(x).parametrized

        # Skip lists match on the exact type of each traced module.
        src_ops = [op for op in src_ops if type(op) not in self.src_skip]
        dest_ops = [op for op in dest_ops if type(op) not in self.dest_skip]

        if len(dest_ops) != len(src_ops):
            raise Exception(
                f"Numbers of operations are different. Source module has {len(src_ops)} operations while"
                f" destination module has {len(dest_ops)}."
            )

        announce = self.verbose == 1
        for target, source in zip(dest_ops, src_ops):
            target.load_state_dict(source.state_dict())
            if announce:
                print(f"Transfered from={source} to={target}")
def convert_weight_and_push(name: str, config: ResNetConfig, save_directory: Path, push_to_hub: bool = True):
    """
    Convert one timm ResNet checkpoint into a `ResNetForImageClassification` and optionally push it.

    Args:
        name: Model name handed to `timm.create_model`.
        config: Configuration used to build the destination model.
        save_directory: Directory joined with the checkpoint name to form `repo_path_or_name`.
        push_to_hub: When true, push the converted model and an image processor.

    Raises:
        AssertionError: If the converted model's logits are not close to the original model's output.
    """
    print(f"Converting {name}...")

    # Weight copying needs no gradients.
    with torch.no_grad():
        from_model = timm.create_model(name, pretrained=True).eval()
        our_model = ResNetForImageClassification(config).eval()
        x = torch.randn((1, 3, 224, 224))
        # One forward pass through both models drives the positional weight transfer.
        ModuleTransfer(src=from_model, dest=our_model)(x)

    assert torch.allclose(from_model(x), our_model(x).logits), "The model logits don't match the original one."

    # e.g. "resnet50" -> "resnet-50"
    checkpoint_name = "resnet" + "-".join(name.split("resnet"))
    print(checkpoint_name)

    if not push_to_hub:
        return

    repo_location = save_directory / checkpoint_name
    our_model.push_to_hub(
        repo_path_or_name=repo_location,
        commit_message="Add model",
        use_temp_dir=True,
    )

    # The image processor is loaded from a ConvNeXt checkpoint and pushed next to the model.
    image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k")
    image_processor.push_to_hub(
        repo_path_or_name=repo_location,
        commit_message="Add image processor",
        use_temp_dir=True,
    )

    print(f"Pushed {checkpoint_name}")


def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
    """
    Convert one or all supported ResNet checkpoints and optionally push them to the Hub.

    Args:
        save_directory: Directory passed on to `convert_weight_and_push`.
        model_name: Key of the model to convert (e.g. `"resnet50"`). When falsy, every supported model is
            converted in turn.
        push_to_hub: Passed on to `convert_weight_and_push`.

    Returns:
        A tuple `(config, expected_shape)`: the configuration of the (last) converted model and
        `(1, num_labels)`.

    Raises:
        KeyError: If `model_name` is given but is not one of the supported names.
    """
    filename = "imagenet-1k-id2label.json"
    num_labels = 1000
    expected_shape = (1, num_labels)

    repo_id = "huggingface/label-files"
    # Close the label file deterministically instead of leaking the handle returned by `open`.
    with open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r", encoding="utf-8") as label_file:
        raw_id2label = json.load(label_file)
    # JSON object keys are strings; the config wants integer class ids.
    id2label = {int(k): v for k, v in raw_id2label.items()}
    label2id = {v: k for k, v in id2label.items()}

    ImageNetPreTrainedConfig = partial(ResNetConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)

    names_to_config = {
        "resnet18": ImageNetPreTrainedConfig(
            depths=[2, 2, 2, 2], hidden_sizes=[64, 128, 256, 512], layer_type="basic"
        ),
        "resnet26": ImageNetPreTrainedConfig(
            depths=[2, 2, 2, 2], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
        ),
        "resnet34": ImageNetPreTrainedConfig(
            depths=[3, 4, 6, 3], hidden_sizes=[64, 128, 256, 512], layer_type="basic"
        ),
        "resnet50": ImageNetPreTrainedConfig(
            depths=[3, 4, 6, 3], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
        ),
        "resnet101": ImageNetPreTrainedConfig(
            depths=[3, 4, 23, 3], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
        ),
        "resnet152": ImageNetPreTrainedConfig(
            depths=[3, 8, 36, 3], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
        ),
    }

    if model_name:
        # Bind `config` here as well: the return statement below needs it in both branches.
        config = names_to_config[model_name]
        convert_weight_and_push(model_name, config, save_directory, push_to_hub)
    else:
        # Use a separate loop name so the `model_name` parameter is not overwritten.
        for name, config in names_to_config.items():
            convert_weight_and_push(name, config, save_directory, push_to_hub)
    return config, expected_shape
def str_to_bool(value):
    """
    Parse a command-line boolean.

    `argparse` with `type=bool` calls `bool()` on the raw string, so any non-empty value (including
    `"False"`) becomes `True`. This parser accepts the usual spellings instead; an empty string still maps to
    `False`, as it did with `type=bool`.

    Raises:
        argparse.ArgumentTypeError: If `value` is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Optional: which model to convert (all supported models when omitted).
    parser.add_argument(
        "--model_name",
        default=None,
        type=str,
        help=(
            "要转换的模型名称，必须是支持的 resnet* 架构之一，"
            "目前支持的有：resnet18,26,34,50,101,152。如果为 `None`，则转换所有支持的模型。"
        ),
    )
    # Required: output directory for the converted model.
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=Path,
        required=True,
        help="输出 PyTorch 模型目录的路径。",
    )
    # Optional: `--push_to_hub False` now really disables pushing.
    parser.add_argument(
        "--push_to_hub",
        default=True,
        type=str_to_bool,
        required=False,
        help="如果为 True，将模型和图像处理器推送到 hub。",
    )

    args = parser.parse_args()

    # Create the output directory (and its parents) when it does not exist yet.
    pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path
    pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True)

    convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub)

`.\models\resnet\modeling_flax_resnet.py`

# 导入必要的模块和函数
from functools import partial  # 导入 functools 模块中的 partial 函数
from typing import Optional, Tuple  # 导入类型提示的 Optional 和 Tuple

import flax.linen as nn  # 导入 flax.linen 模块并重命名为 nn
import jax  # 导入 jax 库
import jax.numpy as jnp  # 导入 jax 库中的 numpy 并重命名为 jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze  # 从 flax.core.frozen_dict 导入相关函数
from flax.traverse_util import flatten_dict, unflatten_dict  # 从 flax.traverse_util 导入 flatten_dict 和 unflatten_dict

from ...modeling_flax_outputs import (  # 导入输出相关的模块
    FlaxBaseModelOutputWithNoAttention,
    FlaxBaseModelOutputWithPoolingAndNoAttention,
    FlaxImageClassifierOutputWithNoAttention,
)
from ...modeling_flax_utils import (  # 导入实用函数和类
    ACT2FN,
    FlaxPreTrainedModel,
    append_replace_return_docstrings,
    overwrite_call_docstring,
)
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward  # 从 utils 导入函数
from .configuration_resnet import ResNetConfig  # 导入 ResNetConfig 配置类

# Class-level documentation text for the ResNet Flax models.
RESNET_START_DOCSTRING = r"""

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`ResNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""

# Documentation text for the inputs of the model's call method.
RESNET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`jax.numpy.float32` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`AutoImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class Identity(nn.Module):
    """No-op module: returns its input unchanged."""

    @nn.compact
    def __call__(self, x, **kwargs):
        # Extra keyword arguments are accepted but not used.
        return x


class FlaxResNetConvLayer(nn.Module):
    """
    Convolution, then batch normalization, then an optional activation.
    """

    out_channels: int  # number of output features of the convolution
    kernel_size: int = 3  # side length of the square kernel
    stride: int = 1  # stride of the convolution
    activation: Optional[str] = "relu"  # key into `ACT2FN`; `None` selects `Identity`
    dtype: jnp.dtype = jnp.float32  # dtype handed to the convolution and batch-norm layers

    def setup(self):
        """Create the convolution, the batch norm and the activation callable."""
        square_kernel = (self.kernel_size, self.kernel_size)
        kernel_initializer = nn.initializers.variance_scaling(
            2.0, mode="fan_out", distribution="normal", dtype=self.dtype
        )
        self.convolution = nn.Conv(
            self.out_channels,
            kernel_size=square_kernel,
            strides=self.stride,
            # Half the kernel size on each side.
            padding=self.kernel_size // 2,
            dtype=self.dtype,
            use_bias=False,
            kernel_init=kernel_initializer,
        )
        self.normalization = nn.BatchNorm(momentum=0.9, epsilon=1e-05, dtype=self.dtype)
        if self.activation is None:
            self.activation_func = Identity()
        else:
            self.activation_func = ACT2FN[self.activation]

    def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
        """Apply the three steps to `x`; `deterministic` is passed to batch norm as `use_running_average`."""
        convolved = self.convolution(x)
        normalized = self.normalization(convolved, use_running_average=deterministic)
        return self.activation_func(normalized)


class FlaxResNetEmbeddings(nn.Module):
    """
    ResNet Embeddings (stem) composed of a single aggressive convolution.
    """

    config: ResNetConfig  # supplies `embedding_size`, `hidden_act` and `num_channels`
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the 7x7, stride-2 stem convolution layer and the 3x3, stride-2 max pooling."""
        cfg = self.config
        self.embedder = FlaxResNetConvLayer(
            cfg.embedding_size,
            kernel_size=7,
            stride=2,
            activation=cfg.hidden_act,
            dtype=self.dtype,
        )
        pooling_options = {"window_shape": (3, 3), "strides": (2, 2), "padding": ((1, 1), (1, 1))}
        self.max_pool = partial(nn.max_pool, **pooling_options)

    def __call__(self, pixel_values: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
        """
        Embed `pixel_values`, whose last axis must have `config.num_channels` entries.

        Raises:
            ValueError: If the size of the last axis differs from `config.num_channels`.
        """
        # The channel count is read from the last axis of the input.
        if pixel_values.shape[-1] != self.config.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        stem_output = self.embedder(pixel_values, deterministic=deterministic)
        return self.max_pool(stem_output)


class FlaxResNetShortCut(nn.Module):
    """
    ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to
    downsample the input using `stride=2`.
    """

    out_channels: int  # number of output features of the projection
    stride: int = 2  # stride of the projection convolution
    dtype: jnp.dtype = jnp.float32  # dtype handed to the convolution and batch-norm layers

    def setup(self):
        """Create the 1x1 projection convolution and its batch normalization."""
        self.convolution = nn.Conv(
            self.out_channels,
            kernel_size=(1, 1),
            strides=self.stride,
            use_bias=False,
            kernel_init=nn.initializers.variance_scaling(2.0, mode="fan_out", distribution="truncated_normal"),
            dtype=self.dtype,
        )
        self.normalization = nn.BatchNorm(momentum=0.9, epsilon=1e-05, dtype=self.dtype)

    def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
        """Project `x`, then batch-normalize; `deterministic` is passed on as `use_running_average`."""
        hidden_state = self.convolution(x)
        hidden_state = self.normalization(hidden_state, use_running_average=deterministic)
        return hidden_state
class FlaxResNetBasicLayerCollection(nn.Module):
    """Two stacked `FlaxResNetConvLayer`s; the first carries the stride, the second has no activation."""

    out_channels: int  # output features of both convolution layers
    stride: int = 1  # stride of the first convolution layer
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the two convolution layers and store them as the list attribute `layer`."""
        strided_conv = FlaxResNetConvLayer(self.out_channels, stride=self.stride, dtype=self.dtype)
        plain_conv = FlaxResNetConvLayer(self.out_channels, activation=None, dtype=self.dtype)
        self.layer = [strided_conv, plain_conv]

    def __call__(self, hidden_state: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
        """Run `hidden_state` through both layers in order."""
        first_conv, second_conv = self.layer
        hidden_state = first_conv(hidden_state, deterministic=deterministic)
        hidden_state = second_conv(hidden_state, deterministic=deterministic)
        return hidden_state


class FlaxResNetBasicLayer(nn.Module):
    """
    A classic ResNet's residual layer composed by two `3x3` convolutions.
    """

    in_channels: int  # features of the incoming hidden state
    out_channels: int  # features produced by the layer
    stride: int = 1  # stride applied by the first convolution and by the shortcut
    activation: Optional[str] = "relu"  # key into `ACT2FN` for the final activation
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the optional projection shortcut, the convolution stack and the activation."""
        # A projection is only skipped when neither the feature count nor the resolution changes.
        keeps_shape = self.in_channels == self.out_channels and self.stride == 1
        if keeps_shape:
            self.shortcut = None
        else:
            self.shortcut = FlaxResNetShortCut(self.out_channels, stride=self.stride, dtype=self.dtype)
        self.layer = FlaxResNetBasicLayerCollection(
            out_channels=self.out_channels,
            stride=self.stride,
            dtype=self.dtype,
        )
        self.activation_func = ACT2FN[self.activation]

    def __call__(self, hidden_state, deterministic: bool = True):
        """Return `activation(layer(x) + shortcut(x))`, with the shortcut being the identity when absent."""
        transformed = self.layer(hidden_state, deterministic=deterministic)
        if self.shortcut is None:
            skip = hidden_state
        else:
            skip = self.shortcut(hidden_state, deterministic=deterministic)
        return self.activation_func(transformed + skip)


class FlaxResNetBottleNeckLayerCollection(nn.Module):
    """
    Three stacked `FlaxResNetConvLayer`s named "0", "1" and "2": a `1x1` layer down to
    `out_channels // reduction` features, a strided layer with the default `3x3` kernel at that reduced width,
    and a `1x1` layer without activation back up to `out_channels`.
    """

    out_channels: int  # features produced by the last convolution layer
    stride: int = 1  # stride of the middle convolution layer
    activation: Optional[str] = "relu"  # declared field; not read by this class
    reduction: int = 4  # divisor applied to `out_channels` for the inner width
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the three convolution layers and store them as the list attribute `layer`."""
        inner_channels = self.out_channels // self.reduction

        squeeze = FlaxResNetConvLayer(inner_channels, kernel_size=1, dtype=self.dtype, name="0")
        spatial = FlaxResNetConvLayer(inner_channels, stride=self.stride, dtype=self.dtype, name="1")
        expand = FlaxResNetConvLayer(self.out_channels, kernel_size=1, activation=None, dtype=self.dtype, name="2")
        self.layer = [squeeze, spatial, expand]

    def __call__(self, hidden_state: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
        """Run `hidden_state` through the three layers in order."""
        output = hidden_state
        for conv_layer in self.layer:
            output = conv_layer(output, deterministic=deterministic)
        return output


class FlaxResNetBottleNeckLayer(nn.Module):
    """
    A classic ResNet's bottleneck layer composed of a `1x1`, a `3x3` and a `1x1` convolution. The first `1x1`
    convolution reduces the features by a factor of `reduction` in order to make the `3x3` convolution faster. The
    last `1x1` convolution remaps the reduced features to `out_channels`.
    """

    in_channels: int  # features of the incoming hidden state
    out_channels: int  # features produced by the layer
    stride: int = 1  # stride handed to the convolution stack and to the shortcut
    activation: Optional[str] = "relu"  # key into `ACT2FN` for the final activation
    reduction: int = 4  # bottleneck reduction factor handed to the convolution stack
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the optional projection shortcut, the bottleneck stack and the activation."""
        # A projection is only skipped when neither the feature count nor the resolution changes.
        keeps_shape = self.in_channels == self.out_channels and self.stride == 1
        if keeps_shape:
            self.shortcut = None
        else:
            self.shortcut = FlaxResNetShortCut(self.out_channels, stride=self.stride, dtype=self.dtype)

        self.layer = FlaxResNetBottleNeckLayerCollection(
            self.out_channels,
            stride=self.stride,
            activation=self.activation,
            reduction=self.reduction,
            dtype=self.dtype,
        )

        self.activation_func = ACT2FN[self.activation]

    def __call__(self, hidden_state: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
        """Return `activation(layer(x) + shortcut(x))`, with the shortcut being the identity when absent."""
        # The shortcut branch is evaluated before the bottleneck stack.
        skip = hidden_state
        if self.shortcut is not None:
            skip = self.shortcut(hidden_state, deterministic=deterministic)

        bottlenecked = self.layer(hidden_state, deterministic=deterministic)
        return self.activation_func(bottlenecked + skip)
class FlaxResNetStageLayersCollection(nn.Module):
    """
    A ResNet stage composed by stacked layers.
    """

    config: ResNetConfig  # supplies `layer_type` and `hidden_act`
    in_channels: int  # features entering the stage
    out_channels: int  # features produced by every layer of the stage
    stride: int = 2  # stride of the first layer only
    depth: int = 2  # total number of layers in the stage
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create `depth` residual layers named "0" ... "depth - 1"."""
        block_cls = FlaxResNetBasicLayer
        if self.config.layer_type == "bottleneck":
            block_cls = FlaxResNetBottleNeckLayer

        # Only the first layer changes the feature count and applies the stage stride.
        entry_block = block_cls(
            self.in_channels,
            self.out_channels,
            stride=self.stride,
            activation=self.config.hidden_act,
            dtype=self.dtype,
            name="0",
        )
        following_blocks = [
            block_cls(
                self.out_channels,
                self.out_channels,
                activation=self.config.hidden_act,
                dtype=self.dtype,
                name=str(position),
            )
            for position in range(1, self.depth)
        ]

        self.layers = [entry_block, *following_blocks]

    def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
        """Run `x` through every layer of the stage in order."""
        output = x
        for block in self.layers:
            output = block(output, deterministic=deterministic)
        return output


class FlaxResNetStage(nn.Module):
    """
    A ResNet stage composed by stacked layers.

    Thin wrapper that holds a `FlaxResNetStageLayersCollection` under the attribute `layers`.
    """

    config: ResNetConfig  # forwarded to the layer collection
    in_channels: int  # features entering the stage
    out_channels: int  # features produced by the stage
    stride: int = 2  # stride forwarded to the layer collection
    depth: int = 2  # number of layers forwarded to the layer collection
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the wrapped layer collection from this module's own fields."""
        self.layers = FlaxResNetStageLayersCollection(
            config=self.config,
            in_channels=self.in_channels,
            out_channels=self.out_channels,
            stride=self.stride,
            depth=self.depth,
            dtype=self.dtype,
        )

    def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
        """Delegate to the wrapped layer collection."""
        stage_output = self.layers(x, deterministic=deterministic)
        return stage_output


class FlaxResNetStageCollection(nn.Module):
    """
    The sequence of ResNet stages, named "0", "1", ... in order.

    The first stage maps `config.embedding_size` to `config.hidden_sizes[0]` features; each later stage maps one
    entry of `config.hidden_sizes` to the next.
    """

    config: ResNetConfig  # supplies `hidden_sizes`, `depths`, `embedding_size`, `downsample_in_first_stage`
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create one `FlaxResNetStage` per entry of `config.depths`."""
        # Consecutive pairs of hidden sizes: (input features, output features) of stages 1..N-1.
        in_out_channels = zip(self.config.hidden_sizes, self.config.hidden_sizes[1:])
        stages = [
            FlaxResNetStage(
                self.config,
                self.config.embedding_size,
                self.config.hidden_sizes[0],
                # The first stage only strides when the config asks for it.
                stride=2 if self.config.downsample_in_first_stage else 1,
                depth=self.config.depths[0],
                dtype=self.dtype,
                name="0",
            )
        ]

        # Later stages rely on the default stride of `FlaxResNetStage`.
        for i, ((in_channels, out_channels), depth) in enumerate(zip(in_out_channels, self.config.depths[1:])):
            stages.append(
                FlaxResNetStage(self.config, in_channels, out_channels, depth=depth, dtype=self.dtype, name=str(i + 1))
            )

        self.stages = stages

    def __call__(
        self,
        hidden_state: jnp.ndarray,
        output_hidden_states: bool = False,
        deterministic: bool = True,
    ) -> Tuple[jnp.ndarray, Optional[Tuple[jnp.ndarray, ...]]]:
        """
        Run `hidden_state` through every stage.

        Args:
            hidden_state: Input to the first stage.
            output_hidden_states: When true, also collect the input of every stage.
            deterministic: Forwarded to every stage.

        Returns:
            A plain tuple `(hidden_state, hidden_states)`. `hidden_states` is `None` unless
            `output_hidden_states` is true; otherwise it holds the input of each stage with its last axis moved
            to position 1. The output of the last stage is not included in `hidden_states`.
        """
        hidden_states = () if output_hidden_states else None

        for stage_module in self.stages:
            if output_hidden_states:
                # Record the state entering this stage, with the last axis moved to position 1.
                hidden_states = hidden_states + (hidden_state.transpose(0, 3, 1, 2),)

            hidden_state = stage_module(hidden_state, deterministic=deterministic)

        return hidden_state, hidden_states
class FlaxResNetEncoder(nn.Module):
    """Runs the stage collection and packages its result as a tuple or a model-output object."""

    config: ResNetConfig  # forwarded to the stage collection
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Create the stage collection."""
        self.stages = FlaxResNetStageCollection(self.config, dtype=self.dtype)

    def __call__(
        self,
        hidden_state: jnp.ndarray,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ) -> FlaxBaseModelOutputWithNoAttention:
        """
        Encode `hidden_state` with all stages.

        Args:
            hidden_state: Input to the first stage.
            output_hidden_states: When true, also return the collected hidden states.
            return_dict: When false, return a plain tuple of the non-`None` results instead of a
                `FlaxBaseModelOutputWithNoAttention`.
            deterministic: Forwarded to the stages.
        """
        final_state, collected_states = self.stages(
            hidden_state, output_hidden_states=output_hidden_states, deterministic=deterministic
        )

        if output_hidden_states:
            # Append the final state too, with its last axis moved to position 1 like the other entries.
            collected_states = collected_states + (final_state.transpose(0, 3, 1, 2),)

        if return_dict:
            return FlaxBaseModelOutputWithNoAttention(
                last_hidden_state=final_state,
                hidden_states=collected_states,
            )

        return tuple(item for item in (final_state, collected_states) if item is not None)


# Abstract base class for the Flax ResNet models; concrete subclasses set `module_class`.
class FlaxResNetPreTrainedModel(FlaxPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = ResNetConfig
    # Prefix used for the base model.
    base_model_prefix = "resnet"
    # Name of the main model input.
    main_input_name = "pixel_values"
    # Flax module class instantiated in `__init__`; subclasses must override it.
    module_class: nn.Module = None

    def __init__(
        self,
        config: ResNetConfig,
        input_shape=(1, 224, 224, 3),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        """
        Build the underlying Flax module and delegate to the parent constructor.

        Args:
            config: model configuration.
            input_shape: shape of the dummy input used for initialization; when `None`, it is derived from
                `config.image_size` and `config.num_channels`.
            seed: forwarded to the parent constructor.
            dtype: dtype passed to the module and to the parent constructor.
            _do_init: forwarded to the parent constructor.
            **kwargs: extra keyword arguments passed to `module_class`.
        """
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        if input_shape is None:
            input_shape = (1, config.image_size, config.image_size, config.num_channels)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        """
        Initialize module variables from an all-zeros input of shape `input_shape`.

        When `params` is given, only the entries listed in `self._missing_keys` are copied over from the freshly
        initialized values, `self._missing_keys` is reset, and the merged (frozen) dict is returned. Otherwise the
        freshly initialized variables are returned as-is.
        """
        # Dummy input used only to trace shapes during initialization.
        pixel_values = jnp.zeros(input_shape, dtype=self.dtype)

        rngs = {"params": rng}

        random_params = self.module.init(rngs, pixel_values, return_dict=False)

        if params is not None:
            # Work on flat dicts so that individual missing keys can be filled in.
            random_params = flatten_dict(unfreeze(random_params))
            params = flatten_dict(unfreeze(params))
            for missing_key in self._missing_keys:
                params[missing_key] = random_params[missing_key]
            self._missing_keys = set()
            return freeze(unflatten_dict(params))
        else:
            return random_params

    # The original text carried a stray `pass` and a second `):` between the signature and the body, which made
    # the method a syntax error; the signature and the body are now joined. The return annotation was dropped
    # because the result is whatever `module.apply` returns, which is a tuple when `train=True` (see `mutable`).
    @add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
    def __call__(
        self,
        pixel_values,
        params: dict = None,
        train: bool = False,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ):
        # NOTE: `**kwargs` is accepted for signature compatibility but is not forwarded to the module.

        # Fall back to the configuration when the flags are not given explicitly.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Move axis 1 to the end: (batch, channels, height, width) -> (batch, height, width, channels).
        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))

        # No random number generators are passed to the module.
        rngs = {}

        return self.module.apply(
            {
                "params": params["params"] if params is not None else self.params["params"],
                "batch_stats": params["batch_stats"] if params is not None else self.params["batch_stats"],
            },
            jnp.array(pixel_values, dtype=jnp.float32),
            not train,  # positional `deterministic` argument of the module
            output_hidden_states,
            return_dict,
            rngs=rngs,
            # In training mode `batch_stats` is mutable, so `apply` returns (outputs, updated_variables).
            mutable=["batch_stats"] if train else False,
        )
# Core Flax ResNet module: embedder -> encoder -> global average pooling.
class FlaxResNetModule(nn.Module):
    """
    Chains the embedding layer and the encoder, then average-pools the last feature map over its two middle axes.

    Attributes:
        config: ResNet configuration shared with the sub-modules.
        dtype: dtype forwarded to the sub-modules (defaults to `jnp.float32`).
    """

    config: ResNetConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.embedder = FlaxResNetEmbeddings(self.config, dtype=self.dtype)
        self.encoder = FlaxResNetEncoder(self.config, dtype=self.dtype)

        # `nn.avg_pool` with zero padding pre-bound; window and strides are supplied per call.
        self.pooler = partial(nn.avg_pool, padding=((0, 0), (0, 0)))

    def __call__(
        self,
        pixel_values,
        deterministic: bool = True,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> FlaxBaseModelOutputWithPoolingAndNoAttention:
        """
        Args:
            pixel_values: input passed to the embedder.
            deterministic: forwarded to the embedder and the encoder.
            output_hidden_states: whether to return intermediate hidden states; `None` falls back to the config.
            return_dict: whether to return an output object instead of a tuple; `None` falls back to the config.

        Returns:
            `FlaxBaseModelOutputWithPoolingAndNoAttention`, or the tuple
            `(last_hidden_state, pooled_output, *extra_encoder_outputs)` when `return_dict` is falsy.
        """
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        embedded = self.embedder(pixel_values, deterministic=deterministic)

        encoder_outputs = self.encoder(
            embedded,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )

        features = encoder_outputs[0]

        # Window and stride both span axes 1 and 2 completely, so pooling collapses them to size 1.
        full_extent = (features.shape[1], features.shape[2])
        pooled = self.pooler(features, window_shape=full_extent, strides=full_extent)
        pooled_output = pooled.transpose(0, 3, 1, 2)

        # Same axis permutation for the feature map that is returned to the caller.
        last_hidden_state = features.transpose(0, 3, 1, 2)

        if return_dict:
            return FlaxBaseModelOutputWithPoolingAndNoAttention(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
            )

        return (last_hidden_state, pooled_output) + encoder_outputs[1:]


# Bare Flax ResNet model: it only selects `FlaxResNetModule` as the module class and attaches the class docstring
# through `add_start_docstrings`; everything else is inherited from `FlaxResNetPreTrainedModel`.
@add_start_docstrings(
    "The bare ResNet model outputting raw features without any specific head on top.",
    RESNET_START_DOCSTRING,
)
class FlaxResNetModel(FlaxResNetPreTrainedModel):
    module_class = FlaxResNetModule


# Docstring fragment (a `Returns:` placeholder plus a usage example) for `FlaxResNetModel.__call__`.
FLAX_VISION_MODEL_DOCSTRING = """
    Returns:

    Examples:

    ```
    >>> from transformers import AutoImageProcessor, FlaxResNetModel
    >>> from PIL import Image
    >>> import requests

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)
    >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    >>> model = FlaxResNetModel.from_pretrained("microsoft/resnet-50")
    >>> inputs = image_processor(images=image, return_tensors="np")
    >>> outputs = model(**inputs)
    >>> last_hidden_states = outputs.last_hidden_state
    ```
"""

# Attach the fragment above to FlaxResNetModel's call docstring.
overwrite_call_docstring(FlaxResNetModel, FLAX_VISION_MODEL_DOCSTRING)
# Register the output type and config class for the return documentation. The closing parenthesis was missing in
# the original text, which left the call unterminated and swallowed the statements that follow it.
append_replace_return_docstrings(
    FlaxResNetModel, output_type=FlaxBaseModelOutputWithPoolingAndNoAttention, config_class=ResNetConfig
)
# 导入所需的库和模块
import jax
import requests

# Docstring fragment for `FlaxResNetForImageClassification`: a `Returns:` section followed by a usage example that
# downloads an image from a URL and prints the predicted class.
# NOTE(review): the lines directly under `Returns:` and `Example:` inside this literal are non-English annotation
# text. They are part of the string, so they presumably end up in the generated docstring — confirm this is intended.
FLAX_VISION_CLASSIF_DOCSTRING = """
    Returns:
        返回一个示例，展示如何使用该模型进行图像分类预测。

    Example:
        展示如何从URL下载图像并使用模型预测图像的分类。

    ```
    >>> from transformers import AutoImageProcessor, FlaxResNetForImageClassification
    >>> from PIL import Image
    >>> import jax
    >>> import requests

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
    >>> model = FlaxResNetForImageClassification.from_pretrained("microsoft/resnet-50")

    >>> inputs = image_processor(images=image, return_tensors="np")
    >>> outputs = model(**inputs)
    >>> logits = outputs.logits

    >>> # model predicts one of the 1000 ImageNet classes
    >>> predicted_class_idx = jax.numpy.argmax(logits, axis=-1)
    >>> print("Predicted class:", model.config.id2label[predicted_class_idx.item()])
    ```
"""

# Attach the fragment above to FlaxResNetForImageClassification via `overwrite_call_docstring`.
overwrite_call_docstring(FlaxResNetForImageClassification, FLAX_VISION_CLASSIF_DOCSTRING)

# Register the output type and config class used for the return documentation of the same class.
append_replace_return_docstrings(
    FlaxResNetForImageClassification, output_type=FlaxImageClassifierOutputWithNoAttention, config_class=ResNetConfig
)

`.\models\resnet\modeling_resnet.py`

# 导入必要的库和模块
from typing import Optional

import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# 导入模型输出相关的类
from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
# 导入预训练模型相关的工具和类
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
# 导入 ResNet 的配置类
from .configuration_resnet import ResNetConfig

# Module-level logger.
logger = logging.get_logger(__name__)

# General docstring: config class name passed to the docstring decorators below.
_CONFIG_FOR_DOC = "ResNetConfig"

# Base docstring: checkpoint and expected output shape used in the generated `ResNetModel` code sample.
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
_EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7]

# Image classification docstring: checkpoint and expected label used in the classification code sample.
_IMAGE_CLASS_CHECKPOINT = "microsoft/resnet-50"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"

# List of ResNet pretrained checkpoints.
RESNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/resnet-50",
    # See all resnet models at https://huggingface.co/models?filter=resnet
]

# Basic ResNet building unit: convolution -> batch normalization -> activation.
class ResNetConvLayer(nn.Module):
    """
    A bias-free 2D convolution followed by batch normalization and an activation.

    Args:
        in_channels: number of input channels of the convolution.
        out_channels: number of output channels of the convolution and of the batch norm.
        kernel_size: convolution kernel size; the padding is `kernel_size // 2`.
        stride: convolution stride.
        activation: key into `ACT2FN`, or `None` to use `nn.Identity()` instead of an activation.
    """

    def __init__(
        self, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1, activation: str = "relu"
    ):
        super().__init__()
        padding = kernel_size // 2
        self.convolution = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.normalization = nn.BatchNorm2d(out_channels)
        if activation is None:
            self.activation = nn.Identity()
        else:
            self.activation = ACT2FN[activation]

    def forward(self, input: Tensor) -> Tensor:
        """Apply convolution, normalization and activation, in that order."""
        convolved = self.convolution(input)
        normalized = self.normalization(convolved)
        return self.activation(normalized)

# ResNet stem: one strided convolution layer followed by max pooling.
class ResNetEmbeddings(nn.Module):
    """
    ResNet Embeddings (stem) composed of a single aggressive convolution.
    """

    def __init__(self, config: ResNetConfig):
        super().__init__()
        # 7x7, stride-2 convolution layer mapping `config.num_channels` to `config.embedding_size` channels.
        self.embedder = ResNetConvLayer(
            config.num_channels,
            config.embedding_size,
            kernel_size=7,
            stride=2,
            activation=config.hidden_act,
        )
        # 3x3 max pooling with stride 2 and padding 1.
        self.pooler = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Remembered so that `forward` can validate its input.
        self.num_channels = config.num_channels

    def forward(self, pixel_values: Tensor) -> Tensor:
        """
        Embed `pixel_values` with the stem convolution and max pooling.

        Raises:
            ValueError: if dimension 1 of `pixel_values` differs from the configured number of channels.
        """
        if pixel_values.shape[1] != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        return self.pooler(self.embedder(pixel_values))
# Classic ResNet bottleneck block: 1x1 reduce -> 3x3 -> 1x1 expand, added to a shortcut branch.
class ResNetBottleNeckLayer(nn.Module):
    """
    A classic ResNet's bottleneck layer composed of a `1x1`, a `3x3` and a `1x1` convolution.

    The first `1x1` convolution reduces the input by a factor of `reduction` in order to make the second `3x3`
    convolution faster. The last `1x1` convolution remaps the reduced features to `out_channels`. If
    `downsample_in_bottleneck` is true, downsample will be in the first layer instead of the second layer.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        stride: stride applied by the downsampling convolution and by the shortcut.
        activation: key into `ACT2FN` for the activation applied after the residual addition.
        reduction: the inner width is `out_channels // reduction`.
        downsample_in_bottleneck: put the stride on the first (`1x1`) instead of the second (`3x3`) convolution.
    """

    # The original text contained two spliced-together implementations separated by a stray `):`. Only the one
    # that honours `downsample_in_bottleneck` is kept here.
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        activation: str = "relu",
        reduction: int = 4,
        downsample_in_bottleneck: bool = False,
    ):
        super().__init__()
        # A projection shortcut is needed whenever the main branch changes the channel count or the resolution.
        should_apply_shortcut = in_channels != out_channels or stride != 1
        reduces_channels = out_channels // reduction
        self.shortcut = (
            ResNetShortCut(in_channels, out_channels, stride=stride) if should_apply_shortcut else nn.Identity()
        )
        self.layer = nn.Sequential(
            # 1x1 convolution reducing the width; carries the stride only if `downsample_in_bottleneck`.
            ResNetConvLayer(
                in_channels, reduces_channels, kernel_size=1, stride=stride if downsample_in_bottleneck else 1
            ),
            # 3x3 convolution (default kernel size); carries the stride otherwise.
            ResNetConvLayer(reduces_channels, reduces_channels, stride=stride if not downsample_in_bottleneck else 1),
            # 1x1 convolution restoring `out_channels`; no activation before the residual addition.
            ResNetConvLayer(reduces_channels, out_channels, kernel_size=1, activation=None),
        )
        self.activation = ACT2FN[activation]

    def forward(self, hidden_state):
        """Return `activation(layer(hidden_state) + shortcut(hidden_state))`."""
        residual = hidden_state
        hidden_state = self.layer(hidden_state)
        residual = self.shortcut(residual)
        hidden_state += residual
        hidden_state = self.activation(hidden_state)
        return hidden_state
# One ResNet stage: `depth` residual layers, the first of which may change channels and stride.
class ResNetStage(nn.Module):
    """
    A ResNet stage composed by stacked layers.
    """

    def __init__(
        self,
        config: ResNetConfig,
        in_channels: int,
        out_channels: int,
        stride: int = 2,
        depth: int = 2,
    ):
        super().__init__()

        # Pick the residual block type; only the bottleneck variant receives `downsample_in_bottleneck`.
        if config.layer_type == "bottleneck":
            block_cls = ResNetBottleNeckLayer
            head_block = block_cls(
                in_channels,
                out_channels,
                stride=stride,
                activation=config.hidden_act,
                downsample_in_bottleneck=config.downsample_in_bottleneck,
            )
        else:
            block_cls = ResNetBasicLayer
            head_block = block_cls(in_channels, out_channels, stride=stride, activation=config.hidden_act)

        # The remaining `depth - 1` blocks keep the channel count and use the block's default stride.
        tail_blocks = [
            block_cls(out_channels, out_channels, activation=config.hidden_act) for _ in range(depth - 1)
        ]
        self.layers = nn.Sequential(head_block, *tail_blocks)

    def forward(self, input: Tensor) -> Tensor:
        """Feed `input` through every layer of the stage in order."""
        current = input
        for block in self.layers:
            current = block(current)
        return current


class ResNetEncoder(nn.Module):
    """
    ResNet encoder composed of several `ResNetStage` modules applied in sequence.
    """

    def __init__(self, config: ResNetConfig):
        super().__init__()

        # Based on `downsample_in_first_stage`, the first layer of the first stage may or may not downsample.
        first_stride = 2 if config.downsample_in_first_stage else 1
        stage_list = [
            ResNetStage(
                config,
                config.embedding_size,
                config.hidden_sizes[0],
                stride=first_stride,
                depth=config.depths[0],
            )
        ]

        # Each later stage maps one hidden size to the next one and uses the default stride of `ResNetStage`.
        channel_pairs = zip(config.hidden_sizes, config.hidden_sizes[1:])
        for (in_channels, out_channels), depth in zip(channel_pairs, config.depths[1:]):
            stage_list.append(ResNetStage(config, in_channels, out_channels, depth=depth))

        self.stages = nn.ModuleList(stage_list)

    def forward(
        self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
    ) -> BaseModelOutputWithNoAttention:
        """
        Run all stages.

        When `output_hidden_states` is truthy, the input of every stage plus the final output are collected in a
        tuple. With `return_dict` falsy, a tuple of the non-`None` results is returned instead of an output object.
        """
        collected = [] if output_hidden_states else None

        for stage_module in self.stages:
            if output_hidden_states:
                # Record the state entering this stage.
                collected.append(hidden_state)
            hidden_state = stage_module(hidden_state)

        if output_hidden_states:
            # The output of the last stage closes the list.
            collected.append(hidden_state)

        hidden_states = tuple(collected) if collected is not None else None

        if return_dict:
            return BaseModelOutputWithNoAttention(
                last_hidden_state=hidden_state,
                hidden_states=hidden_states,
            )

        return tuple(item for item in [hidden_state, hidden_states] if item is not None)


class ResNetPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class, base-model prefix and main input name for this model family.
    config_class = ResNetConfig
    base_model_prefix = "resnet"
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """
        Initialize the weights of a single module.

        `nn.Conv2d` weights get Kaiming-normal initialization (`fan_out`, `relu`); `nn.BatchNorm2d` and
        `nn.GroupNorm` get weight 1 and bias 0. Any other module is left untouched.
        """
        if isinstance(module, nn.Conv2d):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            return

        norm_types = (nn.BatchNorm2d, nn.GroupNorm)
        if isinstance(module, norm_types):
            nn.init.constant_(module.weight, 1)
            nn.init.constant_(module.bias, 0)
# Shared class-level docstring text: states that the model is a regular `torch.nn.Module` subclass and documents
# the `config` parameter. It is passed to `add_start_docstrings` on the model classes below.
RESNET_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ResNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared `forward` argument documentation (`pixel_values`, `output_hidden_states`, `return_dict`). It is passed to
# `add_start_docstrings_to_model_forward` on the `forward` methods below.
RESNET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`ConvNextImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Bare ResNet model (stem -> encoder -> global average pooling). The class docstring is supplied by the
# `add_start_docstrings` decorator from the summary line and RESNET_START_DOCSTRING.
@add_start_docstrings(
    "The bare ResNet model outputting raw features without any specific head on top.",
    RESNET_START_DOCSTRING,
)
class ResNetModel(ResNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.embedder = ResNetEmbeddings(config)
        self.encoder = ResNetEncoder(config)
        # Adaptive average pooling down to a 1x1 spatial map.
        self.pooler = nn.AdaptiveAvgPool2d((1, 1))
        # Initialize weights and apply final processing
        self.post_init()

    # The decorators below build the `forward` docstring from RESNET_INPUTS_DOCSTRING and a generated code sample.
    @add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
    ) -> BaseModelOutputWithPoolingAndNoAttention:
        # Unset flags fall back to the values stored in the configuration.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Stem, then the stack of stages.
        embedding_output = self.embedder(pixel_values)
        encoder_outputs = self.encoder(
            embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict
        )

        # Index 0 is the last hidden state for both the tuple and the output-object form.
        last_hidden_state = encoder_outputs[0]
        pooled_output = self.pooler(last_hidden_state)

        if return_dict:
            return BaseModelOutputWithPoolingAndNoAttention(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
            )

        # Tuple form: append whatever else the encoder returned (the hidden states, if requested).
        return (last_hidden_state, pooled_output) + encoder_outputs[1:]
    """
    ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """
    # 使用 ResNetPreTrainedModel 作为基类，构建一个带有图像分类头部的 ResNet 模型，用于 ImageNet 等任务
    class ResNetForImageClassification(ResNetPreTrainedModel):
        def __init__(self, config):
            super().__init__(config)
            # 设置分类标签数量
            self.num_labels = config.num_labels
            # 初始化 ResNet 模型
            self.resnet = ResNetModel(config)
            # 分类头部，包括展平层和线性层，根据配置决定是否使用标签数量进行分类
            self.classifier = nn.Sequential(
                nn.Flatten(),
                nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity(),
            )
            # 初始化权重并应用最终处理
            self.post_init()

        @add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
        @add_code_sample_docstrings(
            checkpoint=_IMAGE_CLASS_CHECKPOINT,
            output_type=ImageClassifierOutputWithNoAttention,
            config_class=_CONFIG_FOR_DOC,
            expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
        )
        # 前向传播函数，接收像素值、标签、是否输出隐藏状态和是否返回字典作为参数
        def forward(
            self,
            pixel_values: Optional[torch.FloatTensor] = None,
            labels: Optional[torch.LongTensor] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> ImageClassifierOutputWithNoAttention:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 如果 return_dict 不为 None，则使用该值；否则使用 self.config.use_return_dict 的设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用 ResNet 模型来计算输出，根据 return_dict 是否为 True 返回不同的输出
        outputs = self.resnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        # 如果 return_dict 为 False，则使用 outputs 的第二个元素作为汇集的输出；否则使用 outputs 的 pooler_output
        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        # 将汇集的输出传入分类器，得到 logits
        logits = self.classifier(pooled_output)

        # 初始化损失为 None
        loss = None

        # 如果 labels 不为 None，则开始计算损失
        if labels is not None:
            # 如果问题类型尚未确定，则根据条件确定问题类型
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"
            
            # 根据问题类型选择合适的损失函数和计算方式
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # 对于单一标签的回归问题，计算损失
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    # 对于多标签的回归问题，计算损失
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                # 对于单标签分类问题，使用交叉熵损失函数
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                # 对于多标签分类问题，使用带 logits 的二元交叉熵损失函数
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        # 如果 return_dict 为 False，则返回 logits 和额外的 hidden states；否则返回损失和 logits
        if not return_dict:
            output = (logits,) + outputs[2:]
            return (loss,) + output if loss is not None else output
        
        # 返回 ImageClassifierOutputWithNoAttention 对象，其中包括损失、logits 和 hidden states
        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
@add_start_docstrings(
    """
    ResNet backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    RESNET_START_DOCSTRING,
)
class ResNetBackbone(ResNetPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        # Backbone-specific initialization (provides `stage_names` / `out_features` used in `forward`).
        super()._init_backbone(config)

        # Channel count per stage: the stem output first, then every encoder stage.
        self.num_features = [config.embedding_size] + config.hidden_sizes
        self.embedder = ResNetEmbeddings(config)
        self.encoder = ResNetEncoder(config)

        # initialize weights and apply final processing
        self.post_init()

    # NOTE: the docstring below must keep an empty `Returns:` line; `replace_return_docstrings` uses it as the
    # placeholder for the generated return documentation.
    @add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/resnet-50", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 2048, 7, 7]
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output = self.embedder(pixel_values)

        # Hidden states are always requested from the encoder because the feature maps are selected from them;
        # `output_hidden_states` only controls whether they are also returned to the caller.
        outputs = self.encoder(embedding_output, output_hidden_states=True, return_dict=True)

        hidden_states = outputs.hidden_states

        # Keep the hidden state of every stage whose name was requested in `out_features`.
        feature_maps = tuple(
            hidden_states[idx] for idx, stage in enumerate(self.stage_names) if stage in self.out_features
        )

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )

`.\models\resnet\modeling_tf_resnet.py`

# coding=utf-8
# Copyright 2022 Microsoft Research, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TensorFlow ResNet model."""

from typing import Optional, Tuple, Union

import tensorflow as tf

from ...activations_tf import ACT2FN
from ...modeling_tf_outputs import (
    TFBaseModelOutputWithNoAttention,
    TFBaseModelOutputWithPoolingAndNoAttention,
    TFImageClassifierOutputWithNoAttention,
)
from ...modeling_tf_utils import (
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_resnet import ResNetConfig


# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# General docstring
# Passed as `config_class` to `add_code_sample_docstrings` on the model `call` methods below.
_CONFIG_FOR_DOC = "ResNetConfig"

# Base docstring
# Checkpoint and expected output used for the generated `TFResNetModel.call` code sample.
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
_EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7]

# Image classification docstring
# Checkpoint and expected output used for the generated `TFResNetForImageClassification.call` code sample.
_IMAGE_CLASS_CHECKPOINT = "microsoft/resnet-50"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"

TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/resnet-50",
    # See all resnet models at https://huggingface.co/models?filter=resnet
]


class TFResNetConvLayer(keras.layers.Layer):
    """
    Convolution, batch normalization and activation applied in sequence.

    The input is padded explicitly with `tf.pad` by `kernel_size // 2` on each side of the two middle axes and then
    convolved with `padding="valid"`, to match the padding done by the PyTorch Conv2D model.

    Args:
        in_channels: Channel count used to build the convolution.
        out_channels: Number of convolution filters; also the channel count used to build the normalization.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        activation: Key into `ACT2FN`. `None` selects a linear Keras activation instead.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        activation: str = "relu",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Channel counts are remembered so `build` can create the sub-layer weights without seeing real inputs.
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Amount of explicit padding applied in `convolution` ahead of the "valid" convolution.
        self.pad_value = kernel_size // 2
        self.conv = keras.layers.Conv2D(
            out_channels,
            kernel_size=kernel_size,
            strides=stride,
            padding="valid",
            use_bias=False,
            name="convolution",
        )
        self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
        if activation is None:
            self.activation = keras.layers.Activation("linear")
        else:
            self.activation = ACT2FN[activation]

    def convolution(self, hidden_state: tf.Tensor) -> tf.Tensor:
        """Pad the two middle axes by `pad_value` on each side, then apply the convolution."""
        # Pad to match that done in the PyTorch Conv2D model.
        spatial_pad = (self.pad_value, self.pad_value)
        padded = tf.pad(hidden_state, [(0, 0), spatial_pad, spatial_pad, (0, 0)])
        return self.conv(padded)

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run padded convolution, then normalization (honouring `training`), then the activation."""
        convolved = self.convolution(hidden_state)
        normalized = self.normalization(convolved, training=training)
        return self.activation(normalized)

    def build(self, input_shape=None):
        """Build the convolution and normalization sub-layers under their own name scopes (runs once)."""
        if self.built:
            return
        self.built = True

        conv = getattr(self, "conv", None)
        if conv is not None:
            with tf.name_scope(conv.name):
                conv.build([None, None, None, self.in_channels])

        normalization = getattr(self, "normalization", None)
        if normalization is not None:
            with tf.name_scope(normalization.name):
                normalization.build([None, None, None, self.out_channels])
class TFResNetEmbeddings(keras.layers.Layer):
    """
    ResNet Embeddings (stem) composed of a single aggressive convolution.

    The stem is a 7x7, stride-2 `TFResNetConvLayer` followed by explicit padding of 1 on each side of the two middle
    axes and a 3x3, stride-2 max pooling.
    """

    def __init__(self, config: ResNetConfig, **kwargs) -> None:
        super().__init__(**kwargs)
        # Convolutional stem mapping `config.num_channels` input channels to `config.embedding_size`.
        self.embedder = TFResNetConvLayer(
            config.num_channels,
            config.embedding_size,
            kernel_size=7,
            stride=2,
            activation=config.hidden_act,
            name="embedder",
        )
        # "valid" pooling; the padding is applied by hand in `call`.
        self.pooler = keras.layers.MaxPool2D(pool_size=3, strides=2, padding="valid", name="pooler")
        self.num_channels = config.num_channels

    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
        """
        Embed `pixel_values` (channels on the last axis).

        Raises:
            ValueError: In eager mode only, when the last dimension of `pixel_values` differs from
                `config.num_channels`.
        """
        _, _, _, num_channels = shape_list(pixel_values)
        # The check is skipped outside eager execution.
        if tf.executing_eagerly() and num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # Forward `training` explicitly: the embedder contains a BatchNormalization layer, and every other
        # block in this file passes the flag down rather than relying on implicit propagation.
        hidden_state = self.embedder(pixel_values, training=training)
        hidden_state = tf.pad(hidden_state, [[0, 0], [1, 1], [1, 1], [0, 0]])
        hidden_state = self.pooler(hidden_state)
        return hidden_state

    def build(self, input_shape=None):
        """Build the embedder and pooler sub-layers under their own name scopes (runs once)."""
        if self.built:
            return
        self.built = True
        if getattr(self, "embedder", None) is not None:
            with tf.name_scope(self.embedder.name):
                self.embedder.build(None)
        if getattr(self, "pooler", None) is not None:
            with tf.name_scope(self.pooler.name):
                self.pooler.build(None)


class TFResNetShortCut(keras.layers.Layer):
    """
    ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to
    downsample the input using `stride=2`.
    """

    def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwargs) -> None:
        super().__init__(**kwargs)
        # Channel counts are remembered so `build` can create the sub-layer weights without seeing real inputs.
        self.in_channels = in_channels
        self.out_channels = out_channels
        # 1x1 projection; `stride` optionally downsamples at the same time.
        self.convolution = keras.layers.Conv2D(
            out_channels,
            kernel_size=1,
            strides=stride,
            use_bias=False,
            name="convolution",
        )
        # Use same default momentum and epsilon as PyTorch equivalent
        self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Project `x` with the convolution, then batch-normalize the result."""
        projected = self.convolution(x)
        return self.normalization(projected, training=training)

    def build(self, input_shape=None):
        """Build the convolution and normalization sub-layers under their own name scopes (runs once)."""
        if self.built:
            return
        self.built = True

        convolution = getattr(self, "convolution", None)
        if convolution is not None:
            with tf.name_scope(convolution.name):
                convolution.build([None, None, None, self.in_channels])

        normalization = getattr(self, "normalization", None)
        if normalization is not None:
            with tf.name_scope(normalization.name):
                normalization.build([None, None, None, self.out_channels])
    # 定义 TFResNetBasicLayer 类，表示经典 ResNet 的基本残差层，由两个 3x3 卷积组成
    class TFResNetBasicLayer(keras.layers.Layer):
        """
        A classic ResNet's residual layer composed by two `3x3` convolutions.
        """

        def __init__(
            self, in_channels: int, out_channels: int, stride: int = 1, activation: str = "relu", **kwargs
        ) -> None:
            super().__init__(**kwargs)
            # 确定是否应用快捷连接（shortcut），当输入通道数不等于输出通道数或步长不为 1 时应用
            should_apply_shortcut = in_channels != out_channels or stride != 1
            # 第一个 3x3 卷积层，初始化为 TFResNetConvLayer 类的实例
            self.conv1 = TFResNetConvLayer(in_channels, out_channels, stride=stride, name="layer.0")
            # 第二个 3x3 卷积层，初始化为 TFResNetConvLayer 类的实例，激活函数设为 None
            self.conv2 = TFResNetConvLayer(out_channels, out_channels, activation=None, name="layer.1")
            # 快捷连接层，如果需要应用快捷连接，则初始化为 TFResNetShortCut 类的实例；否则使用线性激活函数
            self.shortcut = (
                TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
                if should_apply_shortcut
                else keras.layers.Activation("linear", name="shortcut")
            )
            # 激活函数，根据 activation 参数选择对应的激活函数
            self.activation = ACT2FN[activation]

        def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
            # 保存输入的隐藏状态作为残差（residual）
            residual = hidden_state
            # 经过第一层卷积
            hidden_state = self.conv1(hidden_state, training=training)
            # 经过第二层卷积
            hidden_state = self.conv2(hidden_state, training=training)
            # 经过快捷连接层
            residual = self.shortcut(residual, training=training)
            # 将残差与卷积结果相加
            hidden_state += residual
            # 经过激活函数
            hidden_state = self.activation(hidden_state)
            # 返回处理后的隐藏状态
            return hidden_state

        def build(self, input_shape=None):
            # 如果已经建立，则直接返回
            if self.built:
                return
            # 标记为已建立
            self.built = True
            # 构建第一个卷积层 conv1
            if getattr(self, "conv1", None) is not None:
                with tf.name_scope(self.conv1.name):
                    self.conv1.build(None)
            # 构建第二个卷积层 conv2
            if getattr(self, "conv2", None) is not None:
                with tf.name_scope(self.conv2.name):
                    self.conv2.build(None)
            # 构建快捷连接层 shortcut
            if getattr(self, "shortcut", None) is not None:
                with tf.name_scope(self.shortcut.name):
                    self.shortcut.build(None)


    # 定义 TFResNetBottleNeckLayer 类，表示经典 ResNet 的瓶颈残差层，由三个 3x3 卷积组成
    class TFResNetBottleNeckLayer(keras.layers.Layer):
        """
        A classic ResNet's bottleneck layer composed by three `3x3` convolutions.

        The first `1x1` convolution reduces the input by a factor of `reduction` in order to make the second `3x3`
        convolution faster. The last `1x1` convolution remaps the reduced features to `out_channels`.
        """

        def __init__(
            self,
            in_channels: int,
            out_channels: int,
            stride: int = 1,
            activation: str = "relu",
            reduction: int = 4,
            **kwargs,
    ) -> None:
        # 调用父类的初始化方法，传递所有参数
        super().__init__(**kwargs)
        # 判断是否应用快捷方式，根据输入通道数、输出通道数和步长来确定
        should_apply_shortcut = in_channels != out_channels or stride != 1
        # 计算减少的通道数，用于第一个卷积层的输出通道数
        reduces_channels = out_channels // reduction
        # 创建第一个卷积层，将输入通道数转换为减少的通道数
        self.conv0 = TFResNetConvLayer(in_channels, reduces_channels, kernel_size=1, name="layer.0")
        # 创建第二个卷积层，将减少的通道数转换为相同的通道数，应用给定的步长
        self.conv1 = TFResNetConvLayer(reduces_channels, reduces_channels, stride=stride, name="layer.1")
        # 创建第三个卷积层，将通道数转换为输出通道数，应用 1x1 的卷积核
        self.conv2 = TFResNetConvLayer(reduces_channels, out_channels, kernel_size=1, activation=None, name="layer.2")
        # 创建快捷连接层，如果应用快捷方式则使用 TFResNetShortCut 类，否则使用线性激活
        self.shortcut = (
            TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
            if should_apply_shortcut
            else keras.layers.Activation("linear", name="shortcut")
        )
        # 选择激活函数，根据给定的激活函数名称从预定义字典中获取对应的函数
        self.activation = ACT2FN[activation]

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        # 将输入状态保存为残差
        residual = hidden_state
        # 通过第一层卷积层
        hidden_state = self.conv0(hidden_state, training=training)
        # 通过第二层卷积层
        hidden_state = self.conv1(hidden_state, training=training)
        # 通过第三层卷积层
        hidden_state = self.conv2(hidden_state, training=training)
        # 应用快捷连接，并传入训练状态
        residual = self.shortcut(residual, training=training)
        # 将残差与卷积结果相加
        hidden_state += residual
        # 应用激活函数到加和的结果
        hidden_state = self.activation(hidden_state)
        # 返回最终的隐藏状态
        return hidden_state

    def build(self, input_shape=None):
        # 如果模型已经构建，则直接返回
        if self.built:
            return
        # 标记模型为已构建
        self.built = True
        # 如果存在 conv0 属性，则构建 conv0
        if getattr(self, "conv0", None) is not None:
            with tf.name_scope(self.conv0.name):
                self.conv0.build(None)
        # 如果存在 conv1 属性，则构建 conv1
        if getattr(self, "conv1", None) is not None:
            with tf.name_scope(self.conv1.name):
                self.conv1.build(None)
        # 如果存在 conv2 属性，则构建 conv2
        if getattr(self, "conv2", None) is not None:
            with tf.name_scope(self.conv2.name):
                self.conv2.build(None)
        # 如果存在 shortcut 属性，则构建 shortcut
        if getattr(self, "shortcut", None) is not None:
            with tf.name_scope(self.shortcut.name):
                self.shortcut.build(None)
class TFResNetStage(keras.layers.Layer):
    """
    A ResNet stage composed of stacked layers.

    Args:
        config: Supplies `layer_type` (bottleneck vs. basic block) and `hidden_act`.
        in_channels: Channel count fed to the first layer.
        out_channels: Channel count produced by every layer of the stage.
        stride: Stride given to the first layer only; the remaining layers use their default stride.
        depth: Total number of layers in the stage.
    """

    def __init__(
        self, config: ResNetConfig, in_channels: int, out_channels: int, stride: int = 2, depth: int = 2, **kwargs
    ) -> None:
        super().__init__(**kwargs)

        # Pick the residual block implementation once for the whole stage.
        if config.layer_type == "bottleneck":
            block_cls = TFResNetBottleNeckLayer
        else:
            block_cls = TFResNetBasicLayer

        # Downsampling is done in the first layer with the given stride.
        blocks = [block_cls(in_channels, out_channels, stride=stride, activation=config.hidden_act, name="layers.0")]
        for index in range(1, depth):
            blocks.append(
                block_cls(out_channels, out_channels, activation=config.hidden_act, name=f"layers.{index}")
            )
        self.stage_layers = blocks

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Feed `hidden_state` through every layer of the stage in order."""
        for block in self.stage_layers:
            hidden_state = block(hidden_state, training=training)
        return hidden_state

    def build(self, input_shape=None):
        """Build every layer of the stage under its own name scope (runs once)."""
        if self.built:
            return
        self.built = True
        blocks = getattr(self, "stage_layers", None)
        if blocks is None:
            return
        for block in blocks:
            with tf.name_scope(block.name):
                block.build(None)


class TFResNetEncoder(keras.layers.Layer):
    """Stack of `TFResNetStage`s sized from `config.hidden_sizes` and `config.depths`."""

    def __init__(self, config: ResNetConfig, **kwargs) -> None:
        super().__init__(**kwargs)
        # The first stage consumes the embedding output; it only downsamples when the config asks for it.
        first_stride = 2 if config.downsample_in_first_stage else 1
        stages = [
            TFResNetStage(
                config,
                config.embedding_size,
                config.hidden_sizes[0],
                stride=first_stride,
                depth=config.depths[0],
                name="stages.0",
            )
        ]
        # Every later stage maps one hidden size to the next and keeps the stage's default stride.
        channel_plan = zip(config.hidden_sizes, config.hidden_sizes[1:], config.depths[1:])
        for stage_index, (in_channels, out_channels, depth) in enumerate(channel_plan, start=1):
            stages.append(
                TFResNetStage(config, in_channels, out_channels, depth=depth, name=f"stages.{stage_index}")
            )
        self.stages = stages

    def call(
        self,
        hidden_state: tf.Tensor,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        training: bool = False,
    ) -> TFBaseModelOutputWithNoAttention:
        """
        Run `hidden_state` through all stages.

        When `output_hidden_states` is set, the input of every stage plus the final output are collected. With
        `return_dict=False` a plain tuple of the non-`None` values is returned instead of the output object.
        """
        collected = () if output_hidden_states else None

        for stage_module in self.stages:
            # Record the state entering the stage, so the first entry is the encoder input itself.
            if output_hidden_states:
                collected += (hidden_state,)
            hidden_state = stage_module(hidden_state, training=training)

        if output_hidden_states:
            collected += (hidden_state,)

        if return_dict:
            return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=collected)

        return tuple(v for v in [hidden_state, collected] if v is not None)

    def build(self, input_shape=None):
        """Build every stage under its own name scope (runs once)."""
        if self.built:
            return
        self.built = True
        stages = getattr(self, "stages", None)
        if stages is None:
            return
        for stage in stages:
            with tf.name_scope(stage.name):
                stage.build(None)
@keras_serializable
class TFResNetMainLayer(keras.layers.Layer):
    """Embedder, encoder and global average pooling wired together; takes NCHW input and returns NCHW output."""

    # Configuration class associated with this layer.
    config_class = ResNetConfig

    def __init__(self, config: ResNetConfig, **kwargs) -> None:
        super().__init__(**kwargs)
        self.config = config
        self.embedder = TFResNetEmbeddings(config, name="embedder")
        self.encoder = TFResNetEncoder(config, name="encoder")
        # `keepdims=True` keeps the pooled output 4-D so it can be transposed like the feature maps.
        self.pooler = keras.layers.GlobalAveragePooling2D(keepdims=True)

    @unpack_inputs
    def call(
        self,
        pixel_values: tf.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPoolingAndNoAttention]:
        # Fall back to the configuration for any flag the caller left unset.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # TF 2.0 image layers can't use NCHW format when running on CPU.
        # We transpose to NHWC format and then transpose back after the full forward pass.
        # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
        pixel_values = tf.transpose(pixel_values, perm=[0, 2, 3, 1])
        embedding_output = self.embedder(pixel_values, training=training)

        encoder_outputs = self.encoder(
            embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict, training=training
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = self.pooler(last_hidden_state)

        # Transpose all the outputs to the NCHW format
        # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
        to_nchw = (0, 3, 1, 2)
        last_hidden_state = tf.transpose(last_hidden_state, to_nchw)
        pooled_output = tf.transpose(pooled_output, to_nchw)
        hidden_states = ()
        # Everything after the first encoder output is treated as a collection of feature maps to transpose.
        for state_group in encoder_outputs[1:]:
            hidden_states += tuple(tf.transpose(state, to_nchw) for state in state_group)

        if not return_dict:
            return (last_hidden_state, pooled_output) + hidden_states

        return TFBaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=hidden_states if output_hidden_states else None,
        )

    def build(self, input_shape=None):
        """Build the embedder and the encoder under their own name scopes (runs once)."""
        if self.built:
            return
        self.built = True

        embedder = getattr(self, "embedder", None)
        if embedder is not None:
            with tf.name_scope(embedder.name):
                embedder.build(None)

        encoder = getattr(self, "encoder", None)
        if encoder is not None:
            with tf.name_scope(encoder.name):
                encoder.build(None)
# Bare ResNet model: a thin wrapper that delegates everything to `TFResNetMainLayer`.
@add_start_docstrings(
    "The bare ResNet model outputting raw features without any specific head on top.",
    RESNET_START_DOCSTRING,
)
class TFResNetModel(TFResNetPreTrainedModel):
    def __init__(self, config: ResNetConfig, **kwargs) -> None:
        super().__init__(config, **kwargs)
        # The main layer holds the whole network.
        self.resnet = TFResNetMainLayer(config=config, name="resnet")

    @add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    @unpack_inputs
    def call(
        self,
        pixel_values: tf.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPoolingAndNoAttention]:
        # Fall back to the configuration for any flag the caller left unset.
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Delegate to the main layer and hand its result back unchanged.
        return self.resnet(
            pixel_values=pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

    def build(self, input_shape=None):
        # Build the main layer under its own name scope; a second call is a no-op.
        if self.built:
            return
        self.built = True
        resnet = getattr(self, "resnet", None)
        if resnet is None:
            return
        with tf.name_scope(resnet.name):
            resnet.build(None)


# ResNet with a classification head (flatten + dense) applied to the pooled features.
@add_start_docstrings(
    """
    ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    RESNET_START_DOCSTRING,
)
class TFResNetForImageClassification(TFResNetPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: ResNetConfig, **kwargs) -> None:
        super().__init__(config, **kwargs)
        self.num_labels = config.num_labels
        self.resnet = TFResNetMainLayer(config, name="resnet")
        # classification head: a dense projection, or a linear pass-through when there are no labels
        if config.num_labels > 0:
            self.classifier_layer = keras.layers.Dense(config.num_labels, name="classifier.1")
        else:
            self.classifier_layer = keras.layers.Activation("linear", name="classifier.1")
        self.config = config

    def classifier(self, x: tf.Tensor) -> tf.Tensor:
        # Flatten the pooled features before handing them to the head.
        flattened = keras.layers.Flatten()(x)
        return self.classifier_layer(flattened)

    @add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=TFImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    @unpack_inputs
    def call(
        self,
        pixel_values: tf.Tensor = None,
        labels: tf.Tensor = None,
        output_hidden_states: bool = None,
        return_dict: bool = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFImageClassifierOutputWithNoAttention]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.resnet(
            pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict, training=training
        )

        # The pooled features sit at index 1 of the tuple output.
        pooled_output = outputs.pooler_output if return_dict else outputs[1]
        logits = self.classifier(pooled_output)

        # The loss is only computed when labels are supplied.
        loss = self.hf_compute_loss(labels, logits) if labels is not None else None

        if return_dict:
            return TFImageClassifierOutputWithNoAttention(
                loss=loss, logits=logits, hidden_states=outputs.hidden_states
            )

        # Tuple output: logits followed by whatever the backbone returned after the pooled output.
        output = (logits,) + outputs[2:]
        if loss is None:
            return output
        return (loss,) + output

    def build(self, input_shape=None):
        # Build the backbone and the head under their own name scopes; a second call is a no-op.
        if self.built:
            return
        self.built = True

        resnet = getattr(self, "resnet", None)
        if resnet is not None:
            with tf.name_scope(resnet.name):
                resnet.build(None)

        classifier_layer = getattr(self, "classifier_layer", None)
        if classifier_layer is not None:
            with tf.name_scope(classifier_layer.name):
                classifier_layer.build([None, None, self.config.hidden_sizes[-1]])

`.\models\resnet\__init__.py`

# 引入类型检查模块
from typing import TYPE_CHECKING

# 从工具模块中引入必要的异常和工具函数
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_torch_available,
)

# Maps each submodule name to the public names it provides; handed to `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_resnet": ["RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ResNetConfig", "ResNetOnnxConfig"]
}

# Register the PyTorch models only when torch is available. The exception is raised and caught right here,
# so a missing backend simply skips the `else` branch.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the PyTorch model names.
    _import_structure["modeling_resnet"] = [
        "RESNET_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ResNetForImageClassification",
        "ResNetModel",
        "ResNetPreTrainedModel",
        "ResNetBackbone",
    ]

# Register the TensorFlow models only when TensorFlow is available.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # TensorFlow is available: expose the TF model names.
    _import_structure["modeling_tf_resnet"] = [
        "TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFResNetForImageClassification",
        "TFResNetModel",
        "TFResNetPreTrainedModel",
    ]

# Register the Flax models only when Flax is available.
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Flax is available: expose the Flax model names.
    _import_structure["modeling_flax_resnet"] = [
        "FlaxResNetForImageClassification",
        "FlaxResNetModel",
        "FlaxResNetPreTrainedModel",
    ]

# Under static type checking, import the real names eagerly so that analysers can resolve them.
# Each backend-specific import is guarded the same way as the `_import_structure` entries above.
if TYPE_CHECKING:
    from .configuration_resnet import RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ResNetConfig, ResNetOnnxConfig

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # PyTorch models
        from .modeling_resnet import (
            RESNET_PRETRAINED_MODEL_ARCHIVE_LIST,
            ResNetBackbone,
            ResNetForImageClassification,
            ResNetModel,
            ResNetPreTrainedModel,
        )

    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # TensorFlow models
        from .modeling_tf_resnet import (
            TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFResNetForImageClassification,
            TFResNetModel,
            TFResNetPreTrainedModel,
        )

    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Flax models; the import must sit in the `else` branch so it is skipped when Flax is unavailable.
        from .modeling_flax_resnet import FlaxResNetForImageClassification, FlaxResNetModel, FlaxResNetPreTrainedModel
else:
    # At runtime (not type checking), replace this module in `sys.modules` with a `_LazyModule` built from
    # `_import_structure`.
    # NOTE(review): presumably `_LazyModule` imports each submodule on first attribute access - confirm in utils.
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

`.\models\roberta\configuration_roberta.py`

# 引入必要的模块和类
from collections import OrderedDict  # 导入 OrderedDict 类，用于有序字典操作
from typing import Mapping  # 导入 Mapping 类型提示，用于类型标注

# 从 transformers 包中导入预训练配置类和其他相关功能
from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...onnx import OnnxConfig  # 导入 ONNX 配置类
from ...utils import logging  # 导入日志工具

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# Maps each pretrained checkpoint name to the URL of its `config.json`.
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/config.json",
    "FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/config.json",
    "FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/config.json",
    "distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/config.json",
    "openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/config.json",
    "openai-community/roberta-large-openai-detector": "https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/config.json",
}

# Configuration class for RoBERTa; extends PretrainedConfig.
class RobertaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`RobertaModel`] or a [`TFRobertaModel`]. It is
    used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa
    [FacebookAI/roberta-base](https://huggingface.co/FacebookAI/roberta-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Examples:

    ```
    >>> from transformers import RobertaConfig, RobertaModel

    >>> # Initializing a RoBERTa configuration
    >>> configuration = RobertaConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = RobertaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # Identifier string for this model family.
    model_type = "roberta"

    def __init__(
        self,
        vocab_size=50265,  # vocabulary size
        hidden_size=768,  # hidden layer size
        num_hidden_layers=12,  # number of hidden layers
        num_attention_heads=12,  # number of attention heads
        intermediate_size=3072,  # intermediate layer size
        hidden_act="gelu",  # hidden-layer activation function
        hidden_dropout_prob=0.1,  # dropout probability for hidden layers
        attention_probs_dropout_prob=0.1,  # dropout probability for attention probabilities
        max_position_embeddings=512,  # maximum position-embedding size
        type_vocab_size=2,  # token-type vocabulary size
        initializer_range=0.02,  # initializer range
        layer_norm_eps=1e-12,  # layer-normalization epsilon
        pad_token_id=1,  # padding token id
        bos_token_id=0,  # beginning-of-sequence token id
        eos_token_id=2,  # end-of-sequence token id
        position_embedding_type="absolute",  # position-embedding type
        use_cache=True,  # whether to use the cache
        classifier_dropout=None,  # classifier dropout; optional
        **kwargs,  # any further keyword arguments
    ):
        # The special-token ids and all extra keyword arguments are handed to
        # the base-class initializer.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # Every remaining argument is stored unchanged under its own name.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
# ONNX export configuration for RoBERTa; extends OnnxConfig.
class RobertaOnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return an ordered mapping from each model input name to its dynamic axes.

        The axes mapping is `{0: "batch", 1: "choice", 2: "sequence"}` when
        `self.task` equals `"multiple-choice"`, and `{0: "batch", 1: "sequence"}`
        otherwise. The same axes mapping is used for both `input_ids` and
        `attention_mask`.
        """
        dynamic_axis = (
            {0: "batch", 1: "choice", 2: "sequence"}
            if self.task == "multiple-choice"
            else {0: "batch", 1: "sequence"}
        )

        # Both inputs share one axes mapping; order is input_ids, then attention_mask.
        return OrderedDict((input_name, dynamic_axis) for input_name in ("input_ids", "attention_mask"))

Transformers-源码解析-九十四-

Transformers 源码解析（九十四）

.\models\rembert\modeling_tf_rembert.py

.\models\rembert\tokenization_rembert.py

.\models\rembert\tokenization_rembert_fast.py

.\models\rembert\__init__.py

.\models\resnet\configuration_resnet.py

.\models\resnet\convert_resnet_to_pytorch.py

.\models\resnet\modeling_flax_resnet.py

.\models\resnet\modeling_resnet.py

.\models\resnet\modeling_tf_resnet.py

.\models\resnet\__init__.py

.\models\roberta\configuration_roberta.py

`.\models\rembert\modeling_tf_rembert.py`

`.\models\rembert\tokenization_rembert.py`

`.\models\rembert\tokenization_rembert_fast.py`

`.\models\rembert\__init__.py`

`.\models\resnet\configuration_resnet.py`

`.\models\resnet\convert_resnet_to_pytorch.py`

`.\models\resnet\modeling_flax_resnet.py`

`.\models\resnet\modeling_resnet.py`

`.\models\resnet\modeling_tf_resnet.py`

`.\models\resnet\__init__.py`

`.\models\roberta\configuration_roberta.py`