Transformers Source Code Analysis (112)
.\models\timesformer\modeling_timesformer.py
""" PyTorch TimeSformer model."""
import collections
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_timesformer import TimesformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TimesformerConfig"
_CHECKPOINT_FOR_DOC = "facebook/timesformer"
TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/timesformer-base-finetuned-k400",
]
class TimesformerPatchEmbeddings(nn.Module):
"""Image to Patch Embedding"""
def __init__(self, config):
super().__init__()
image_size = config.image_size
patch_size = config.patch_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_patches = num_patches
self.projection = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values):
batch_size, num_frames, num_channels, height, width = pixel_values.shape
pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width)
embeddings = self.projection(pixel_values)
patch_width = embeddings.size(-1)
embeddings = embeddings.flatten(2).transpose(1, 2)
return embeddings, num_frames, patch_width
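To make the shape bookkeeping above concrete, here is a minimal sketch (assumed 224x224 frames, 16x16 patches and hidden size 768, not values read from a real config): each frame is projected independently by the Conv2d, giving `(batch_size * num_frames, num_patches, hidden_size)` tokens.

```
import torch
from torch import nn

pixel_values = torch.randn(2, 8, 3, 224, 224)        # (batch, frames, channels, height, width)
projection = nn.Conv2d(3, 768, kernel_size=16, stride=16)
flat = pixel_values.reshape(2 * 8, 3, 224, 224)      # fold the frame axis into the batch axis
embeddings = projection(flat).flatten(2).transpose(1, 2)
print(embeddings.shape)                              # torch.Size([16, 196, 768]): 196 patches per frame
```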
class TimesformerEmbeddings(nn.Module):
"""
Construct the patch and position embeddings.
"""
def __init__(self, config):
super().__init__()
embed_dim = config.hidden_size
num_frames = config.num_frames
drop_rate = config.hidden_dropout_prob
attention_type = config.attention_type
self.attention_type = attention_type
self.patch_embeddings = TimesformerPatchEmbeddings(config)
self.num_patches = self.patch_embeddings.num_patches
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.position_embeddings = nn.Parameter(torch.zeros(1, self.num_patches + 1, embed_dim))
self.pos_drop = nn.Dropout(p=drop_rate)
if attention_type != "space_only":
self.time_embeddings = nn.Parameter(torch.zeros(1, num_frames, embed_dim))
self.time_drop = nn.Dropout(p=drop_rate)
def forward(self, pixel_values):
batch_size = pixel_values.shape[0]
embeddings, num_frames, patch_width = self.patch_embeddings(pixel_values)
cls_tokens = self.cls_token.expand(embeddings.size(0), -1, -1)
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
if embeddings.size(1) != self.position_embeddings.size(1):
position_embeddings = self.position_embeddings
cls_pos_embed = position_embeddings[0, 0, :].unsqueeze(0).unsqueeze(1)
other_pos_embed = position_embeddings[0, 1:, :].unsqueeze(0).transpose(1, 2)
patch_num = int(other_pos_embed.size(2) ** 0.5)
patch_height = embeddings.size(1) // patch_width
other_pos_embed = other_pos_embed.reshape(1, embeddings.size(2), patch_num, patch_num)
new_pos_embed = nn.functional.interpolate(
other_pos_embed, size=(patch_height, patch_width), mode="nearest"
)
new_pos_embed = new_pos_embed.flatten(2)
new_pos_embed = new_pos_embed.transpose(1, 2)
new_pos_embed = torch.cat((cls_pos_embed, new_pos_embed), 1)
embeddings = embeddings + new_pos_embed
else:
embeddings = embeddings + self.position_embeddings
embeddings = self.pos_drop(embeddings)
if self.attention_type != "space_only":
cls_tokens = embeddings[:batch_size, 0, :].unsqueeze(1)
embeddings = embeddings[:, 1:]
_, patch_height, patch_width = embeddings.shape
embeddings = (
embeddings.reshape(batch_size, num_frames, patch_height, patch_width)
.permute(0, 2, 1, 3)
.reshape(batch_size * patch_height, num_frames, patch_width)
)
if num_frames != self.time_embeddings.size(1):
time_embeddings = self.time_embeddings.transpose(1, 2)
new_time_embeddings = nn.functional.interpolate(time_embeddings, size=(num_frames), mode="nearest")
new_time_embeddings = new_time_embeddings.transpose(1, 2)
embeddings = embeddings + new_time_embeddings
else:
embeddings = embeddings + self.time_embeddings
embeddings = self.time_drop(embeddings)
embeddings = embeddings.view(batch_size, patch_height, num_frames, patch_width).reshape(
batch_size, patch_height * num_frames, patch_width
)
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
return embeddings
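The reshuffling in the `divided_space_time` branch is easy to lose track of; the following sketch (toy sizes, not library code) reproduces just the tensor gymnastics and shows that the final sequence is one CLS token plus `num_patches * num_frames` patch tokens.

```
import torch

batch, frames, dim = 2, 8, 16
patches = (224 // 16) ** 2                                    # 196 spatial patches per frame (assumed)
tokens = torch.randn(batch * frames, patches, dim)            # patch tokens, frames folded into the batch
tokens = tokens.reshape(batch, frames, patches, dim)          # (B, T, N, D)
tokens = tokens.permute(0, 2, 1, 3).reshape(batch * patches, frames, dim)  # time attention sees (B*N, T, D)
tokens = tokens.view(batch, patches, frames, dim).reshape(batch, patches * frames, dim)
cls_token = torch.zeros(batch, 1, dim)
print(torch.cat((cls_token, tokens), dim=1).shape)            # torch.Size([2, 1569, 16]) = (B, 1 + N*T, D)
```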
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class TimeSformerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class TimesformerSelfAttention(nn.Module):
def __init__(self, config: TimesformerConfig):
super().__init__()
num_heads = config.num_attention_heads
qkv_bias = config.qkv_bias
attention_dropout_prob = config.attention_probs_dropout_prob
self.num_heads = num_heads
head_dim = config.hidden_size // num_heads
self.scale = head_dim**-0.5
self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attention_dropout_prob)
def forward(self, hidden_states, output_attentions: bool = False):
batch_size, hidden_size, num_channels = hidden_states.shape
qkv = (
self.qkv(hidden_states)
.reshape(batch_size, hidden_size, 3, self.num_heads, num_channels // self.num_heads)
.permute(2, 0, 3, 1, 4)
)
query, key, value = qkv[0], qkv[1], qkv[2]
attention_probs = (query @ key.transpose(-2, -1)) * self.scale
attention_probs = attention_probs.softmax(dim=-1)
attention_probs = self.attn_drop(attention_probs)
context_layer = (attention_probs @ value).transpose(1, 2).reshape(batch_size, hidden_size, num_channels)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
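The packed qkv projection can be confusing on first read; this shape walk-through (assumed toy sizes) mirrors the reshape/permute above and shows how the per-head query, key and value tensors fall out.

```
import torch
from torch import nn

batch, seq_len, hidden, heads = 2, 5, 16, 4
qkv = nn.Linear(hidden, hidden * 3)(torch.randn(batch, seq_len, hidden))
qkv = qkv.reshape(batch, seq_len, 3, heads, hidden // heads).permute(2, 0, 3, 1, 4)
query, key, value = qkv[0], qkv[1], qkv[2]
print(query.shape)   # torch.Size([2, 4, 5, 4]) -> (batch, num_heads, seq_len, head_dim)
```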
class TimesformerSelfOutput(nn.Module):
"""
The residual connection is defined in TimesformerLayer instead of here (as is the case with other models), due to
the layernorm applied before each block.
"""
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class TimeSformerAttention(nn.Module):
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.attention = TimesformerSelfAttention(config)
self.output = TimesformerSelfOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, output_attentions)
attention_output = self.output(self_outputs[0])
outputs = (attention_output,) + self_outputs[1:]
return outputs
class TimesformerIntermediate(nn.Module):
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class TimesformerOutput(nn.Module):
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class TimesformerLayer(nn.Module):
def __init__(self, config: TimesformerConfig, layer_index: int) -> None:
super().__init__()
attention_type = config.attention_type
drop_path_rates = [
x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)
]
drop_path_rate = drop_path_rates[layer_index]
self.drop_path = TimeSformerDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.attention = TimeSformerAttention(config)
self.intermediate = TimesformerIntermediate(config)
self.output = TimesformerOutput(config)
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.config = config
self.attention_type = attention_type
if attention_type not in ["divided_space_time", "space_only", "joint_space_time"]:
raise ValueError("Unknown attention type: {}".format(attention_type))
if self.attention_type == "divided_space_time":
self.temporal_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.temporal_attention = TimeSformerAttention(config)
self.temporal_dense = nn.Linear(config.hidden_size, config.hidden_size)
class TimesformerEncoder(nn.Module):
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([TimesformerLayer(config, ind) for ind in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class TimesformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = TimesformerConfig
base_model_prefix = "timesformer"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
if isinstance(module, (nn.Linear, nn.Conv2d)):
nn.init.trunc_normal_(module.weight, std=self.config.initializer_range)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.LayerNorm):
nn.init.constant_(module.bias, 0)
nn.init.constant_(module.weight, 1.0)
elif isinstance(module, TimesformerEmbeddings):
nn.init.trunc_normal_(module.cls_token, std=self.config.initializer_range)
nn.init.trunc_normal_(module.position_embeddings, std=self.config.initializer_range)
module.patch_embeddings.apply(self._init_weights)
TIMESFORMER_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.
Parameters:
config ([`TimesformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TIMESFORMER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`VideoMAEImageProcessor.preprocess`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare TimeSformer Model transformer outputting raw hidden-states without any specific head on top.",
TIMESFORMER_START_DOCSTRING,
)
class TimesformerModel(TimesformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.embeddings = TimesformerEmbeddings(config)
self.encoder = TimesformerEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_init()
def get_input_embeddings(self):
"""
Returns the patch embeddings used in the model's input layer.
"""
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model.
Args:
heads_to_prune (dict): dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(TIMESFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the Timesformer Model.
Args:
pixel_values (torch.FloatTensor): Pixel values of shape `(batch_size, num_frames, num_channels, height, width)`.
output_attentions (bool, optional): Whether to return attentions tensors of all attention layers.
output_hidden_states (bool, optional): Whether to return hidden states of all layers.
return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
Returns:
BaseModelOutput or tuple:
A BaseModelOutput (if return_dict=True) or a tuple of torch.FloatTensor containing various model outputs.
"""
pass
@add_start_docstrings(
"""TimeSformer Model transformer with a video classification head on top (a linear layer on top of the final hidden state
of the [CLS] token) e.g. for ImageNet.""",
TIMESFORMER_START_DOCSTRING,
)
class TimesformerForVideoClassification(TimesformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.timesformer = TimesformerModel(config)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.post_init()
    @add_start_docstrings_to_model_forward(TIMESFORMER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Forward pass of the TimesformerForVideoClassification model.
        Args:
            pixel_values (torch.FloatTensor): Pixel values of shape `(batch_size, num_frames, num_channels, height, width)`.
            labels (torch.LongTensor, optional): Labels for computing the video classification loss (indices in `[0, ..., config.num_labels - 1]`).
            output_attentions (bool, optional): Whether to return attentions tensors of all attention layers.
            output_hidden_states (bool, optional): Whether to return hidden states of all layers.
            return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
        Returns:
            ImageClassifierOutput or tuple:
                An ImageClassifierOutput (if return_dict=True) or a tuple of torch.FloatTensor containing the
                classification loss (when labels are provided), the logits and the other model outputs.
        """
        pass
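For reference, inference with the classification head typically follows the pattern below (a sketch based on the usual model-card example for `facebook/timesformer-base-finetuned-k400`; the random frames and the VideoMAE image processor are just placeholders for a real video clip and its preprocessing, and the checkpoints are downloaded on first use).

```
import numpy as np
import torch
from transformers import AutoImageProcessor, TimesformerForVideoClassification

video = list(np.random.randn(8, 3, 224, 224))   # 8 dummy frames standing in for a real clip
processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")

inputs = processor(video, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[int(logits.argmax(-1))])   # predicted Kinetics-400 label
```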
.\models\timesformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_timesformer": ["TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimesformerConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_timesformer"] = [
"TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TimesformerModel",
"TimesformerForVideoClassification",
"TimesformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_timesformer import TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimesformerConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_timesformer import (
TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TimesformerForVideoClassification,
TimesformerModel,
TimesformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\time_series_transformer\configuration_time_series_transformer.py
from typing import List, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"huggingface/time-series-transformer-tourism-monthly": (
"https://huggingface.co/huggingface/time-series-transformer-tourism-monthly/resolve/main/config.json"
),
}
class TimeSeriesTransformerConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`TimeSeriesTransformerModel`]. It is used to
    instantiate a Time Series Transformer model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a configuration similar to that of the Time Series
    Transformer
    [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
    architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    ```
    >>> from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerModel
    >>> # Initializing a Time Series Transformer configuration with a prediction length of 12 time steps
    >>> configuration = TimeSeriesTransformerConfig(prediction_length=12)
    >>> # Randomly initializing a model (with random weights) from the configuration
    >>> model = TimeSeriesTransformerModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
```
"""
model_type = "time_series_transformer"
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
"num_hidden_layers": "encoder_layers",
}
def __init__(
self,
prediction_length: Optional[int] = None,
context_length: Optional[int] = None,
distribution_output: str = "student_t",
loss: str = "nll",
input_size: int = 1,
lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
scaling: Optional[Union[str, bool]] = "mean",
num_dynamic_real_features: int = 0,
num_static_categorical_features: int = 0,
num_static_real_features: int = 0,
num_time_features: int = 0,
cardinality: Optional[List[int]] = None,
embedding_dimension: Optional[List[int]] = None,
encoder_ffn_dim: int = 32,
decoder_ffn_dim: int = 32,
encoder_attention_heads: int = 2,
decoder_attention_heads: int = 2,
encoder_layers: int = 2,
decoder_layers: int = 2,
is_encoder_decoder: bool = True,
activation_function: str = "gelu",
d_model: int = 64,
dropout: float = 0.1,
encoder_layerdrop: float = 0.1,
decoder_layerdrop: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
num_parallel_samples: int = 100,
init_std: float = 0.02,
use_cache=True,
        **kwargs,
    ):
self.prediction_length = prediction_length
self.context_length = context_length or prediction_length
self.distribution_output = distribution_output
self.loss = loss
self.input_size = input_size
self.num_time_features = num_time_features
self.lags_sequence = lags_sequence
self.scaling = scaling
self.num_dynamic_real_features = num_dynamic_real_features
self.num_static_real_features = num_static_real_features
self.num_static_categorical_features = num_static_categorical_features
if cardinality and num_static_categorical_features > 0:
if len(cardinality) != num_static_categorical_features:
raise ValueError(
"The cardinality should be a list of the same length as `num_static_categorical_features`"
)
self.cardinality = cardinality
else:
self.cardinality = [0]
if embedding_dimension and num_static_categorical_features > 0:
if len(embedding_dimension) != num_static_categorical_features:
raise ValueError(
"The embedding dimension should be a list of the same length as `num_static_categorical_features`"
)
self.embedding_dimension = embedding_dimension
else:
self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
self.num_parallel_samples = num_parallel_samples
self.feature_size = input_size * len(lags_sequence) + self._number_of_features
self.d_model = d_model
self.encoder_attention_heads = encoder_attention_heads
self.decoder_attention_heads = decoder_attention_heads
self.encoder_ffn_dim = encoder_ffn_dim
self.decoder_ffn_dim = decoder_ffn_dim
self.encoder_layers = encoder_layers
self.decoder_layers = decoder_layers
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.activation_function = activation_function
self.init_std = init_std
self.use_cache = use_cache
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
@property
def _number_of_features(self) -> int:
return (
sum(self.embedding_dimension)
+ self.num_dynamic_real_features
+ self.num_time_features
+ self.num_static_real_features
+ self.input_size * 2
)
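A small numeric check of the `feature_size` bookkeeping above (default settings plus one time feature, numbers only for illustration): with `input_size=1`, the 7 default lags, no static features and one time feature, `_number_of_features` is `0 + 0 + 1 + 0 + 1 * 2 = 3`, so `feature_size = 1 * 7 + 3 = 10`.

```
from transformers import TimeSeriesTransformerConfig

config = TimeSeriesTransformerConfig(prediction_length=12, num_time_features=1)
# lags_sequence defaults to [1, 2, 3, 4, 5, 6, 7]; the "+ 2" comes from log1p(|loc|) and log(scale)
print(config.feature_size)   # 1 * 7 + 3 = 10
```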
.\models\time_series_transformer\modeling_time_series_transformer.py
""" PyTorch 时间序列 Transformer 模型。"""
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from torch import nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
SampleTSPredictionOutput,
Seq2SeqTSModelOutput,
Seq2SeqTSPredictionOutput,
)
from ...modeling_utils import PreTrainedModel
from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_time_series_transformer import TimeSeriesTransformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TimeSeriesTransformerConfig"
TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"huggingface/time-series-transformer-tourism-monthly",
]
class TimeSeriesFeatureEmbedder(nn.Module):
"""
Embed a sequence of categorical features.
Args:
cardinalities (`list[int]`):
List of cardinalities of the categorical features.
embedding_dims (`list[int]`):
List of embedding dimensions of the categorical features.
"""
def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
super().__init__()
self.num_features = len(cardinalities)
self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
def forward(self, features: torch.Tensor) -> torch.Tensor:
if self.num_features > 1:
cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
else:
cat_feature_slices = [features]
return torch.cat(
[
embed(cat_feature_slice.squeeze(-1))
for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
],
dim=-1,
)
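A quick usage sketch of the embedder (made-up cardinalities and embedding sizes): each categorical column gets its own `nn.Embedding`, and the per-feature embeddings are concatenated along the last dimension.

```
import torch

embedder = TimeSeriesFeatureEmbedder(cardinalities=[5, 10], embedding_dims=[2, 4])
features = torch.tensor([[1, 7], [4, 0]])   # (batch_size=2, num_features=2)
print(embedder(features).shape)             # torch.Size([2, 6]): the 2-dim and 4-dim embeddings concatenated
```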
class TimeSeriesStdScaler(nn.Module):
"""
    Standardizes features by computing the mean and standard deviation along the first dimension, and then normalizes
    them by subtracting the mean and dividing by the standard deviation.
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5
def forward(
self, data: torch.Tensor, observed_indicator: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Parameters:
data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
input for Batch norm calculation
observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
Calculating the scale on the observed indicator.
Returns:
tuple of `torch.Tensor` of shapes
(`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
`(batch_size, 1, num_input_channels)`)
"""
denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
denominator = denominator.clamp_min(1.0)
loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator
variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
scale = torch.sqrt(variance + self.minimum_scale)
return (data - loc) / scale, loc, scale
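A toy worked example (values chosen for illustration, using the classes above): positions whose `observed_indicator` is 0 are excluded from the statistics, so the outlier below does not influence the location or scale.

```
import torch

config = TimeSeriesTransformerConfig(prediction_length=12)
data = torch.tensor([[[1.0], [2.0], [3.0], [100.0]]])     # (batch=1, seq_len=4, channels=1)
observed = torch.tensor([[[1.0], [1.0], [1.0], [0.0]]])   # the 100.0 is marked as missing
scaled, loc, scale = TimeSeriesStdScaler(config)(data, observed)
print(loc.item(), scale.item())   # ~2.0 and ~0.8165, i.e. the mean and std of [1, 2, 3] only
```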
class TimeSeriesMeanScaler(nn.Module):
"""
Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
accordingly.
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
self.default_scale = config.default_scale if hasattr(config, "default_scale") else None
    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Parameters:
data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
input for Batch norm calculation
observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
Calculating the scale on the observed indicator.
Returns:
tuple of `torch.Tensor` of shapes
(`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
`(batch_size, 1, num_input_channels)`)
"""
ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
num_observed = observed_indicator.sum(self.dim, keepdim=True)
scale = ts_sum / torch.clamp(num_observed, min=1)
if self.default_scale is None:
batch_sum = ts_sum.sum(dim=0)
batch_observations = torch.clamp(num_observed.sum(0), min=1)
default_scale = torch.squeeze(batch_sum / batch_observations)
else:
default_scale = self.default_scale * torch.ones_like(scale)
scale = torch.where(num_observed > 0, scale, default_scale)
scale = torch.clamp(scale, min=self.minimum_scale)
scaled_data = data / scale
if not self.keepdim:
scale = scale.squeeze(dim=self.dim)
return scaled_data, torch.zeros_like(scale), scale
class TimeSeriesNOPScaler(nn.Module):
"""
Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
def forward(
self, data: torch.Tensor, observed_indicator: torch.Tensor = None
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Parameters:
data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
input for Batch norm calculation
Returns:
tuple of `torch.Tensor` of shapes
(`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
`(batch_size, 1, num_input_channels)`)
"""
scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
return data, loc, scale
def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
"""
Computes the negative log likelihood loss from input distribution with respect to target.
"""
return -input.log_prob(target)
def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
"""
Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
Args:
input_tensor (`torch.FloatTensor`):
Input tensor, of which the average must be computed.
weights (`torch.FloatTensor`, *optional*):
Weights tensor, of the same shape as `input_tensor`.
dim (`int`, *optional*):
The dim along which to average `input_tensor`.
Returns:
`torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
"""
if weights is not None:
weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
else:
return input_tensor.mean(dim=dim)
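The point of the masking is easiest to see with a NaN sitting in a zero-weight slot (a tiny example using the function above): the zero-weight entry is replaced by an exact zero before summing, so the NaN never propagates into the average.

```
import torch

values = torch.tensor([1.0, float("nan"), 3.0])
weights = torch.tensor([1.0, 0.0, 1.0])
print(weighted_average(values, weights))   # tensor(2.) rather than nan
```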
class TimeSeriesSinusoidalPositionalEmbedding(nn.Embedding):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
super().__init__(num_positions, embedding_dim)
self.weight = self._init_weight(self.weight)
@staticmethod
def _init_weight(out: nn.Parameter) -> nn.Parameter:
"""
        Identical to XLM's create_sinusoidal_embeddings, except that the features are not interleaved. The cosine
        features are in the second half of the vector [dim // 2:].
"""
n_pos, dim = out.shape
position_enc = np.array(
[[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
)
out.requires_grad = False
sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
return out
@torch.no_grad()
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
"""`input_ids_shape` 期望是 [bsz x seqlen]。"""
bsz, seq_len = input_ids_shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
)
return super().forward(positions)
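Note that the module is indexed purely by position: the forward takes a shape (plus an offset used with cached decoding) rather than token ids, and the table is frozen. A small sketch:

```
import torch

pos_emb = TimeSeriesSinusoidalPositionalEmbedding(num_positions=50, embedding_dim=8)
out = pos_emb(torch.Size([4, 10]))   # batch of 4, sequence length 10
print(out.shape)                     # torch.Size([10, 8]): one embedding per position, shared across the batch
print(out.requires_grad)             # False: the sinusoidal table is fixed, not learned
```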
class TimeSeriesValueEmbedding(nn.Module):
def __init__(self, feature_size, d_model):
super().__init__()
self.value_projection = nn.Linear(in_features=feature_size, out_features=d_model, bias=False)
def forward(self, x):
return self.value_projection(x)
class TimeSeriesTransformerAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[TimeSeriesTransformerConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
pass
class TimeSeriesTransformerEncoderLayer(nn.Module):
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES = {
"eager": TimeSeriesTransformerAttention,
}
class TimeSeriesTransformerDecoderLayer(nn.Module):
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        pass
class TimeSeriesTransformerPreTrainedModel(PreTrainedModel):
config_class = TimeSeriesTransformerConfig
base_model_prefix = "model"
main_input_name = "past_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, TimeSeriesSinusoidalPositionalEmbedding):
pass
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
TIME_SERIES_TRANSFORMER_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
    heads, etc.).
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.
    Parameters:
        config ([`TimeSeriesTransformerConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r"""
"""
class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`TimeSeriesTransformerEncoderLayer`].
Args:
config: TimeSeriesTransformerConfig
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
if config.prediction_length is None:
raise ValueError("The `prediction_length` config needs to be specified.")
self.value_embedding = TimeSeriesValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
config.context_length + config.prediction_length, config.d_model
)
self.layers = nn.ModuleList([TimeSeriesTransformerEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
        Runs the forward pass over the embedded time-series features.
        Args:
            attention_mask: optional attention mask tensor
            head_mask: optional mask for the attention heads
            inputs_embeds: optional tensor of input embeddings
            output_attentions: whether to return the attention tensors
            output_hidden_states: whether to return the hidden states of all layers
            return_dict: whether to return the output as a dict-style `ModelOutput`
        Returns:
            A dict-style output or a plain tuple, depending on `return_dict`
"""
pass
class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
[`TimeSeriesTransformerDecoderLayer`]
Args:
config: TimeSeriesTransformerConfig
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
if config.prediction_length is None:
raise ValueError("The `prediction_length` config needs to be specified.")
self.value_embedding = TimeSeriesValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
config.context_length + config.prediction_length, config.d_model
)
self.layers = nn.ModuleList([TimeSeriesTransformerDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"The bare Time Series Transformer Model outputting raw hidden-states without any specific head on top.",
TIME_SERIES_TRANSFORMER_START_DOCSTRING,
)
class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__(config)
if config.scaling == "mean" or config.scaling is True:
self.scaler = TimeSeriesMeanScaler(config)
elif config.scaling == "std":
self.scaler = TimeSeriesStdScaler(config)
else:
self.scaler = TimeSeriesNOPScaler(config)
if config.num_static_categorical_features > 0:
self.embedder = TimeSeriesFeatureEmbedder(
cardinalities=config.cardinality,
embedding_dims=config.embedding_dimension,
)
self.encoder = TimeSeriesTransformerEncoder(config)
self.decoder = TimeSeriesTransformerDecoder(config)
self.post_init()
@property
def _past_length(self) -> int:
return self.config.context_length + max(self.config.lags_sequence)
def get_lagged_subsequences(
self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
) -> torch.Tensor:
"""
Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I),
where S = subsequences_length and I = len(indices), containing lagged subsequences. Specifically, lagged[i,
j, :, k] = sequence[i, -indices[k]-S+j, :].
Args:
sequence: Tensor
The sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
subsequences_length : int
Length of the subsequences to be extracted.
shift: int
Shift the lags by this amount back.
"""
sequence_length = sequence.shape[1]
indices = [lag - shift for lag in self.config.lags_sequence]
if max(indices) + subsequences_length > sequence_length:
raise ValueError(
f"lags cannot go further than history length, found lag {max(indices)} "
f"while history length is only {sequence_length}"
)
lagged_values = []
for lag_index in indices:
begin_index = -lag_index - subsequences_length
end_index = -lag_index if lag_index > 0 else None
lagged_values.append(sequence[:, begin_index:end_index, ...])
return torch.stack(lagged_values, dim=-1)
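A concrete trace of the slicing above (assumed `lags_sequence=[1, 2]`, `subsequences_length=3`, `shift=0`, and a toy series `0..5`): for each lag `l` the window `sequence[:, -l - S : -l]` is taken, so column `k` of the output is the series shifted back by `indices[k]` steps.

```
import torch

sequence = torch.arange(6.0).reshape(1, 6, 1)    # values 0..5, shape (N=1, T=6, C=1)
S, lags = 3, [1, 2]
slices = []
for lag in lags:
    end = -lag if lag > 0 else None
    slices.append(sequence[:, -lag - S: end, ...])
lagged = torch.stack(slices, dim=-1)             # (N, S, C, len(lags)) == (1, 3, 1, 2)
print(lagged[0, :, 0, 0])   # tensor([2., 3., 4.]): one step behind the last three positions
print(lagged[0, :, 0, 1])   # tensor([1., 2., 3.]): two steps behind
```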
def create_network_inputs(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
past_observed_mask: Optional[torch.Tensor] = None,
future_values: Optional[torch.Tensor] = None,
        future_time_features: Optional[torch.Tensor] = None,
    ):
time_feat = (
torch.cat(
(
past_time_features[:, self._past_length - self.config.context_length :, ...],
future_time_features,
),
dim=1,
)
if future_values is not None
else past_time_features[:, self._past_length - self.config.context_length :, ...]
)
if past_observed_mask is None:
past_observed_mask = torch.ones_like(past_values)
context = past_values[:, -self.config.context_length :]
observed_context = past_observed_mask[:, -self.config.context_length :]
_, loc, scale = self.scaler(context, observed_context)
inputs = (
(torch.cat((past_values, future_values), dim=1) - loc) / scale
if future_values is not None
else (past_values - loc) / scale
)
log_abs_loc = loc.abs().log1p() if self.config.input_size == 1 else loc.squeeze(1).abs().log1p()
log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
static_feat = torch.cat((log_abs_loc, log_scale), dim=1)
if static_real_features is not None:
static_feat = torch.cat((static_real_features, static_feat), dim=1)
if static_categorical_features is not None:
embedded_cat = self.embedder(static_categorical_features)
static_feat = torch.cat((embedded_cat, static_feat), dim=1)
expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
features = torch.cat((expanded_static_feat, time_feat), dim=-1)
subsequences_length = (
self.config.context_length + self.config.prediction_length
if future_values is not None
else self.config.context_length
)
lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
lags_shape = lagged_sequence.shape
reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
if reshaped_lagged_sequence.shape[1] != time_feat.shape[1]:
raise ValueError(
f"input length {reshaped_lagged_sequence.shape[1]} and time feature lengths {time_feat.shape[1]} does not match"
)
transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
return transformer_inputs, loc, scale, static_feat
def forward(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
past_observed_mask: torch.Tensor,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
future_values: Optional[torch.Tensor] = None,
future_time_features: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
use_cache: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"The Time Series Transformer Model with a distribution head on top for time-series forecasting.",
TIME_SERIES_TRANSFORMER_START_DOCSTRING,
)
class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__(config)
self.model = TimeSeriesTransformerModel(config)
if config.distribution_output == "student_t":
self.distribution_output = StudentTOutput(dim=config.input_size)
elif config.distribution_output == "normal":
self.distribution_output = NormalOutput(dim=config.input_size)
elif config.distribution_output == "negative_binomial":
self.distribution_output = NegativeBinomialOutput(dim=config.input_size)
else:
raise ValueError(f"Unknown distribution output {config.distribution_output}")
self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
self.target_shape = self.distribution_output.event_shape
if config.loss == "nll":
self.loss = nll
else:
raise ValueError(f"Unknown loss function {config.loss}")
self.post_init()
def output_params(self, dec_output):
return self.parameter_projection(dec_output)
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
@torch.jit.ignore
def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution:
sliced_params = params
if trailing_n is not None:
sliced_params = [p[:, -trailing_n:] for p in params]
return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)
@add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
past_observed_mask: torch.Tensor,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
future_values: Optional[torch.Tensor] = None,
future_time_features: Optional[torch.Tensor] = None,
future_observed_mask: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
use_cache: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ):
        pass
@torch.no_grad()
def generate(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
future_time_features: torch.Tensor,
past_observed_mask: Optional[torch.Tensor] = None,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
    ):
        pass
.\models\time_series_transformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_time_series_transformer": [
"TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"TimeSeriesTransformerConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_time_series_transformer"] = [
"TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TimeSeriesTransformerForPrediction",
"TimeSeriesTransformerModel",
"TimeSeriesTransformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_time_series_transformer import (
TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
TimeSeriesTransformerConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_time_series_transformer import (
TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TimeSeriesTransformerForPrediction,
TimeSeriesTransformerModel,
TimeSeriesTransformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\timm_backbone\configuration_timm_backbone.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
class TimmBackboneConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration for a timm backbone [`TimmBackbone`].
It is used to instantiate a timm backbone model according to the specified arguments, defining the model.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
backbone (`str`, *optional*):
The timm checkpoint to load.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
features_only (`bool`, *optional*, defaults to `True`):
Whether to output only the features or also the logits.
use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
Whether to use a pretrained backbone.
out_indices (`List[int]`, *optional*):
If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
many stages the model has). Will default to the last stage if unset.
freeze_batch_norm_2d (`bool`, *optional*, defaults to `False`):
Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`.
Example:
```
>>> from transformers import TimmBackboneConfig, TimmBackbone
>>> # Initializing a timm backbone
>>> configuration = TimmBackboneConfig("resnet50")
>>> # Initializing a model from the configuration
>>> model = TimmBackbone(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "timm_backbone"
def __init__(
self,
backbone=None,
num_channels=3,
features_only=True,
use_pretrained_backbone=True,
out_indices=None,
freeze_batch_norm_2d=False,
**kwargs,
):
        super().__init__(**kwargs)
        self.backbone = backbone
        self.num_channels = num_channels
        self.features_only = features_only
        self.use_pretrained_backbone = use_pretrained_backbone
        self.use_timm_backbone = True
        self.out_indices = out_indices if out_indices is not None else (-1,)
        self.freeze_batch_norm_2d = freeze_batch_norm_2d
.\models\timm_backbone\modeling_timm_backbone.py
from typing import Optional, Tuple, Union
import torch
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...utils import is_timm_available, is_torch_available, requires_backends
from ...utils.backbone_utils import BackboneMixin
from .configuration_timm_backbone import TimmBackboneConfig
if is_timm_available():
import timm
if is_torch_available():
from torch import Tensor
class TimmBackbone(PreTrainedModel, BackboneMixin):
"""
Wrapper class for timm models to be used as backbones. This enables using the timm models interchangeably with the
other models in the library keeping the same API.
"""
main_input_name = "pixel_values"
supports_gradient_checkpointing = False
config_class = TimmBackboneConfig
def __init__(self, config, **kwargs):
requires_backends(self, "timm")
super().__init__(config)
self.config = config
if config.backbone is None:
raise ValueError("backbone is not set in the config. Please set it to a timm model name.")
if config.backbone not in timm.list_models():
raise ValueError(f"backbone {config.backbone} is not supported by timm.")
if hasattr(config, "out_features") and config.out_features is not None:
raise ValueError("out_features is not supported by TimmBackbone. Please use out_indices instead.")
pretrained = getattr(config, "use_pretrained_backbone", None)
if pretrained is None:
raise ValueError("use_pretrained_backbone is not set in the config. Please set it to True or False.")
out_indices = config.out_indices if getattr(config, "out_indices", None) is not None else (-1,)
self._backbone = timm.create_model(
config.backbone,
pretrained=pretrained,
features_only=config.features_only,
in_chans=config.num_channels,
out_indices=out_indices,
**kwargs,
)
if getattr(config, "freeze_batch_norm_2d", False):
self.freeze_batch_norm_2d()
self._return_layers = self._backbone.return_layers
self._all_layers = {layer["module"]: str(i) for i, layer in enumerate(self._backbone.feature_info.info)}
super()._init_backbone(config)
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
requires_backends(cls, ["vision", "timm"])
from ...models.timm_backbone import TimmBackboneConfig
config = kwargs.pop("config", TimmBackboneConfig())
use_timm = kwargs.pop("use_timm_backbone", True)
if not use_timm:
raise ValueError("use_timm_backbone must be True for timm backbones")
num_channels = kwargs.pop("num_channels", config.num_channels)
features_only = kwargs.pop("features_only", config.features_only)
use_pretrained_backbone = kwargs.pop("use_pretrained_backbone", config.use_pretrained_backbone)
out_indices = kwargs.pop("out_indices", config.out_indices)
config = TimmBackboneConfig(
backbone=pretrained_model_name_or_path,
num_channels=num_channels,
features_only=features_only,
use_pretrained_backbone=use_pretrained_backbone,
out_indices=out_indices,
)
return super()._from_config(config, **kwargs)
def freeze_batch_norm_2d(self):
timm.layers.freeze_batch_norm_2d(self._backbone)
def unfreeze_batch_norm_2d(self):
timm.layers.unfreeze_batch_norm_2d(self._backbone)
def _init_weights(self, module):
"""
Empty init weights function to ensure compatibility of the class in the library.
"""
pass
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
if output_attentions:
raise ValueError("Cannot output attentions for timm backbones at the moment")
if output_hidden_states:
self._backbone.return_layers = self._all_layers
hidden_states = self._backbone(pixel_values, **kwargs)
self._backbone.return_layers = self._return_layers
feature_maps = tuple(hidden_states[i] for i in self.out_indices)
else:
feature_maps = self._backbone(pixel_values, **kwargs)
hidden_states = None
feature_maps = tuple(feature_maps)
hidden_states = tuple(hidden_states) if hidden_states is not None else None
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output = output + (hidden_states,)
return output
return BackboneOutput(feature_maps=feature_maps, hidden_states=hidden_states, attentions=None)
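A minimal usage sketch of the wrapper above. The `"resnet50"` name and the dummy input are illustrative assumptions; `timm` must be installed, and `use_pretrained_backbone=False` avoids a weight download for the sketch:
```
import torch
from transformers import TimmBackbone, TimmBackboneConfig

# Build a timm backbone through the config/wrapper defined above.
config = TimmBackboneConfig(
    backbone="resnet50",           # any name accepted by timm.list_models() (assumed here)
    out_indices=(1, 2, 3, 4),      # which feature stages to return
    use_pretrained_backbone=False, # random weights, no download
)
model = TimmBackbone(config).eval()

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values)

# One feature map per requested stage (strides 4/8/16/32 for a ResNet).
for feature_map in outputs.feature_maps:
    print(feature_map.shape)
```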
.\models\timm_backbone\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_timm_backbone": ["TimmBackboneConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_timm_backbone"] = ["TimmBackbone"]
if TYPE_CHECKING:
from .configuration_timm_backbone import TimmBackboneConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_timm_backbone import TimmBackbone
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\trocr\configuration_trocr.py
""" TrOCR model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/trocr-base-handwritten": (
"https://huggingface.co/microsoft/trocr-base-handwritten/resolve/main/config.json"
),
}
class TrOCRConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`TrOCRForCausalLM`]. It is used to instantiate an
TrOCR model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the TrOCR
[microsoft/trocr-base-handwritten](https://huggingface.co/microsoft/trocr-base-handwritten) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the TrOCR model. Defines the number of different tokens that can be represented when calling [`TrOCRForCausalLM`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the pooler. Supported strings are `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"`.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length this model might ever be used with. Typically set to something large just in case (e.g., 512, 1024 or 2048).
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
scale_embedding (`bool`, *optional*, defaults to `False`):
Whether or not to scale the word embeddings by sqrt(d_model).
use_learned_position_embeddings (`bool`, *optional*, defaults to `True`):
Whether or not to use learned position embeddings. If not, sinusoidal position embeddings will be used.
layernorm_embedding (`bool`, *optional*, defaults to `True`):
Whether or not to use a layernorm after the word + position embeddings.
Example:
```
>>> from transformers import TrOCRConfig, TrOCRForCausalLM
>>> # Initializing a TrOCR microsoft/trocr-base-handwritten style configuration
>>> configuration = TrOCRConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = TrOCRForCausalLM(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
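Mirroring what the conversion script below assembles from real checkpoints, a randomly initialized TrOCR decoder can be paired with a ViT encoder inside `VisionEncoderDecoderModel`. The sketch below uses default configurations and dummy inputs purely for illustration:
```
import torch
from transformers import TrOCRConfig, TrOCRForCausalLM, ViTConfig, ViTModel, VisionEncoderDecoderModel

# Random weights only; the real checkpoints are loaded by the conversion script below.
encoder = ViTModel(ViTConfig(image_size=384), add_pooling_layer=False)
decoder = TrOCRForCausalLM(TrOCRConfig())
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder).eval()

pixel_values = torch.randn(1, 3, 384, 384)
decoder_input_ids = torch.tensor([[model.config.decoder.decoder_start_token_id]])
with torch.no_grad():
    logits = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits
print(logits.shape)  # (1, 1, 50265) with the default vocabulary
```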
.\models\trocr\convert_trocr_unilm_to_pytorch.py
"""从 unilm 代码库转换 TrOCR 检查点。"""
import argparse
from pathlib import Path
import requests
import torch
from PIL import Image
from transformers import (
RobertaTokenizer,
TrOCRConfig,
TrOCRForCausalLM,
TrOCRProcessor,
VisionEncoderDecoderModel,
ViTConfig,
ViTImageProcessor,
ViTModel,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(encoder_config, decoder_config):
rename_keys = []
for i in range(encoder_config.num_hidden_layers):
rename_keys.append(
(f"encoder.deit.blocks.{i}.norm1.weight", f"encoder.encoder.layer.{i}.layernorm_before.weight")
)
rename_keys.append((f"encoder.deit.blocks.{i}.norm1.bias", f"encoder.encoder.layer.{i}.layernorm_before.bias"))
rename_keys.append(
(f"encoder.deit.blocks.{i}.attn.proj.weight", f"encoder.encoder.layer.{i}.attention.output.dense.weight")
)
rename_keys.append(
(f"encoder.deit.blocks.{i}.attn.proj.bias", f"encoder.encoder.layer.{i}.attention.output.dense.bias")
)
rename_keys.append(
(f"encoder.deit.blocks.{i}.norm2.weight", f"encoder.encoder.layer.{i}.layernorm_after.weight")
)
rename_keys.append((f"encoder.deit.blocks.{i}.norm2.bias", f"encoder.encoder.layer.{i}.layernorm_after.bias"))
rename_keys.append(
(f"encoder.deit.blocks.{i}.mlp.fc1.weight", f"encoder.encoder.layer.{i}.intermediate.dense.weight")
)
rename_keys.append(
(f"encoder.deit.blocks.{i}.mlp.fc1.bias", f"encoder.encoder.layer.{i}.intermediate.dense.bias")
)
rename_keys.append(
(f"encoder.deit.blocks.{i}.mlp.fc2.weight", f"encoder.encoder.layer.{i}.output.dense.weight")
)
rename_keys.append((f"encoder.deit.blocks.{i}.mlp.fc2.bias", f"encoder.encoder.layer.{i}.output.dense.bias"))
rename_keys.extend(
[
("encoder.deit.cls_token", "encoder.embeddings.cls_token"),
("encoder.deit.pos_embed", "encoder.embeddings.position_embeddings"),
("encoder.deit.patch_embed.proj.weight", "encoder.embeddings.patch_embeddings.projection.weight"),
("encoder.deit.patch_embed.proj.bias", "encoder.embeddings.patch_embeddings.projection.bias"),
("encoder.deit.norm.weight", "encoder.layernorm.weight"),
("encoder.deit.norm.bias", "encoder.layernorm.bias"),
]
)
return rename_keys
def read_in_q_k_v(state_dict, encoder_config):
for i in range(encoder_config.num_hidden_layers):
in_proj_weight = state_dict.pop(f"encoder.deit.blocks.{i}.attn.qkv.weight")
state_dict[f"encoder.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
: encoder_config.hidden_size, :
]
state_dict[f"encoder.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
encoder_config.hidden_size : encoder_config.hidden_size * 2, :
]
state_dict[f"encoder.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
-encoder_config.hidden_size :, :
]
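The slicing above splits the fused `qkv` projection row-wise into equal query/key/value blocks; a toy sketch with a small hypothetical `hidden_size`:
```
import torch

hidden_size = 4
# Fused projection of shape (3 * hidden_size, hidden_size), as in the timm/DeiT checkpoint.
in_proj_weight = torch.arange(3 * hidden_size * hidden_size, dtype=torch.float32).reshape(3 * hidden_size, hidden_size)

query = in_proj_weight[:hidden_size, :]
key = in_proj_weight[hidden_size : hidden_size * 2, :]
value = in_proj_weight[-hidden_size:, :]

assert query.shape == key.shape == value.shape == (hidden_size, hidden_size)
assert torch.equal(torch.cat([query, key, value], dim=0), in_proj_weight)
```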
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img(checkpoint_url):
if "handwritten" in checkpoint_url:
url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02-00.jpg"
elif "printed" in checkpoint_url or "stage1" in checkpoint_url:
url = "https://www.researchgate.net/profile/Dinh-Sang/publication/338099565/figure/fig8/AS:840413229350922@1577381536857/An-receipt-example-in-the-SROIE-2019-dataset_Q640.jpg"
im = Image.open(requests.get(url, stream=True).raw).convert("RGB")
return im
@torch.no_grad()
def convert_tr_ocr_checkpoint(checkpoint_url, pytorch_dump_folder_path):
"""
将模型的权重复制/粘贴/调整到我们的VisionEncoderDecoderModel结构中。
"""
encoder_config = ViTConfig(image_size=384, qkv_bias=False)
decoder_config = TrOCRConfig()
if "base" in checkpoint_url:
decoder_config.encoder_hidden_size = 768
elif "large" in checkpoint_url:
encoder_config.hidden_size = 1024
encoder_config.intermediate_size = 4096
encoder_config.num_hidden_layers = 24
encoder_config.num_attention_heads = 16
decoder_config.encoder_hidden_size = 1024
else:
raise ValueError("Should either find 'base' or 'large' in checkpoint URL")
if "large-printed" in checkpoint_url or "stage1" in checkpoint_url:
decoder_config.tie_word_embeddings = False
decoder_config.activation_function = "relu"
decoder_config.max_position_embeddings = 1024
decoder_config.scale_embedding = True
decoder_config.use_learned_position_embeddings = False
decoder_config.layernorm_embedding = False
encoder = ViTModel(encoder_config, add_pooling_layer=False)
decoder = TrOCRForCausalLM(decoder_config)
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
model.eval()
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)["model"]
rename_keys = create_rename_keys(encoder_config, decoder_config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, encoder_config)
del state_dict["encoder.deit.head.weight"]
del state_dict["encoder.deit.head.bias"]
del state_dict["decoder.version"]
for key, val in state_dict.copy().items():
val = state_dict.pop(key)
if key.startswith("decoder") and "output_projection" not in key:
state_dict["decoder.model." + key] = val
else:
state_dict[key] = val
model.load_state_dict(state_dict)
image_processor = ViTImageProcessor(size=encoder_config.image_size)
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-large")
processor = TrOCRProcessor(image_processor, tokenizer)
pixel_values = processor(images=prepare_img(checkpoint_url), return_tensors="pt").pixel_values
decoder_input_ids = torch.tensor([[model.config.decoder.decoder_start_token_id]])
outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
logits = outputs.logits
expected_shape = torch.Size([1, 1, 50265])
if "trocr-base-handwritten" in checkpoint_url:
expected_slice = torch.tensor(
[-1.4502, -4.6683, -0.5347, -2.9291, 9.1435, -3.0571, 8.9764, 1.7560, 8.7358, -1.5311]
)
elif "trocr-large-handwritten" in checkpoint_url:
expected_slice = torch.tensor(
[-2.6437, -1.3129, -2.2596, -5.3455, 6.3539, 1.7604, 5.4991, 1.4702, 5.6113, 2.0170]
)
elif "trocr-base-printed" in checkpoint_url:
expected_slice = torch.tensor(
[-5.6816, -5.8388, 1.1398, -6.9034, 6.8505, -2.4393, 1.2284, -1.0232, -1.9661, -3.9210]
)
elif "trocr-large-printed" in checkpoint_url:
expected_slice = torch.tensor(
[-6.0162, -7.0959, 4.4155, -5.1063, 7.0468, -3.1631, 2.6466, -0.3081, -0.8106, -1.7535]
)
if "stage1" not in checkpoint_url:
assert logits.shape == expected_shape, "Shape of logits not as expected"
assert torch.allclose(logits[0, 0, :10], expected_slice, atol=1e-3), "First elements of logits not as expected"
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving processor to {pytorch_dump_folder_path}")
processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_url",
default="https://layoutlm.blob.core.windows.net/trocr/model_zoo/fairseq/trocr-base-handwritten.pt",
type=str,
help="URL to the original PyTorch checkpoint (.pth file).",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
args = parser.parse_args()
convert_tr_ocr_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path)
.\models\trocr\modeling_trocr.py
import copy
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, logging, replace_return_docstrings
from .configuration_trocr import TrOCRConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TrOCRConfig"
_CHECKPOINT_FOR_DOC = "microsoft/trocr-base-handwritten"
TROCR_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/trocr-base-handwritten",
]
class TrOCRLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
"""`input_ids' shape is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids.shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
).expand(bsz, -1)
return super().forward(positions + self.offset)
class TrOCRSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.weights = self.get_embedding(num_positions, embedding_dim, padding_idx)
self.register_buffer("_float_tensor", torch.FloatTensor(1))
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
构建正弦嵌入。这与tensor2tensor中的实现相匹配,但与《Attention Is All You Need》第3.5节中的描述略有不同。
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len = input_ids.size()
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
max_pos = self.padding_idx + 1 + seq_len
if self.weights is None or max_pos > self.weights.size(0):
self.weights = self.get_embedding(max_pos, self.embedding_dim, self.padding_idx)
self.weights = self.weights.to(self._float_tensor)
x = self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
return x
def create_position_ids_from_input_ids(
self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
):
"""
将非填充符号替换为它们的位置号码。位置号码从padding_idx+1开始。忽略填充符号。这是从fairseq的`utils.make_positions`修改而来。
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
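A toy sketch of how `create_position_ids_from_input_ids` assigns positions (the token ids and `padding_idx` below are made up for illustration): non-padding tokens count up from `padding_idx + 1`, padding tokens keep `padding_idx`:
```
import torch

padding_idx = 1
past_key_values_length = 0
input_ids = torch.tensor([[0, 5, 6, padding_idx, padding_idx]])

mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
position_ids = incremental_indices.long() + padding_idx
print(position_ids)  # tensor([[2, 3, 4, 1, 1]])
```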
class TrOCRAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper."""
def __init__(
self,
config,
embed_dim: int,
num_heads: int,
kdim: int = None,
vdim: int = None,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_cross_attention: bool = False,
):
super().__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if not (self.head_dim * num_heads == self.embed_dim):
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
class TrOCRDecoderLayer(nn.Module):
def __init__(self, config: TrOCRConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = TrOCRAttention(
config,
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
if config.is_decoder:
self.encoder_attn = TrOCRAttention(
config,
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
kdim=config.cross_attention_hidden_size,
vdim=config.cross_attention_hidden_size,
dropout=config.attention_dropout,
is_decoder=True,
is_cross_attention=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
class TrOCRPreTrainedModel(PreTrainedModel):
config_class = TrOCRConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
TROCR_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`TrOCRConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class TrOCRDecoder(TrOCRPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TrOCRDecoderLayer`]
Args:
config: TrOCRConfig
"""
def __init__(self, config: TrOCRConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
if config.use_learned_position_embeddings:
self.embed_positions = TrOCRLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
else:
self.embed_positions = TrOCRSinusoidalPositionalEmbedding(
config.max_position_embeddings + self.padding_idx + 1,
config.hidden_size,
self.padding_idx,
)
if config.layernorm_embedding:
self.layernorm_embedding = nn.LayerNorm(config.hidden_size)
else:
self.layernorm_embedding = None
self.layers = nn.ModuleList([TrOCRDecoderLayer(config) for _ in range(config.decoder_layers)])
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
@add_start_docstrings(
"The TrOCR Model with a language modeling head. Can be used for summarization.",
TROCR_START_DOCSTRING,
)
class TrOCRDecoderWrapper(TrOCRPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = TrOCRDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
@add_start_docstrings(
"The TrOCR Decoder with a language modeling head. Can be used as the decoder part of [`EncoderDecoderModel`] and"
" [`VisionEncoderDecoder`].",
TROCR_START_DOCSTRING,
)
class TrOCRForCausalLM(TrOCRPreTrainedModel):
_tied_weights_keys = ["output_projection.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = TrOCRDecoderWrapper(config)
self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.output_projection
def set_output_embeddings(self, new_embeddings):
self.output_projection = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
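A toy sketch of the trimming rule in `prepare_inputs_for_generation` above (dummy token ids and an assumed cache length): once a cache covering `past_length` tokens exists, only the unprocessed suffix is fed back to the decoder:
```
import torch

input_ids = torch.tensor([[2, 17, 305, 9]])  # tokens decoded so far
past_length = 3                              # cache already covers the first 3

if input_ids.shape[1] > past_length:
    remove_prefix_length = past_length
else:
    # default to the old behaviour: keep only the final token
    remove_prefix_length = input_ids.shape[1] - 1

trimmed = input_ids[:, remove_prefix_length:]
print(trimmed)  # tensor([[9]])
```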
.\models\trocr\processing_trocr.py
"""
Processor class for TrOCR.
"""
import warnings
from contextlib import contextmanager
from ...processing_utils import ProcessorMixin
class TrOCRProcessor(ProcessorMixin):
r"""
Constructs a TrOCR processor which wraps a vision image processor and a TrOCR tokenizer into a single processor.
[`TrOCRProcessor`] offers all the functionalities of [`ViTImageProcessor`/`DeiTImageProcessor`] and
[`RobertaTokenizer`/`XLMRobertaTokenizer`]. See the [`~TrOCRProcessor.__call__`] and [`~TrOCRProcessor.decode`] for
more information.
Args:
image_processor ([`ViTImageProcessor`/`DeiTImageProcessor`], *optional*):
An instance of [`ViTImageProcessor`/`DeiTImageProcessor`]. The image processor is a required input.
tokenizer ([`RobertaTokenizer`/`XLMRobertaTokenizer`], *optional*):
An instance of [`RobertaTokenizer`/`XLMRobertaTokenizer`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
[`~AutoImageProcessor.__call__`] and returns its output. If used in the context
[`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to TrOCRTokenizer's
[`~TrOCRTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
images = kwargs.pop("images", None)
text = kwargs.pop("text", None)
if len(args) > 0:
images = args[0]
args = args[1:]
if images is None and text is None:
raise ValueError("You need to specify either an `images` or `text` input to process.")
if images is not None:
inputs = self.image_processor(images, *args, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif images is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR.
"""
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your images inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.image_processor
self._in_target_context_manager = False
@property
def feature_extractor_class(self):
"""
Warns about deprecation of `feature_extractor_class` and suggests using `image_processor_class`.
"""
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
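A minimal end-to-end sketch of using `TrOCRProcessor` together with `VisionEncoderDecoderModel` for OCR inference; it downloads the `microsoft/trocr-base-handwritten` checkpoint and reuses the sample image URL from the conversion script above:
```
import requests
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02-00.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```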
.\models\trocr\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_speech_available,
is_torch_available,
)
_import_structure = {
"configuration_trocr": ["TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP", "TrOCRConfig"],
"processing_trocr": ["TrOCRProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_trocr"] = [
"TROCR_PRETRAINED_MODEL_ARCHIVE_LIST",
"TrOCRForCausalLM",
"TrOCRPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_trocr import TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP, TrOCRConfig
from .processing_trocr import TrOCRProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_trocr import TROCR_PRETRAINED_MODEL_ARCHIVE_LIST, TrOCRForCausalLM, TrOCRPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\tvlt\configuration_tvlt.py
""" TVLT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"ZinengTang/tvlt-base": "https://huggingface.co/ZinengTang/tvlt-base/blob/main/config.json",
}
class TvltConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the TVLT
[ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import TvltConfig, TvltModel
>>> # Initializing a TVLT ZinengTang/tvlt-base style configuration
>>> configuration = TvltConfig()
>>> # Initializing a model (with random weights) from the ZinengTang/tvlt-base style configuration
>>> model = TvltModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "tvlt"
def __init__(
self,
image_size=224,
spectrogram_length=2048,
frequency_length=128,
image_patch_size=[16, 16],
audio_patch_size=[16, 16],
num_image_channels=3,
num_audio_channels=1,
num_frames=8,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-6,
qkv_bias=True,
use_mean_pooling=False,
decoder_num_attention_heads=16,
decoder_hidden_size=512,
decoder_num_hidden_layers=8,
decoder_intermediate_size=2048,
pixel_mask_ratio=0.75,
audio_mask_ratio=0.15,
audio_mask_type="frame-level",
task_matching=True,
task_mae=True,
loss_type="classification",
**kwargs,
):
"""
Initializes TvltConfig with various parameters to define the TVLT model architecture and behavior.
"""
super().__init__(**kwargs)
self.image_size = image_size
self.spectrogram_length = spectrogram_length
self.frequency_length = frequency_length
self.image_patch_size = image_patch_size
self.audio_patch_size = audio_patch_size
self.num_image_channels = num_image_channels
self.num_audio_channels = num_audio_channels
self.num_frames = num_frames
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.use_mean_pooling = use_mean_pooling
self.decoder_num_attention_heads = decoder_num_attention_heads
self.decoder_hidden_size = decoder_hidden_size
self.decoder_num_hidden_layers = decoder_num_hidden_layers
self.decoder_intermediate_size = decoder_intermediate_size
self.pixel_mask_ratio = pixel_mask_ratio
self.audio_mask_ratio = audio_mask_ratio
self.audio_mask_type = audio_mask_type
self.task_matching = task_matching
self.task_mae = task_mae
self.loss_type = loss_type
self.update(kwargs)
):
super().__init__(**kwargs)
if audio_mask_type not in ("frame-level", "patch_level"):
raise ValueError(
"audio_mask_type must be one of two acceptable strategies - {'frame_level', 'patch-level') "
f"got {audio_mask_type}"
)
self.image_size = image_size
self.spectrogram_length = spectrogram_length
self.frequency_length = frequency_length
self.image_patch_size = image_patch_size
self.audio_patch_size = audio_patch_size
self.num_image_channels = num_image_channels
self.num_audio_channels = num_audio_channels
self.num_frames = num_frames
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.use_mean_pooling = use_mean_pooling
self.decoder_num_attention_heads = decoder_num_attention_heads
self.decoder_hidden_size = decoder_hidden_size
self.decoder_num_hidden_layers = decoder_num_hidden_layers
self.decoder_intermediate_size = decoder_intermediate_size
self.pixel_mask_ratio = pixel_mask_ratio
self.audio_mask_ratio = audio_mask_ratio
self.audio_mask_type = audio_mask_type
self.task_matching = task_matching
self.task_mae = task_mae
self.loss_type = loss_type
.\models\tvlt\feature_extraction_tvlt.py
"""TVLT的特征提取器类。"""
from math import ceil
from typing import List, Optional, Union
import numpy as np
from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class TvltFeatureExtractor(SequenceFeatureExtractor):
r"""
构造一个TVLT音频特征提取器。此特征提取器用于准备模型的音频输入数据。
此特征提取器继承自[`FeatureExtractionMixin`],其中包含大多数主要方法。用户
应参考此超类以获取有关这些方法的更多信息。
Args:
spectrogram_length (`Dict[str, int]` *可选*, 默认为2048):
每个音频频谱图的时间长度。
num_channels (`int` *可选*, 默认为1):
音频通道数。
patch_size (`List[int]` *可选*, 默认为`[16, 16]`):
音频补丁嵌入的补丁大小。
feature_size (`int`, *可选*, 默认为128):
音频频谱图的频率长度。
sampling_rate (`int`, *可选*, 默认为44100):
应数字化音频文件的采样率,以赫兹(Hz)表示。
hop_length_to_sampling_rate (`int`, *可选*, 默认为86):
Hop length是用于获取Mel频率系数的STFT的重叠窗口的长度。
例如,对于采样率44100,跳跃长度为512,即44100 / 512 = 86。
n_fft (`int`, *可选*, 默认为2048):
傅里叶变换的大小。
padding_value (`float`, *可选*, 默认为0.0):
用于填充音频的填充值。应该对应于静音部分。
"""
model_input_names = ["audio_values", "audio_mask"]
def __init__(
self,
spectrogram_length=2048,
num_channels=1,
patch_size=[16, 16],
feature_size=128,
sampling_rate=44100,
hop_length_to_sampling_rate=86,
n_fft=2048,
padding_value=0.0,
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
**kwargs,
)
self.spectrogram_length = spectrogram_length
self.num_channels = num_channels
self.patch_size = patch_size
self.freq_len = feature_size // self.patch_size[1]
self.n_fft = n_fft
self.hop_length = sampling_rate // hop_length_to_sampling_rate
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.mel_filters = mel_filter_bank(
num_frequency_bins=1 + n_fft // 2,
num_mel_filters=feature_size,
min_frequency=0.0,
max_frequency=22050.0,
sampling_rate=sampling_rate,
norm="slaney",
mel_scale="slaney",
).T
def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
"""
Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
implementation with 1e-5 tolerance.
"""
log_spec = spectrogram(
waveform,
window_function(self.n_fft, "hann"),
frame_length=self.n_fft,
hop_length=self.hop_length,
power=2.0,
mel_filters=self.mel_filters.T,
log_mel="dB",
db_range=80.0,
)
log_spec = log_spec[:, :-1]
log_spec = log_spec - 20.0
log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0
return log_spec
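A toy sketch of the normalization at the end of `_np_extract_fbank_features` (the dB values below are made up): the spectrogram is shifted by -20 dB, scaled by 1/40, clipped to [-2, 0] and shifted by +1, mapping the retained dynamic range into [-1, 1]. The `hop_length` arithmetic from the docstring above is shown as well:
```
import numpy as np

# hop_length_to_sampling_rate = 86 at 44100 Hz gives the hop length described above.
hop_length = 44100 // 86
print(hop_length)  # 512

log_spec_db = np.array([-90.0, -60.0, -20.0, 0.0])  # example dB magnitudes
normalized = np.clip((log_spec_db - 20.0) / 40.0, -2.0, 0.0) + 1.0
print(normalized)  # [-1.  -1.   0.   0.5]
```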
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = True,
sampling_rate: Optional[int] = None,
resample: bool = False,
mask_audio: bool = False,
**kwargs,
.\models\tvlt\image_processing_tvlt.py
"""TVLT 的图像处理类。"""
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
def make_batched(videos) -> List[List[ImageInput]]:
"""将输入的视频或图像列表转换为批处理列表形式。
Args:
videos: 输入的视频或图像列表
Returns:
List[List[ImageInput]]: 批处理后的视频或图像列表
Raises:
ValueError: 如果无法从输入中生成批处理视频
"""
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)):
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
videos_dim = np.array(videos[0]).ndim
if videos_dim == 3:
return [videos]
elif videos_dim == 4:
return videos
elif is_valid_image(videos):
videos_dim = np.array(videos).ndim
if videos_dim == 3:
return [[videos]]
elif videos_dim == 4:
return [videos]
elif videos_dim == 5:
return videos
raise ValueError(f"Could not make batched video from {videos}")
class TvltImageProcessor(BaseImageProcessor):
r"""
构造一个 TVLT 图像处理器。
此处理器可用于通过将图像转换为单帧视频来为模型准备视频或图像。
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
`do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
Size of the output image after resizing. The shortest edge of the image will be resized to
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
`size` in the `preprocess` method.
patch_size (`List[int]` *optional*, defaults to [16,16]):
The patch size of image patch embedding.
num_frames (`int` *optional*, defaults to 8):
The maximum number of video frames.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
parameter in the `preprocess` method.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to 1/255):
Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
# 定义模型输入的名称列表,包含四个元素
model_input_names = [
"pixel_values", # 像素数值
"pixel_mask", # 像素掩码
"pixel_values_mixed", # 混合像素数值
"pixel_mask_mixed", # 混合像素掩码
]
# 初始化方法,用于设置图像处理器的各种参数和属性
def __init__(
self,
do_resize: bool = True, # 是否进行图像大小调整,默认为True
size: Dict[str, int] = None, # 图像大小的字典,包含最短边或其他指定尺寸,默认为{"shortest_edge": 224}
patch_size: List[int] = [16, 16], # 图像的分块大小,默认为[16, 16]
num_frames: int = 8, # 处理视频时的帧数,默认为8
resample: PILImageResampling = PILImageResampling.BILINEAR, # 图像重采样方法,默认为双线性插值
do_center_crop: bool = True, # 是否进行中心裁剪,默认为True
crop_size: Dict[str, int] = None, # 裁剪后图像的尺寸,默认为{"height": 224, "width": 224}
do_rescale: bool = True, # 是否进行图像像素值缩放,默认为True
rescale_factor: Union[int, float] = 1 / 255, # 图像像素值缩放因子,默认为1/255
do_normalize: bool = True, # 是否进行图像归一化,默认为True
image_mean: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_MEAN, # 图像归一化均值,默认为ImageNet标准均值
image_std: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_STD, # 图像归一化标准差,默认为ImageNet标准标准差
init_mask_generator=False, # 是否初始化遮罩生成器,默认为False
**kwargs, # 其他可选参数
) -> None:
# 调用父类的初始化方法
super().__init__(**kwargs)
# 如果未提供size参数,则设置默认的size字典
size = size if size is not None else {"shortest_edge": 224}
# 根据提供的size参数获取最终的size字典,保证其含有必要的尺寸信息
size = get_size_dict(size, default_to_square=False)
# 如果未提供crop_size参数,则设置默认的crop_size字典
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
# 根据提供的crop_size参数获取最终的crop_size字典
crop_size = get_size_dict(crop_size, param_name="crop_size")
# 将初始化方法中的各个参数设置为对象的属性
self.do_resize = do_resize
self.size = size
self.patch_size = patch_size
self.num_frames = num_frames
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
# 定义一个包含所有有效处理器键的列表,用于后续验证和使用
self._valid_processor_keys = [
"videos",
"do_resize",
"size",
"patch_size",
"num_frames",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"is_mixed",
"return_tensors",
"data_format",
"input_data_format",
]
# 图像大小调整方法,用于调整输入图像的尺寸
def resize(
self,
image: np.ndarray, # 输入的图像数组
size: Dict[str, int], # 目标图像尺寸的字典
resample: PILImageResampling = PILImageResampling.BILINEAR, # 图像重采样方法,默认为双线性插值
data_format: Optional[Union[str, ChannelDimension]] = None, # 数据格式参数
input_data_format: Optional[Union[str, ChannelDimension]] = None, # 输入数据的格式参数
**kwargs, # 其他可选参数
) -> np.ndarray:
"""
Resize an image.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
shortest edge of length `s` while keeping the aspect ratio of the original image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# 根据 size 获取实际的大小字典,确保不是默认的正方形输出
size = get_size_dict(size, default_to_square=False)
# 如果 size 字典中包含 "shortest_edge" 键
if "shortest_edge" in size:
# 根据最短边长度调整输出图像大小
output_size = get_resize_output_image_size(
image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
)
# 如果 size 字典中包含 "height" 和 "width" 键
elif "height" in size and "width" in size:
# 设置输出大小为指定的高度和宽度
output_size = (size["height"], size["width"])
else:
# 如果 size 字典既不包含 "shortest_edge" 也不同时包含 "height" 和 "width" 键,抛出数值错误
raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
# 调用 resize 函数,返回调整大小后的图像
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def _preprocess_image(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Preprocesses a single image."""
validate_preprocess_arguments(
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_resize=do_resize,
size=size,
resample=resample,
)
# All transformations expect numpy arrays.
image = to_numpy_array(image) # Convert input image to numpy array format
if is_scaled_image(image) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image) # Infer input data format if not provided
if do_resize:
image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) # Resize image if required
if do_center_crop:
image = self.center_crop(image, size=crop_size, input_data_format=input_data_format) # Perform center cropping if specified
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) # Rescale image if specified
if do_normalize:
image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) # Normalize image if specified
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # Convert image to desired channel dimension format
return image
def preprocess(
self,
videos: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
patch_size: List[int] = None,
num_frames: int = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
is_mixed: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\tvlt\modeling_tvlt.py
import collections.abc
import math
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_tvlt import TvltConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TvltConfig"
_CHECKPOINT_FOR_DOC = "ZinengTang/tvlt-base"
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ZinengTang/tvlt-base",
]
@dataclass
class TvltModelOutput(ModelOutput):
"""
Class for TvltModel's outputs, with potential hidden states and attentions.
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层输出的隐藏状态序列。
last_pixel_hidden_state (`torch.FloatTensor` of shape `(batch_size, pixel_sequence_length, hidden_size)`):
模型最后一层输出的像素序列的隐藏状态。
last_audio_hidden_state (`torch.FloatTensor` of shape `(batch_size, audio_sequence_length, hidden_size)`):
模型最后一层输出的音频序列的隐藏状态。
pixel_label_masks (`torch.FloatTensor` of shape `(batch_size, pixel_patch_length)`):
表示哪些像素补丁被掩盖(置为1),哪些未被掩盖(置为0)的张量。
audio_label_masks (`torch.FloatTensor` of shape `(batch_size, audio_patch_length)`):
表示哪些音频补丁被掩盖(置为1),哪些未被掩盖(置为0)的张量。
pixel_ids_restore (`torch.LongTensor` of shape `(batch_size, pixel_patch_length)`):
像素掩盖的id排列顺序的张量。
audio_ids_restore (`torch.LongTensor` of shape `(batch_size, audio_patch_length)`):
音频掩盖的id排列顺序的张量。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组,包含模型每层的隐藏状态张量(嵌入输出和每层的输出),形状为 `(batch_size, sequence_length, hidden_size)`。
当参数 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
元组,包含模型每层的注意力权重张量,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
注意力softmax后的注意力权重,用于计算自注意力头中的加权平均。
当参数 `output_attentions=True` 或 `config.output_attentions=True` 时返回。
"""
# 初始化函数参数,均为None,表示这些参数在调用时可以传入具体的张量数据
last_hidden_state: torch.FloatTensor = None
last_pixel_hidden_state: torch.FloatTensor = None
last_audio_hidden_state: torch.FloatTensor = None
pixel_label_masks: torch.LongTensor = None
audio_label_masks: torch.LongTensor = None
pixel_ids_restore: torch.LongTensor = None
audio_ids_restore: torch.LongTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# TvltDecoderOutput 类用于存储 TvltDecoder 模型的输出结果,可能包含隐藏状态和注意力信息
@dataclass
class TvltDecoderOutput(ModelOutput):
"""
Class for TvltDecoder's outputs, with potential hidden states and attentions.
Args:
logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
Pixel reconstruction logits. 像素重构的逻辑回归输出。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs. 模型每一层输出的隐藏状态,包括初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads. 经过注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
"""
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# TvltForPreTrainingOutput 类用于存储 TvltForPreTraining 模型的输出结果,可能包含隐藏状态和注意力信息
@dataclass
class TvltForPreTrainingOutput(ModelOutput):
"""
Class for TvltForPreTraining's outputs, with potential hidden states and attentions.
Args:
loss (`torch.FloatTensor` of shape `(1,)`):
Pixel reconstruction loss. 像素重构损失。
matching_logits (`torch.FloatTensor` of shape `(batch_size, 1)`):
Matching objective logits. 匹配目标的逻辑回归输出。
pixel_logits (`torch.FloatTensor` of shape
`(batch_size, pixel_patch_length, image_patch_size ** 3 * pixel_num_channels)`): Pixel reconstruction
logits. 像素重构的逻辑回归输出。
audio_logits (`torch.FloatTensor` of shape
`(batch_size, audio_patch_length, image_patch_size[0] * image_patch_size[1])`): Audio reconstruction
logits. 音频重构的逻辑回归输出。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs. 模型每一层输出的隐藏状态,包括初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads. 经过注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
"""
loss: Optional[torch.FloatTensor] = None
# 定义一个变量 matching_logits,类型为 torch 的 FloatTensor,初始值为 None,用于存储匹配 logits
matching_logits: torch.FloatTensor = None
# 定义一个变量 pixel_logits,类型为 torch 的 FloatTensor,初始值为 None,用于存储像素 logits
pixel_logits: torch.FloatTensor = None
# 定义一个变量 audio_logits,类型为 torch 的 FloatTensor,初始值为 None,用于存储音频 logits
audio_logits: torch.FloatTensor = None
# 定义一个变量 hidden_states,类型为可选的元组,元素为 torch 的 FloatTensor,初始值为 None,用于存储隐藏状态
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 定义一个变量 attentions,类型为可选的元组,元素为 torch 的 FloatTensor,初始值为 None,用于存储注意力机制
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Generate noise for pixel masking (MAE-style masking of the video patches).
def generate_pixel_mask_noise(pixel_values, pixel_mask=None, mask_ratio=0.75):
"""Generate noise for pixel masking."""
# 获取批次大小和序列长度
batch_size, seq_len = pixel_values.shape[:2]
# 生成在 [0, 1] 范围内的随机噪声
noise = torch.rand((batch_size, seq_len), device=pixel_values.device) # noise in [0, 1]
# 计算需要保留的序列长度
len_keep = int(seq_len * (1 - mask_ratio))
return noise, len_keep
# 生成用于音频屏蔽的噪声。
def generate_audio_mask_noise(audio_values, audio_mask=None, mask_ratio=0.75, mask_type="patch-level", freq_len=8):
"""Generate noise for audio masking."""
# 获取批次大小和序列长度
batch_size, seq_len = audio_values.shape[:2]
if mask_type == "frame-level":
# 计算帧级别的时间片段数
num_time_patches = seq_len // freq_len
# 生成 [0, 1] 范围内的随机噪声并重复以匹配序列长度
noise = (
torch.rand(batch_size, num_time_patches, device=audio_values.device)
.unsqueeze(-1)
.repeat(1, 1, freq_len)
.view(batch_size, seq_len)
) # noise in [0, 1]
elif mask_type == "patch-level":
# 生成 [0, 1] 范围内的随机噪声
noise = torch.rand(batch_size, seq_len, device=audio_values.device) # noise in [0, 1]
# 计算需要保留的序列长度
len_keep = int(seq_len * (1 - mask_ratio))
return noise, len_keep
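A toy sketch of the `"frame-level"` branch of `generate_audio_mask_noise`: one random value is drawn per time frame and repeated across that frame's `freq_len` patches, so a whole frame is kept or dropped together (the sizes below are illustrative):
```
import torch

batch_size, seq_len, freq_len = 1, 8, 4       # 2 time frames x 4 frequency patches
num_time_patches = seq_len // freq_len
noise = (
    torch.rand(batch_size, num_time_patches)  # one value per time frame
    .unsqueeze(-1)
    .repeat(1, 1, freq_len)                   # repeated across frequency patches
    .view(batch_size, seq_len)
)
print(noise)  # the first 4 values are identical, and so are the last 4
```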
# Random masking by per-sample shuffling; the shuffling order comes from argsorting the random noise.
def random_masking(sequence, noise, len_keep, attention_masks=None):
"""
Perform random masking by per-sample shuffling on frame-level. Per-sample shuffling is done by argsort random
noise. sequence: [batch_size, seq_len, hidden_dim], sequence
"""
batch_size, seq_len, hidden_dim = sequence.shape
# Sort the noise for each sample
ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove
# Indices that restore the original order
ids_restore = torch.argsort(ids_shuffle, dim=1)
# Keep the first subset
ids_keep = ids_shuffle[:, :len_keep]
# Gather the kept patches using the shuffled indices
sequence_masked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, hidden_dim))
# Build the binary mask: 0 means keep, 1 means remove
label_masks = torch.ones([batch_size, seq_len], device=sequence.device)
label_masks[:, :len_keep] = 0
# Unshuffle with ids_restore to get the binary mask in the original order
label_masks = torch.gather(label_masks, dim=1, index=ids_restore)
if attention_masks is not None:
# If attention masks are given, combine them with the binary mask
label_masks *= attention_masks
# Gather the attention masks for the kept patches using ids_keep
attention_masks = torch.gather(attention_masks, dim=1, index=ids_keep)
return sequence_masked, attention_masks, label_masks, ids_restore
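# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# A minimal example of how the three helpers above chain together: draw per-patch
# noise, keep a random 25% of the patches, and get back the binary label mask plus
# the restore indices used later by the decoder. The `patch_embeddings` tensor here
# is a stand-in for the output of the embedding modules.
import torch

batch_size, seq_len, hidden_dim = 2, 16, 8
patch_embeddings = torch.randn(batch_size, seq_len, hidden_dim)

noise, len_keep = generate_pixel_mask_noise(patch_embeddings, mask_ratio=0.75)
masked, attn_masks, label_masks, ids_restore = random_masking(patch_embeddings, noise, len_keep)

print(masked.shape)       # torch.Size([2, 4, 8])  -> only 4 of 16 patches are kept
print(label_masks.sum())  # tensor(24.)            -> 1 marks a removed patch (2 * 12)
print(ids_restore.shape)  # torch.Size([2, 16])    -> used to unshuffle the decoder input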
class TvltPixelEmbeddings(nn.Module):
"""Construct the patch and position embeddings."""
def __init__(self, config):
super().__init__()
# Patch embeddings for the pixel branch
self.patch_embeddings = TvltPixelPatchEmbeddings(config)
self.num_patches_per_image = self.patch_embeddings.num_patches_per_image
# Type embedding, temporal embedding and positional embedding parameters
self.type_embed_v = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
self.pos_embed_v = nn.Parameter(torch.zeros(1, self.num_patches_per_image, config.hidden_size))
self.config = config
def forward(self, pixel_values, attention_masks=None):
# Forward pass of the pixel embeddings
# Read the input tensor dimensions
batch_size, num_frames, num_channels, height, width = pixel_values.shape
# Turn the pixel values into patch embeddings
embeddings = self.patch_embeddings(pixel_values)
# Add the positional embeddings, repeated num_frames times so every frame is covered
embeddings += self.pos_embed_v.repeat(1, num_frames, 1)
# Repeat each temporal embedding num_patches_per_image times so every patch of a frame gets the same value
embeddings += torch.repeat_interleave(self.temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1)
# Add the type embedding that marks these tokens as visual tokens
embeddings += self.type_embed_v
# Return the embeddings and the (optional) attention masks
return embeddings, attention_masks
class TvltAudioEmbeddings(nn.Module):
"""Construct the patch and position embeddings."""
def __init__(self, config):
super().__init__()
# Patch embeddings for the audio branch
self.patch_embeddings = TvltAudioPatchEmbeddings(config)
# Total number of audio patches
self.num_patches = self.patch_embeddings.num_patches
# Type embedding for audio tokens
self.type_embed_a = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
# Number of patches along the frequency axis
self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
# Positional embedding along the time axis
self.pos_embed_a = nn.Parameter(torch.zeros(1, self.num_patches // self.num_freq_patches, config.hidden_size))
# Embedding along the frequency axis
self.freq_embed = nn.Parameter(torch.zeros(1, self.num_freq_patches, config.hidden_size))
# Recompute the number of frequency patches (same value as above)
self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
# Store the configuration
self.config = config
def forward(self, audio_values, attention_masks=None):
# Create the patch embeddings
embeddings = self.patch_embeddings(audio_values)
# Number of patches along the time axis
num_time_patches = embeddings.size(1) // self.num_freq_patches
# Add the frequency embeddings, repeated for every time patch
embeddings += self.freq_embed.repeat(1, num_time_patches, 1)
# Add the positional (time) embeddings, repeated for every frequency patch
embeddings += torch.repeat_interleave(self.pos_embed_a[:, :num_time_patches], self.num_freq_patches, dim=1)
# Add the type embedding that marks these tokens as audio tokens
embeddings += self.type_embed_a
# Return the embeddings and the (optional) attention masks
return embeddings, attention_masks
class TvltPixelPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
# Image size, patch size, number of channels and hidden size from the config
image_size, patch_size = config.image_size, config.image_patch_size
num_channels, hidden_size = config.num_image_channels, config.hidden_size
# Make sure image size and patch size are tuples
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# Number of patches per image
num_patches_per_image = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
# Store the initialization parameters
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches_per_image = num_patches_per_image
self.hidden_size = hidden_size
# Convolutional projection from pixels to patch embeddings
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
# Forward pass that turns pixel values into patch embeddings
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
# Read batch size, number of frames, channels, height and width from the input
batch_size, num_frames, num_channels, height, width = pixel_values.shape
# Check that the channel dimension matches the configuration, otherwise raise a ValueError
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
# Check that the input height and width match the configured image size, otherwise raise a ValueError
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
# Reshape the pixel values to (batch_size * num_frames, num_channels, height, width)
pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width)
# Project the pixels, then flatten the spatial dimensions and transpose
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
# Reshape to (batch_size, num_frames * self.num_patches_per_image, self.hidden_size)
embeddings = embeddings.reshape(batch_size, num_frames * self.num_patches_per_image, self.hidden_size)
# Return the patch embeddings
return embeddings
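# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Back-of-the-envelope check of the sequence length produced by TvltPixelPatchEmbeddings.
# The numbers assume a 224x224 input, 16x16 patches and 8 frames, which may differ from
# the values in your TvltConfig.
image_size, patch_size, num_frames, hidden_size = 224, 16, 8, 768

num_patches_per_image = (image_size // patch_size) ** 2  # 14 * 14 = 196
seq_length = num_frames * num_patches_per_image          # 8 * 196 = 1568

# pixel_values: (batch, 8, 3, 224, 224)  ->  embeddings: (batch, 1568, 768)
print(num_patches_per_image, seq_length)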
class TvltAudioPatchEmbeddings(nn.Module):
"""
This class turns `audio_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
# Spectrogram length, frequency length and patch size from the config
spectrogram_length, frequency_length, patch_size = (
config.spectrogram_length,
config.frequency_length,
config.audio_patch_size,
)
# Number of audio channels and hidden size from the config
num_channels, hidden_size = config.num_audio_channels, config.hidden_size
# Spectrogram size as a tuple (spectrogram_length, frequency_length)
spectrogram_size = (spectrogram_length, frequency_length)
# Make sure the patch size is a tuple
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# Total number of patches
num_patches = (spectrogram_size[1] // patch_size[1]) * (spectrogram_size[0] // patch_size[0])
# Patch grid shape (time patches, frequency patches)
patch_shape = (spectrogram_size[0] // patch_size[0], spectrogram_size[1] // patch_size[1])
# Store spectrogram size, patch size, number of channels, number of patches and patch shape
self.spectrogram_size = spectrogram_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.patch_shape = patch_shape
# Convolutional projection from the spectrogram to patch embeddings
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, audio_values: torch.Tensor) -> torch.Tensor:
# Read the input shape (batch_size, num_channels, height, width)
batch_size, num_channels, height, width = audio_values.shape
# Raise a ValueError if the channel dimension does not match the configuration
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the audio values match with the one set in the configuration."
)
# Raise a ValueError if the input height exceeds the configured spectrogram length or the width differs from the frequency length
if height > self.spectrogram_size[0] or width != self.spectrogram_size[1]:
raise ValueError(
f"Input audio size ({height}*{width}) doesn't match model"
f" ({self.spectrogram_size[0]}*{self.spectrogram_size[1]})."
)
# Project the audio values, then flatten to shape (batch_size, seq_length, hidden_size)
embeddings = self.projection(audio_values).flatten(2).transpose(1, 2)
# Return the patch embeddings
return embeddings
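# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Analogous shape check for the audio branch. The numbers assume a 2048x128
# spectrogram and 16x16 patches; adjust them to your TvltConfig.
spectrogram_length, frequency_length, audio_patch = 2048, 128, 16

num_freq_patches = frequency_length // audio_patch    # 128 / 16 = 8
num_time_patches = spectrogram_length // audio_patch  # 2048 / 16 = 128
num_patches = num_freq_patches * num_time_patches     # 1024

# audio_values: (batch, 1, height <= 2048, 128)  ->  embeddings: (batch, (height // 16) * 8, hidden_size)
print(num_freq_patches, num_patches)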
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfAttention with Vilt->Tvlt
class TvltSelfAttention(nn.Module):
# Constructor taking a configuration object
def __init__(self, config):
# Call the parent constructor
super().__init__()
# Check that the hidden size is divisible by the number of attention heads (unless an embedding_size attribute exists)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
# Otherwise raise a ValueError
raise ValueError(
f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
# Number of attention heads and size of each head
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
# Linear layers for query, key and value, with optional bias
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
# Dropout applied to the attention probabilities
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
# Reshape the input tensor x so attention scores can be computed per head
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
# Forward pass taking hidden states, an attention mask, a head mask and an output_attentions flag
def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
# Project the hidden states to queries
mixed_query_layer = self.query(hidden_states)
# Project to keys and values and reshape them per head
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
# Dot product of queries and keys gives the raw attention scores
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask if one is given
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities
attention_probs = nn.Softmax(dim=-1)(attention_scores)
# Apply dropout to the attention probabilities
attention_probs = self.dropout(attention_probs)
# Apply the head mask if one is given
if head_mask is not None:
attention_probs = attention_probs * head_mask
# Weighted sum of the value vectors gives the context
context_layer = torch.matmul(attention_probs, value_layer)
# Rearrange the context back to [batch_size, seq_length, all_head_size]
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
# Return the context, and the attention probabilities when requested
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
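# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# What `transpose_for_scores` does to the shapes, assuming 12 heads of size 64
# (i.e. a hidden size of 768).
import torch

batch, seq_len, num_heads, head_size = 2, 10, 12, 64
x = torch.randn(batch, seq_len, num_heads * head_size)
x = x.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 12, 10, 64]) -> attention scores become (2, 12, 10, 10)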
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfOutput with Vilt->Tvlt
class TvltSelfOutput(nn.Module):
"""
The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: TvltConfig) -> None:
super().__init__()
# Dense layer mapping hidden_size to hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Dropout layer that zeroes inputs with probability config.hidden_dropout_prob
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Apply the dense layer to the hidden states
hidden_states = self.dense(hidden_states)
# Apply dropout to the dense output
hidden_states = self.dropout(hidden_states)
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltAttention with Vilt->Tvlt
class TvltAttention(nn.Module):
def __init__(self, config):
super().__init__()
# Instantiate TvltSelfAttention and TvltSelfOutput
self.attention = TvltSelfAttention(config)
self.output = TvltSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
# Nothing to do if the list of heads is empty
if len(heads) == 0:
return
# Find the prunable heads and the corresponding indices
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
# Prune the linear layers of the attention and output modules
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update the hyperparameters and remember the pruned heads
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
# Run TvltSelfAttention on the hidden states
self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
# Run TvltSelfOutput on the attention output and the original hidden states
attention_output = self.output(self_outputs[0], hidden_states)
# Append the attention weights to the outputs when they are requested
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltIntermediate with Vilt->Tvlt
class TvltIntermediate(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
# Dense layer mapping hidden_size to intermediate_size
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
# Select the hidden activation function from the config
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# Forward pass: linear projection followed by the activation function
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Linear transformation of the hidden states
hidden_states = self.dense(hidden_states)
# Apply the intermediate activation function
hidden_states = self.intermediate_act_fn(hidden_states)
# Return the transformed hidden states
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltOutput with Vilt->Tvlt
class TvltOutput(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
# Dense layer mapping intermediate_size back to hidden_size
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
# Dropout layer used to regularize during training
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Linear transformation of the hidden states
hidden_states = self.dense(hidden_states)
# Apply dropout
hidden_states = self.dropout(hidden_states)
# Residual connection: add the input tensor back to the transformed hidden states
hidden_states = hidden_states + input_tensor
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltLayer with Vilt->Tvlt
class TvltLayer(nn.Module):
"""This corresponds to the Block class in the timm implementation."""
def __init__(self, config):
super().__init__()
# Set up the parameters needed by TvltLayer
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
# Attention block
self.attention = TvltAttention(config)
# Intermediate (feed-forward) block
self.intermediate = TvltIntermediate(config)
# Output block
self.output = TvltOutput(config)
# Layer norm applied before the attention block
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Layer norm applied after the first residual connection
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
# As in ViLT, layernorm is applied before self-attention
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states),
attention_mask,
head_mask,
output_attentions=output_attentions,
)
# Output of the self-attention block
attention_output = self_attention_outputs[0]
# Keep the attention weights if they were requested
outputs = self_attention_outputs[1:]
# First residual connection: add the attention output to the original hidden states
hidden_states = attention_output + hidden_states.to(attention_output.device)
# As in ViLT, layernorm is also applied after the first residual connection
layer_output = self.layernorm_after(hidden_states)
# Feed the normalized output through the intermediate block
layer_output = self.intermediate(layer_output)
# Second residual connection, handled inside TvltOutput
layer_output = self.output(layer_output, hidden_states)
# Return the layer output together with the optional attention weights
outputs = (layer_output,) + outputs
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltEncoder with Vilt->Tvlt
class TvltEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# Build config.num_hidden_layers TvltLayer modules
self.layer = nn.ModuleList([TvltLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
# Collect the hidden states only when output_hidden_states is True
all_hidden_states = () if output_hidden_states else None
# Collect the attention weights only when output_attentions is True
all_self_attentions = () if output_attentions else None
# Iterate over all Transformer layers
for i, layer_module in enumerate(self.layer):
# Record the current hidden states before the layer if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Head mask for the current layer
layer_head_mask = head_mask[i] if head_mask is not None else None
# When gradient checkpointing is enabled and the model is training
if self.gradient_checkpointing and self.training:
# Run the layer through the gradient checkpointing helper
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
)
else:
# Regular forward pass through the layer
layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
# The layer output becomes the hidden states for the next layer
hidden_states = layer_outputs[0]
# Record the attention weights of this layer if requested
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# Record the final hidden states if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# When return_dict is False, return a tuple of the non-None values
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# Otherwise wrap the results in a BaseModelOutput
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
# Docstring describing the model inputs and their expected shapes
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
audio_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Audio values. Audio values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
pixel_mask (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`):
Pixel masks. Pixel masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
audio_mask (`torch.FloatTensor` of shape `(batch_size, num_audio_patches)`):
Audio masks. Audio masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values that mix positive and negative samples in Tvlt vision-audio matching. They can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel masks of `pixel_values_mixed`. They can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
mask_pixel (`bool`, *optional*):
Whether to mask pixel values for the MAE task. Only set to True in `TvltForPreTraining`.
mask_audio (`bool`, *optional*):
Whether to mask audio values for the MAE task. Only set to True in `TvltForPreTraining`.
output_attentions (`bool`, *optional*):
Whether or not to return the attention tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
Defines the TvltModel class, inheriting from TvltPreTrainedModel, which implements the core TVLT model.
It is a bare Transformer for TVLT tasks that returns raw hidden states without any task-specific head on top.
@param config: the model configuration object holding all hyperparameters
The constructor sets up the model components:
self.pixel_embeddings = TvltPixelEmbeddings(config)
self.audio_embeddings = TvltAudioEmbeddings(config)
self.encoder = TvltEncoder(config)
self.cls_embedding = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
if config.use_mean_pooling:
self.layernorm = None
else:
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
post_init is then called to initialize the weights and run the final processing.
get_input_embeddings returns the patch embedding modules of the pixel and audio embeddings.
_prune_heads prunes attention heads of the model.
@param heads_to_prune: dict of {layer_num: list of heads to prune in this layer}; see the PreTrainedModel base class
The forward method takes pixel values and audio values (plus optional masks and flags) and returns a TvltModelOutput.
@param pixel_values: input tensor of pixel values
@param audio_values: input tensor of audio values
@param pixel_mask: optional pixel mask tensor
@param audio_mask: optional audio mask tensor
@param mask_pixel: whether to mask the pixel values
@param mask_audio: whether to mask the audio values
@param output_attentions: whether to return the attention weights
@param output_hidden_states: whether to return the hidden states
@param return_dict: whether to return the output as a ModelOutput dictionary
@return: a TvltModelOutput object with the results of the forward pass
Defines the TvltDecoder class (an nn.Module), the decoder part of the TVLT model.
@param config: the model configuration object holding the decoder hyperparameters
The constructor builds the list of decoder layers and the final layer norm:
self.decoder_layers = nn.ModuleList([TvltLayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)])
self.layernorm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)
Gradient checkpointing is set to False and the configuration is stored.
"""
):
# Collect all hidden states only when output_hidden_states is True, otherwise keep None
all_hidden_states = () if output_hidden_states else None
# Collect all self-attention weights only when output_attentions is True, otherwise keep None
all_self_attentions = () if output_attentions else None
# Iterate over the decoder layers
for i, layer_module in enumerate(self.decoder_layers):
# Record the current hidden states before the layer if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Use the gradient checkpointing helper when it is enabled and the model is training
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
None,
output_attentions,
)
else:
# Otherwise call the layer directly
layer_outputs = layer_module(hidden_states, output_attentions=output_attentions)
# The first element of the layer output becomes the input of the next layer
hidden_states = layer_outputs[0]
# Record the attention weights of this layer if requested
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# Record the final hidden states if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Apply the final layer norm to obtain the prediction logits
logits = self.layernorm(hidden_states)
# When return_dict is False, return a tuple of the non-None values
if not return_dict:
return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
# Otherwise wrap the results in a TvltDecoderOutput
return TvltDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)
# Docstring decorator describing the purpose of this class
@add_start_docstrings(
"The TVLT Model transformer with the decoder on top for self-supervised pre-training.",
TVLT_START_DOCSTRING,
)
# TvltForPreTraining inherits from TvltPreTrainedModel
class TvltForPreTraining(TvltPreTrainedModel):
def __init__(self, config):
# Call the parent constructor
super().__init__(config)
# Store the configuration
self.config = config
# Read the matching-task and MAE-task flags from the config
self.task_matching = config.task_matching
self.task_mae = config.task_mae
# Raise a ValueError if neither the matching task nor the MAE task is enabled
if not (self.task_matching or self.task_mae):
raise ValueError("Must set at least one of matching task and MAE task to true")
# Core TVLT model
self.tvlt = TvltModel(config)
# Matching head, only when the matching task is enabled
if self.task_matching:
self.matching_head = TvltMatchingHead(config)
# The following components are only needed for the MAE task
if self.task_mae:
# Linear projection from the encoder hidden size to the decoder hidden size
self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=True)
# Learned mask tokens for pixel and audio patches
self.pixel_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
self.audio_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
# TVLT decoder
self.decoder = TvltDecoder(config)
# Decoder hidden size from the config
decoder_hidden_size = config.decoder_hidden_size
# Decoder position embeddings for the pixel branch, sized from the pixel embeddings of the TVLT model
num_frames = config.num_frames
num_patches_per_image = self.tvlt.pixel_embeddings.num_patches_per_image
self.decoder_pixel_pos_embed = nn.Parameter(torch.zeros(1, num_patches_per_image, decoder_hidden_size))
self.decoder_temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, decoder_hidden_size))
self.decoder_pixel_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
# Decoder position embeddings for the audio branch, sized from the audio embeddings of the TVLT model
num_audio_patches = self.tvlt.audio_embeddings.num_patches
num_freq_patches = config.frequency_length // config.audio_patch_size[1]
self.decoder_audio_pos_embed = nn.Parameter(
torch.zeros(1, num_audio_patches // num_freq_patches, decoder_hidden_size)
)
self.decoder_freq_embed = nn.Parameter(torch.zeros(1, num_freq_patches, decoder_hidden_size))
self.decoder_audio_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
# MAE heads for pixels and audio
pixel_mae_output_dim = self.config.image_patch_size[0] ** 2 * self.config.num_image_channels
self.pixel_mae_head = TvltMAEHead(config, pixel_mae_output_dim)
audio_mae_output_dim = (
self.config.audio_patch_size[0] * self.config.audio_patch_size[1] * self.config.num_audio_channels
)
self.audio_mae_head = TvltMAEHead(config, audio_mae_output_dim)
# Store decoder-related sizes
self.num_frames = num_frames
self.num_patches_per_image = num_patches_per_image
self.num_freq_patches = num_freq_patches
self.image_patch_size = config.image_patch_size
self.audio_patch_size = config.audio_patch_size
# Initialize the weights and run the final processing
self.post_init()
# Split the input pixel values into patches of the configured image patch size
def patchify_pixel(self, pixel_values):
"""
pixel_values: [batch_size, num_frames, 3, height, width]
"""
# Read the dimensions of the input tensor
batch_size, num_frames, num_channels, height, width = pixel_values.shape
# Number of patches along the height and width
num_patches_height = pixel_values.shape[3] // self.image_patch_size[0]
num_patches_width = pixel_values.shape[4] // self.image_patch_size[1]
# Reshape the pixel values into the patch grid
patchified_pixel_values = pixel_values.reshape(
shape=(
batch_size,
num_frames,
num_channels,
num_patches_height,
self.image_patch_size[0],
num_patches_width,
self.image_patch_size[1],
)
)
# Rearrange the dimensions with einsum so each patch becomes contiguous
patchified_pixel_values = torch.einsum("ntchpwq->nthwpqc", patchified_pixel_values)
# Flatten the patch grid into a sequence of patches
patchified_pixel_values = patchified_pixel_values.reshape(
shape=(
batch_size,
num_patches_height * num_patches_width * num_frames,
self.image_patch_size[0] * self.image_patch_size[1] * num_channels,
)
)
return patchified_pixel_values
# Split the input audio values into patches of the configured audio patch size
def patchify_audio(self, audio_values):
"""
audio_values: [batch_size, 1, height, width]
"""
# Read the dimensions of the input tensor
batch_size, num_channels, height, width = audio_values.shape
# Number of patches along the height and width
num_patches_height = height // self.audio_patch_size[0]
num_patches_width = width // self.audio_patch_size[1]
# Reshape the audio values into the patch grid
patchified_audio_values = audio_values.reshape(
shape=(
batch_size,
num_channels,
num_patches_height,
self.audio_patch_size[0],
num_patches_width,
self.audio_patch_size[1],
)
)
# Rearrange the dimensions with einsum so each patch becomes contiguous
patchified_audio_values = torch.einsum("nchpwq->nhwpqc", patchified_audio_values)
# Flatten the patch grid into a sequence of patches
patchified_audio_values = patchified_audio_values.reshape(
shape=(
batch_size,
num_patches_height * num_patches_width,
self.audio_patch_size[0] * self.audio_patch_size[1] * num_channels,
)
)
return patchified_audio_values
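# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Stand-alone version of the `patchify_pixel` rearrangement above, to make the einsum
# explicit (toy sizes; `patchify_audio` works the same way without the frame dimension).
import torch

batch, frames, channels, height, width, p = 2, 8, 3, 224, 224, 16
pixel_values = torch.randn(batch, frames, channels, height, width)

h, w = height // p, width // p
x = pixel_values.reshape(batch, frames, channels, h, p, w, p)
x = torch.einsum("ntchpwq->nthwpqc", x)
patches = x.reshape(batch, frames * h * w, p * p * channels)
print(patches.shape)  # torch.Size([2, 1568, 768])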
# Mean-squared-error loss between the pixel predictions and the target pixels
def pixel_mae_loss(self, pixel_values, pixel_predictions, mask):
# Patchify the target pixel values
patchified_pixel_values = self.patchify_pixel(pixel_values)
# Squared difference between predictions and target patches
loss = (pixel_predictions - patchified_pixel_values) ** 2
# Mean loss per patch
loss = loss.mean(dim=-1) # [batch_size, pixel_patch_length], mean loss per patch
# Mean loss over the removed (masked) patches only
loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
return loss
# Mean-squared-error loss between the audio predictions and the target audio values
def audio_mae_loss(self, audio_values, audio_predictions, mask):
# Patchify the target audio values
patchified_audio_values = self.patchify_audio(audio_values)
# Squared difference between predictions and target patches
loss = (audio_predictions - patchified_audio_values) ** 2
# Mean loss per patch
loss = loss.mean(dim=-1) # [batch_size, audio_patch_length], mean loss per patch
# Mean loss over the removed (masked) patches only
loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
return loss
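# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Minimal numeric example of the masked MSE used by both losses above: the error is
# averaged per patch and then only over the patches that were masked out (mask == 1).
import torch

predictions = torch.tensor([[[1.0, 1.0], [0.0, 0.0], [2.0, 2.0]]])  # (1, 3 patches, 2 dims)
targets = torch.tensor([[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]])
mask = torch.tensor([[1.0, 0.0, 1.0]])                               # patches 0 and 2 were removed

loss = ((predictions - targets) ** 2).mean(dim=-1)  # per-patch loss: [[1.0, 0.0, 4.0]]
loss = (loss * mask).sum() / mask.sum()             # (1.0 + 4.0) / 2 = 2.5
print(loss)  # tensor(2.5000)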
# Append mask tokens to the encoded sequence and restore the original patch order
def concatenate_mask(self, mask_token, sequence, ids_restore):
# Batch size, kept sequence length and hidden dimension
batch_size, seq_length, dim = sequence.shape
# Repeat the mask token so the padded sequence matches the full (restored) length
mask_tokens = mask_token.repeat(batch_size, ids_restore.shape[1] - seq_length, 1)
# Concatenate the mask tokens at the end of the sequence
padded_sequence = torch.cat([sequence, mask_tokens], dim=1)
# Reorder with ids_restore to recover the original patch order
padded_sequence = torch.gather(
padded_sequence, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, dim)
) # unshuffle
# Return the unshuffled sequence
return padded_sequence
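# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# How `concatenate_mask` undoes the shuffling from `random_masking`: the kept patches
# come first, mask tokens are appended, and `ids_restore` puts everything back in the
# original order (hypothetical tensors; in the model the mask token is a learned parameter).
import torch

batch, kept, dim = 1, 2, 4
ids_restore = torch.tensor([[2, 0, 3, 1]])  # from random_masking; 4 patches in total
encoded = torch.randn(batch, kept, dim)     # decoder input for the 2 kept patches
mask_token = torch.zeros(1, 1, dim)

mask_tokens = mask_token.repeat(batch, ids_restore.shape[1] - kept, 1)
padded = torch.cat([encoded, mask_tokens], dim=1)  # (1, 4, 4): kept patches first, then mask tokens
padded = torch.gather(padded, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, dim))
print(padded.shape)  # torch.Size([1, 4, 4]) -- patches are back in their original positions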
# Forward method of the pre-training model; the decorators below attach the input and return docstrings to it
@add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TvltForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
audio_values: torch.FloatTensor,
pixel_mask: Optional[torch.FloatTensor] = None,
audio_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
pixel_values_mixed: Optional[torch.FloatTensor] = None,
pixel_mask_mixed: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# TVLT model with a classification head on top, for audiovisual tasks such as CMU-MOSEI sentiment analysis and audio-to-video retrieval
@add_start_docstrings(
"""
Tvlt Model transformer with a classifier head on top (an MLP on top of the final hidden state of the [CLS] token)
for audiovisual classification tasks, e.g. CMU-MOSEI Sentiment Analysis and Audio to Video Retrieval.
""",
TVLT_START_DOCSTRING,
)
class TvltForAudioVisualClassification(TvltPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Core TVLT Transformer model
self.tvlt = TvltModel(config)
# Classification head
self.classifier = nn.Sequential(
nn.Linear(config.hidden_size, config.hidden_size * 2),  # linear layer that doubles the hidden size
nn.LayerNorm(config.hidden_size * 2, eps=config.layer_norm_eps),  # layer norm
nn.GELU(),  # GELU activation
nn.Linear(config.hidden_size * 2, config.num_labels),  # linear layer projecting to the number of labels
)
self.config = config
# Initialize the weights and run the final processing
self.post_init()
@add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
audio_values: torch.FloatTensor,
pixel_mask: Optional[torch.FloatTensor] = None,
audio_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
) -> Union[Tuple[torch.FloatTensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
Labels for computing the audiovisual loss. Indices should be in `[0, ..., num_classes-1]` where num_classes
refers to the number of classes in audiovisual tasks.
Return:
Examples:
```
>>> from transformers import TvltProcessor, TvltForAudioVisualClassification
>>> import numpy as np
>>> import torch
>>> num_frames = 8
>>> images = list(np.random.randn(num_frames, 3, 224, 224))
>>> audio = list(np.random.randn(10000))
>>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
>>> model = TvltForAudioVisualClassification.from_pretrained("ZinengTang/tvlt-base")
>>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")
>>> outputs = model(**input_dict)
>>> loss = outputs.loss
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # fall back to the config default when return_dict is not given
outputs = self.tvlt(
pixel_values,
audio_values,
pixel_mask=pixel_mask,
audio_mask=audio_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)  # forward pass through the TVLT model
sequence_output = outputs[0][:, 0]  # hidden state of the first ([CLS]) token
logits = self.classifier(sequence_output)  # classification logits from the classifier head
loss = None
if labels is not None:
if self.config.loss_type == "regression":  # regression loss
loss_fct = MSELoss()
loss = loss_fct(logits, labels)  # mean squared error
elif self.config.loss_type == "classification":  # classification loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits, labels)  # cross entropy
if not return_dict:
output = (logits,) + outputs[4:]  # tuple output when return_dict is False
return ((loss,) + output) if loss is not None else output  # prepend the loss when it is available
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)  # SequenceClassifierOutput with loss, logits, hidden states and attentions
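# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Continuing the usage example from the docstring above: passing labels makes the model
# return a loss. The expected label shape depends on `config.loss_type` -- class indices
# of shape (batch_size,) for "classification" (CrossEntropyLoss), float targets of shape
# (batch_size, num_labels) for "regression" (MSELoss). The `model` and `input_dict`
# objects are the ones built in that example.
import torch

labels = torch.tensor([1])  # one class index per sample, assuming loss_type == "classification"
outputs = model(**input_dict, labels=labels)
print(outputs.loss, outputs.logits.shape)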