Transformers Source Code Analysis (104)
.\models\siglip\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_siglip": [
"SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
"SiglipConfig",
"SiglipTextConfig",
"SiglipVisionConfig",
],
"processing_siglip": ["SiglipProcessor"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_siglip"] = ["SiglipTokenizer"]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_siglip"] = ["SiglipImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_siglip"] = [
"SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"SiglipModel",
"SiglipPreTrainedModel",
"SiglipTextModel",
"SiglipVisionModel",
"SiglipForImageClassification",
]
if TYPE_CHECKING:
from .configuration_siglip import (
SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
SiglipConfig,
SiglipTextConfig,
SiglipVisionConfig,
)
from .processing_siglip import SiglipProcessor
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_siglip import SiglipTokenizer
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_siglip import SiglipImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_siglip import (
SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
SiglipForImageClassification,
SiglipModel,
SiglipPreTrainedModel,
SiglipTextModel,
SiglipVisionModel,
)
else:
import sys
Imports the sys module, which provides access to the Python interpreter's runtime environment and variables.
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
Registers the current module as a lazily loaded module: the module object identified by __name__ is replaced with a custom _LazyModule instance, which receives the module name, its file path (via globals()["__file__"]), the import structure (_import_structure), and the module spec (module_spec=__spec__) as arguments.
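To see what the lazy registration buys in practice, here is a minimal sketch (assuming a transformers installation that ships SigLIP): importing the package only loads the `_LazyModule` stub, and heavy submodules such as `configuration_siglip` are imported the first time one of their attributes is accessed.

```python
# Minimal sketch of the lazy-loading behavior from the user's side.
# Assumes transformers with SigLIP support is installed.
import transformers.models.siglip as siglip_pkg

# Only the _LazyModule stub has been loaded so far; configuration_siglip,
# modeling_siglip, etc. are not yet imported.
SiglipConfig = siglip_pkg.SiglipConfig  # attribute access triggers the real submodule import

config = SiglipConfig()         # default SigLIP configuration
print(type(config).__name__)    # "SiglipConfig"
```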
.\models\speecht5\configuration_speecht5.py
import functools
import operator
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SPEECHT5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/speecht5_asr": "https://huggingface.co/microsoft/speecht5_asr/resolve/main/config.json",
"microsoft/speecht5_tts": "https://huggingface.co/microsoft/speecht5_tts/resolve/main/config.json",
"microsoft/speecht5_vc": "https://huggingface.co/microsoft/speecht5_vc/resolve/main/config.json",
}
SPEECHT5_PRETRAINED_HIFIGAN_CONFIG_ARCHIVE_MAP = {
"microsoft/speecht5_hifigan": "https://huggingface.co/microsoft/speecht5_hifigan/resolve/main/config.json",
}
class SpeechT5Config(PretrainedConfig):
model_type = "speecht5"
attribute_map = {"num_attention_heads": "encoder_attention_heads", "num_hidden_layers": "encoder_layers"}
def __init__(
self,
vocab_size=81,
hidden_size=768,
encoder_layers=12,
encoder_attention_heads=12,
encoder_ffn_dim=3072,
encoder_layerdrop=0.1,
decoder_layers=6,
decoder_ffn_dim=3072,
decoder_attention_heads=12,
decoder_layerdrop=0.1,
hidden_act="gelu",
positional_dropout=0.1,
hidden_dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
scale_embedding=False,
feat_extract_norm="group",
feat_proj_dropout=0.0,
feat_extract_activation="gelu",
conv_dim=(512, 512, 512, 512, 512, 512, 512),
conv_stride=(5, 2, 2, 2, 2, 2, 2),
conv_kernel=(10, 3, 3, 3, 3, 2, 2),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
decoder_start_token_id=2,
num_mel_bins=80,
speech_decoder_prenet_layers=2,
speech_decoder_prenet_units=256,
speech_decoder_prenet_dropout=0.5,
speaker_embedding_dim=512,
speech_decoder_postnet_layers=5,
speech_decoder_postnet_units=256,
speech_decoder_postnet_kernel=5,
speech_decoder_postnet_dropout=0.5,
reduction_factor=2,
max_speech_positions=4000,
max_text_positions=450,
encoder_max_relative_position=160,
use_guided_attention_loss=True,
guided_attention_loss_num_heads=2,
guided_attention_loss_sigma=0.4,
guided_attention_loss_scale=10.0,
use_cache=True,
is_encoder_decoder=True,
**kwargs,
):
@property
def inputs_to_logits_ratio(self):
return functools.reduce(operator.mul, self.conv_stride, 1)
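`inputs_to_logits_ratio` is simply the product of the feature-encoder strides. With the default `conv_stride=(5, 2, 2, 2, 2, 2, 2)` shown above, that product is 320, i.e. one encoder frame per 320 waveform samples (20 ms at 16 kHz). A stand-alone check:

```python
import functools
import operator

conv_stride = (5, 2, 2, 2, 2, 2, 2)  # default strides from SpeechT5Config
ratio = functools.reduce(operator.mul, conv_stride, 1)
print(ratio)                 # 320
print(ratio / 16000 * 1000)  # 20.0 ms of audio per encoder frame at 16 kHz
```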
class SpeechT5HifiGanConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SpeechT5HifiGanModel`]. It is used to instantiate
a SpeechT5 HiFi-GAN vocoder model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the SpeechT5
[microsoft/speecht5_hifigan](https://huggingface.co/microsoft/speecht5_hifigan) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_in_dim (`int`, *optional*, defaults to 80):
The number of frequency bins in the input log-mel spectrogram.
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the output audio will be generated, expressed in hertz (Hz).
upsample_initial_channel (`int`, *optional*, defaults to 512):
The number of input channels into the upsampling network.
upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[4, 4, 4, 4]`):
A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
length of *upsample_rates* defines the number of convolutional layers and has to match the length of
*upsample_kernel_sizes*.
upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 8, 8]`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
*upsample_rates*.
resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
fusion (MRF) module.
resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
multi-receptive field fusion (MRF) module.
initializer_range (`float`, *optional*, defaults to 0.01):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
leaky_relu_slope (`float`, *optional*, defaults to 0.1):
The angle of the negative slope used by the leaky ReLU activation.
normalize_before (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.
Example: see the instantiation sketch after this class definition.
"""
# The model type is "hifigan"
model_type = "hifigan"
# Initialize the configuration with the vocoder hyperparameters
def __init__(
self,
model_in_dim=80,  # input dimension (number of mel bins), defaults to 80
sampling_rate=16000,  # sampling rate, defaults to 16000
upsample_initial_channel=512,  # initial channel count of the upsampling network, defaults to 512
upsample_rates=[4, 4, 4, 4],  # upsampling strides, default [4, 4, 4, 4]
upsample_kernel_sizes=[8, 8, 8, 8],  # upsampling kernel sizes, default [8, 8, 8, 8]
resblock_kernel_sizes=[3, 7, 11],  # ResBlock kernel sizes, default [3, 7, 11]
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],  # ResBlock dilation rates, default [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
initializer_range=0.01,  # initializer standard deviation, defaults to 0.01
leaky_relu_slope=0.1,  # negative slope of the leaky ReLU, defaults to 0.1
normalize_before=True,  # whether to normalize the spectrogram before vocoding, defaults to True
**kwargs,  # remaining keyword arguments
):
self.model_in_dim = model_in_dim  # store the model input dimension
self.sampling_rate = sampling_rate  # store the sampling rate
self.upsample_initial_channel = upsample_initial_channel  # store the initial upsampling channel count
self.upsample_rates = upsample_rates  # store the upsampling strides
self.upsample_kernel_sizes = upsample_kernel_sizes  # store the upsampling kernel sizes
self.resblock_kernel_sizes = resblock_kernel_sizes  # store the ResBlock kernel sizes
self.resblock_dilation_sizes = resblock_dilation_sizes  # store the ResBlock dilation rates
self.initializer_range = initializer_range  # store the initializer range
self.leaky_relu_slope = leaky_relu_slope  # store the leaky ReLU slope
self.normalize_before = normalize_before  # store the pre-vocoding normalization flag
super().__init__(**kwargs)  # call the parent initializer with the remaining keyword arguments
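As a quick illustration of the defaults documented above, the configuration can be instantiated and overridden like any other `PretrainedConfig`; this is a minimal sketch assuming transformers is installed (the override values are arbitrary):

```python
from transformers import SpeechT5HifiGanConfig

config = SpeechT5HifiGanConfig()                  # defaults matching microsoft/speecht5_hifigan
print(config.model_in_dim, config.sampling_rate)  # 80 16000

# Example override: a hypothetical vocoder with 128 mel bins and a different upsampling schedule.
custom = SpeechT5HifiGanConfig(model_in_dim=128, upsample_rates=[8, 8, 2, 2], upsample_kernel_sizes=[16, 16, 4, 4])
print(custom.upsample_rates)                      # [8, 8, 2, 2]
```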
.\models\speecht5\convert_hifigan.py
import argparse
import numpy as np
import torch
from transformers import SpeechT5HifiGan, SpeechT5HifiGanConfig, logging
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.speecht5")
def load_weights(checkpoint, hf_model, config):
hf_model.apply_weight_norm()
hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"]
hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"]
hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"]
for i in range(len(config.upsample_rates)):
hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"]
hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"]
hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"]
for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)):
for j in range(len(config.resblock_dilation_sizes)):
hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"]
hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"]
hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"]
hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"]
hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"]
hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"]
hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"]
hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"]
hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"]
hf_model.remove_weight_norm()
@torch.no_grad()
def convert_hifigan_checkpoint(
checkpoint_path,
stats_path,
pytorch_dump_folder_path,
config_path=None,
repo_id=None,
):
if config_path is not None:
config = SpeechT5HifiGanConfig.from_pretrained(config_path)
else:
config = SpeechT5HifiGanConfig()
model = SpeechT5HifiGan(config)
orig_checkpoint = torch.load(checkpoint_path)
load_weights(orig_checkpoint["model"]["generator"], model, config)
stats = np.load(stats_path)
mean = stats[0].reshape(-1)
scale = stats[1].reshape(-1)
model.mean = torch.from_numpy(mean).float()
model.scale = torch.from_numpy(scale).float()
model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--stats_path", required=True, default=None, type=str, help="Path to stats.npy file")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_hifigan_checkpoint(
args.checkpoint_path,
args.stats_path,
args.pytorch_dump_folder_path,
args.config_path,
args.push_to_hub
)
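The script is meant to be run from the command line, but the entry point can also be called directly. The sketch below is illustrative only: the paths are placeholders, and the module name `convert_hifigan` assumes the script is importable from the working directory.

```python
# Hypothetical direct invocation of the conversion entry point; the paths are placeholders.
from convert_hifigan import convert_hifigan_checkpoint

convert_hifigan_checkpoint(
    checkpoint_path="original_hifigan/checkpoint.pt",  # fairseq checkpoint containing checkpoint["model"]["generator"]
    stats_path="original_hifigan/stats.npy",           # row 0 holds the mean, row 1 the scale of the log-mel features
    pytorch_dump_folder_path="./speecht5_hifigan_hf",  # where save_pretrained() writes the converted model
    config_path=None,                                  # None -> use the default SpeechT5HifiGanConfig
    repo_id=None,                                      # set to "user/repo" to push the result to the Hub
)
```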
.\models\speecht5\convert_speecht5_original_pytorch_checkpoint_to_pytorch.py
import argparse
import torch
from transformers import (
SpeechT5Config,
SpeechT5FeatureExtractor,
SpeechT5ForSpeechToSpeech,
SpeechT5ForSpeechToText,
SpeechT5ForTextToSpeech,
SpeechT5Processor,
SpeechT5Tokenizer,
logging,
)
from transformers.tokenization_utils import AddedToken
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.speecht5")
MAPPING_SPEECH_ENCODER_PRENET = {
"speech_encoder_prenet.layer_norm": "speecht5.encoder.prenet.feature_projection.layer_norm",
"speech_encoder_prenet.post_extract_proj": "speecht5.encoder.prenet.feature_projection.projection",
"speech_encoder_prenet.pos_conv.0": "speecht5.encoder.prenet.pos_conv_embed.conv",
"speech_encoder_prenet.mask_emb": "speecht5.encoder.prenet.masked_spec_embed",
}
MAPPING_TEXT_ENCODER_PRENET = {
"text_encoder_prenet.encoder_prenet.0": "speecht5.encoder.prenet.embed_tokens",
"text_encoder_prenet.encoder_prenet.1.alpha": "speecht5.encoder.prenet.encode_positions.alpha",
}
MAPPING_SPEECH_DECODER_PRENET = {
"speech_decoder_prenet.decoder_prenet.0.0.prenet.0.0": "speecht5.decoder.prenet.layers.0",
"speech_decoder_prenet.decoder_prenet.0.0.prenet.1.0": "speecht5.decoder.prenet.layers.1",
"speech_decoder_prenet.decoder_prenet.0.1": "speecht5.decoder.prenet.final_layer",
"speech_decoder_prenet.decoder_prenet.1.alpha": "speecht5.decoder.prenet.encode_positions.alpha",
"speech_decoder_prenet.spkembs_layer.0": "speecht5.decoder.prenet.speaker_embeds_layer",
}
MAPPING_SPEECH_DECODER_POSTNET = {
"speech_decoder_postnet.feat_out": "speech_decoder_postnet.feat_out",
"speech_decoder_postnet.prob_out": "speech_decoder_postnet.prob_out",
"speech_decoder_postnet.postnet.postnet.0.0": "speech_decoder_postnet.layers.0.conv",
"speech_decoder_postnet.postnet.postnet.0.1": "speech_decoder_postnet.layers.0.batch_norm",
"speech_decoder_postnet.postnet.postnet.1.0": "speech_decoder_postnet.layers.1.conv",
"speech_decoder_postnet.postnet.postnet.1.1": "speech_decoder_postnet.layers.1.batch_norm",
"speech_decoder_postnet.postnet.postnet.2.0": "speech_decoder_postnet.layers.2.conv",
"speech_decoder_postnet.postnet.postnet.2.1": "speech_decoder_postnet.layers.2.batch_norm",
"speech_decoder_postnet.postnet.postnet.3.0": "speech_decoder_postnet.layers.3.conv",
"speech_decoder_postnet.postnet.postnet.3.1": "speech_decoder_postnet.layers.3.batch_norm",
"speech_decoder_postnet.postnet.postnet.4.0": "speech_decoder_postnet.layers.4.conv",
"speech_decoder_postnet.postnet.postnet.4.1": "speech_decoder_postnet.layers.4.batch_norm",
}
MAPPING_TEXT_DECODER_PRENET = {
"text_decoder_prenet.embed_tokens": "speecht5.decoder.prenet.embed_tokens",
}
MAPPING_TEXT_DECODER_POSTNET = {
"text_decoder_postnet.output_projection": "text_decoder_postnet.lm_head",
}
MAPPING_ENCODER = {
"encoder.layers.*.self_attn.k_proj": "speecht5.encoder.wrapped_encoder.layers.*.attention.k_proj",
"encoder.layers.*.self_attn.v_proj": "speecht5.encoder.wrapped_encoder.layers.*.attention.v_proj",
"encoder.layers.*.self_attn.q_proj": "speecht5.encoder.wrapped_encoder.layers.*.attention.q_proj",
"encoder.layers.*.self_attn.out_proj": "speecht5.encoder.wrapped_encoder.layers.*.attention.out_proj",
"encoder.layers.*.self_attn_layer_norm": "speecht5.encoder.wrapped_encoder.layers.*.layer_norm",
"encoder.layers.*.fc1": "speecht5.encoder.wrapped_encoder.layers.*.feed_forward.intermediate_dense",
"encoder.layers.*.fc2": "speecht5.encoder.wrapped_encoder.layers.*.feed_forward.output_dense",
"encoder.layers.*.final_layer_norm": "speecht5.encoder.wrapped_encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "speecht5.encoder.wrapped_encoder.layer_norm",
"encoder.pos_emb.pe_k": "speecht5.encoder.wrapped_encoder.embed_positions.pe_k",
}
MAPPING_DECODER = {
"decoder.layers.*.self_attn.k_proj": "speecht5.decoder.wrapped_decoder.layers.*.self_attn.k_proj",
"decoder.layers.*.self_attn.v_proj": "speecht5.decoder.wrapped_decoder.layers.*.self_attn.v_proj",
"decoder.layers.*.self_attn.q_proj": "speecht5.decoder.wrapped_decoder.layers.*.self_attn.q_proj",
"decoder.layers.*.self_attn.out_proj": "speecht5.decoder.wrapped_decoder.layers.*.self_attn.out_proj",
"decoder.layers.*.self_attn_layer_norm": "speecht5.decoder.wrapped_decoder.layers.*.self_attn_layer_norm",
"decoder.layers.*.encoder_attn.k_proj": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn.k_proj",
"decoder.layers.*.encoder_attn.v_proj": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn.v_proj",
"decoder.layers.*.encoder_attn.q_proj": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn.q_proj",
"decoder.layers.*.encoder_attn.out_proj": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn.out_proj",
"decoder.layers.*.encoder_attn_layer_norm": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn_layer_norm",
"decoder.layers.*.fc1": "speecht5.decoder.wrapped_decoder.layers.*.feed_forward.intermediate_dense",
"decoder.layers.*.fc2": "speecht5.decoder.wrapped_decoder.layers.*.feed_forward.output_dense",
"decoder.layers.*.final_layer_norm": "speecht5.decoder.wrapped_decoder.layers.*.final_layer_norm",
}
MAPPING_S2T = {
**MAPPING_SPEECH_ENCODER_PRENET,
**MAPPING_ENCODER,
**MAPPING_DECODER,
**MAPPING_TEXT_DECODER_PRENET,
**MAPPING_TEXT_DECODER_POSTNET,
}
MAPPING_T2S = {
**MAPPING_TEXT_ENCODER_PRENET,
**MAPPING_ENCODER,
**MAPPING_DECODER,
**MAPPING_SPEECH_DECODER_PRENET,
**MAPPING_SPEECH_DECODER_POSTNET,
}
MAPPING_S2S = {
**MAPPING_SPEECH_ENCODER_PRENET,
**MAPPING_ENCODER,
**MAPPING_DECODER,
**MAPPING_SPEECH_DECODER_PRENET,
**MAPPING_SPEECH_DECODER_POSTNET,
}
TOP_LEVEL_KEYS = []
IGNORE_KEYS = [
"encoder.version",
"encoder.layers.*.norm_k.weight",
"encoder.layers.*.norm_k.bias",
"decoder.version",
"decoder.layers.*.norm_k.weight",
"decoder.layers.*.norm_k.bias",
"decoder.pos_emb.pe_k",
"speech_encoder_prenet.embed_positions._float_tensor",
"text_decoder_prenet.embed_positions._float_tensor",
]
IGNORE_KEYS_S2T = IGNORE_KEYS + [
"encoder.proj",
"text_encoder_prenet.*",
"speech_decoder_prenet.*",
"speech_decoder_postnet.*",
]
IGNORE_KEYS_T2S = IGNORE_KEYS + [
"encoder.proj",
"speech_encoder_prenet.*",
"text_decoder_prenet.*",
"text_decoder_postnet.*",
]
IGNORE_KEYS_S2S = IGNORE_KEYS + [
"encoder.proj",
"text_encoder_prenet.*",
"text_decoder_prenet.*",
"text_decoder_postnet.*",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
if hf_shape != value.shape:
raise ValueError(
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
elif weight_type == "running_mean":
hf_pointer.running_mean.data = value
elif weight_type == "running_var":
hf_pointer.running_var.data = value
elif weight_type == "num_batches_tracked":
hf_pointer.num_batches_tracked.data = value
else:
hf_pointer.data = value
logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.")
def should_ignore(name, ignore_keys):
for key in ignore_keys:
if key.endswith(".*"):
if name.startswith(key[:-1]):
return True
elif ".*." in key:
prefix, suffix = key.split(".*.")
if prefix in name and suffix in name:
return True
elif key in name:
return True
return False
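The helper supports three kinds of patterns: a plain substring, a trailing `.*` prefix match, and a `.*.` infix match. A small check using the function defined above (the parameter names are illustrative):

```python
ignore = ["encoder.version", "decoder.layers.*.norm_k.weight", "speech_decoder_prenet.*"]

print(should_ignore("encoder.version", ignore))                        # True  (plain substring)
print(should_ignore("decoder.layers.3.norm_k.weight", ignore))         # True  (".*." infix match)
print(should_ignore("speech_decoder_prenet.layers.0.weight", ignore))  # True  (trailing ".*" prefix match)
print(should_ignore("encoder.layers.0.fc1.weight", ignore))            # False (no pattern matches)
```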
def recursively_load_weights(fairseq_dict, hf_model, task):
unused_weights = []
if task == "s2t":
feature_encoder = hf_model.speecht5.encoder.prenet.feature_encoder
MAPPING = MAPPING_S2T
IGNORE_KEYS = IGNORE_KEYS_S2T
elif task == "t2s":
feature_encoder = None
MAPPING = MAPPING_T2S
IGNORE_KEYS = IGNORE_KEYS_T2S
elif task == "s2s":
feature_encoder = hf_model.speecht5.encoder.prenet.feature_encoder
MAPPING = MAPPING_S2S
IGNORE_KEYS = IGNORE_KEYS_S2S
else:
raise ValueError(f"Unsupported task: {task}")
for name, value in fairseq_dict.items():
if should_ignore(name, IGNORE_KEYS):
logger.info(f"{name} was ignored")
continue
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_encoder,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
if "*" in key:
prefix, suffix = key.split(".*.")
if prefix in name and suffix in name:
key = suffix
if key in name:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
weight_type = "weight"
elif "running_mean" in name:
weight_type = "running_mean"
elif "running_var" in name:
weight_type = "running_var"
elif "num_batches_tracked" in name:
weight_type = "num_batches_tracked"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
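For the wildcard mappings, the layer index is recovered purely by string surgery on the fairseq name: everything before the matched suffix ends with `layers.<idx>.`, so splitting on dots and taking the second-to-last piece yields the index. A worked example (the tensor name is illustrative):

```python
name = "encoder.layers.3.self_attn.k_proj.weight"                         # illustrative fairseq parameter name
key = "self_attn.k_proj"                                                  # suffix kept after splitting the wildcard key
mapped_key = "speecht5.encoder.wrapped_encoder.layers.*.attention.k_proj"

layer_index = name.split(key)[0].split(".")[-2]  # "encoder.layers.3." -> ["encoder", "layers", "3", ""] -> "3"
print(layer_index)                               # 3
print(mapped_key.replace("*", layer_index))      # speecht5.encoder.wrapped_encoder.layers.3.attention.k_proj
```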
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape:
raise ValueError(
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape:
raise ValueError(
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape:
raise ValueError(
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape:
raise ValueError(
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
@torch.no_grad()
def convert_speecht5_checkpoint(
task,
checkpoint_path,
pytorch_dump_folder_path,
config_path=None,
vocab_path=None,
repo_id=None,
):
"""
Copy/paste/tweak the model's weights into the transformers design.
"""
if config_path is not None:
config = SpeechT5Config.from_pretrained(config_path)
else:
config = SpeechT5Config()
if task == "s2t":
config.max_length = config.max_text_positions
model = SpeechT5ForSpeechToText(config)
elif task == "t2s":
config.max_speech_positions = 1876
config.max_text_positions = 600
config.max_length = config.max_speech_positions
model = SpeechT5ForTextToSpeech(config)
elif task == "s2s":
config.max_speech_positions = 1876
config.max_length = config.max_speech_positions
model = SpeechT5ForSpeechToSpeech(config)
else:
raise ValueError(f"Unknown task name: {task}")
if vocab_path:
tokenizer = SpeechT5Tokenizer(vocab_path, model_max_length=config.max_text_positions)
mask_token = AddedToken("<mask>", lstrip=True, rstrip=False)
tokenizer.mask_token = mask_token
tokenizer.add_special_tokens({"mask_token": mask_token})
tokenizer.add_tokens(["<ctc_blank>"])
feature_extractor = SpeechT5FeatureExtractor()
processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor.save_pretrained(pytorch_dump_folder_path)
fairseq_checkpoint = torch.load(checkpoint_path)
recursively_load_weights(fairseq_checkpoint["model"], model, task)
model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
processor.push_to_hub(repo_id)
model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--task",
default="s2t",
type=str,
help="Type of the SpeechT5 model you'd like to convert. Should be one of 's2t', 't2s', 's2s'.",
)
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--vocab_path", default=None, type=str, help="Path to SentencePiece model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_speecht5_checkpoint(
args.task,
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.config_path,
args.vocab_path,
args.push_to_hub,
)
.\models\speecht5\feature_extraction_speecht5.py
"""SpeechT5 的特征提取器类。"""
import warnings
from typing import Any, Dict, List, Optional, Union
import numpy as np
from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging
logger = logging.get_logger(__name__)
class SpeechT5FeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a SpeechT5 feature extractor.
This class can pre-process a raw speech signal by (optionally) zero-mean unit-variance normalizing it for use by the SpeechT5 speech encoder prenet.
It can also extract log-mel filter-bank features from raw speech for use by the SpeechT5 speech decoder prenet.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`], which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
"""
"""
Args:
feature_size (`int`, *optional*, defaults to 1):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values.
do_normalize (`bool`, *optional*, defaults to `False`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
improve the performance for some models.
num_mel_bins (`int`, *optional*, defaults to 80):
The number of mel-frequency bins in the extracted spectrogram features.
hop_length (`int`, *optional*, defaults to 16):
Number of ms between windows. Otherwise referred to as "shift" in many papers.
win_length (`int`, *optional*, defaults to 64):
Number of ms per window.
win_function (`str`, *optional*, defaults to `"hann_window"`):
Name for the window function used for windowing, must be accessible via `torch.{win_function}`
frame_signal_scale (`float`, *optional*, defaults to 1.0):
Constant multiplied in creating the frames before applying DFT. This argument is deprecated.
fmin (`float`, *optional*, defaults to 80):
Minimum mel frequency in Hz.
fmax (`float`, *optional*, defaults to 7600):
Maximum mel frequency in Hz.
mel_floor (`float`, *optional*, defaults to 1e-10):
Minimum value of mel frequency banks.
reduction_factor (`int`, *optional*, defaults to 2):
Spectrogram length reduction factor. This argument is deprecated.
return_attention_mask (`bool`, *optional*, defaults to `True`):
Whether or not [`~SpeechT5FeatureExtractor.__call__`] should return `attention_mask`.
"""
model_input_names = ["input_values", "attention_mask"]
def __init__(
self,
feature_size: int = 1,
sampling_rate: int = 16000,
padding_value: float = 0.0,
do_normalize: bool = False,
num_mel_bins: int = 80,
hop_length: int = 16,
win_length: int = 64,
win_function: str = "hann_window",
frame_signal_scale: float = 1.0,
fmin: float = 80,
fmax: float = 7600,
mel_floor: float = 1e-10,
reduction_factor: int = 2,
return_attention_mask: bool = True,
**kwargs,
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.do_normalize = do_normalize
self.return_attention_mask = return_attention_mask
self.num_mel_bins = num_mel_bins
self.hop_length = hop_length
self.win_length = win_length
self.win_function = win_function
self.frame_signal_scale = frame_signal_scale
self.fmin = fmin
self.fmax = fmax
self.mel_floor = mel_floor
self.reduction_factor = reduction_factor
self.sample_size = win_length * sampling_rate // 1000
self.sample_stride = hop_length * sampling_rate // 1000
self.n_fft = optimal_fft_length(self.sample_size)
self.n_freqs = (self.n_fft // 2) + 1
self.window = window_function(window_length=self.sample_size, name=self.win_function, periodic=True)
self.mel_filters = mel_filter_bank(
num_frequency_bins=self.n_freqs,
num_mel_filters=self.num_mel_bins,
min_frequency=self.fmin,
max_frequency=self.fmax,
sampling_rate=self.sampling_rate,
norm="slaney",
mel_scale="slaney",
)
if frame_signal_scale != 1.0:
warnings.warn(
"The argument `frame_signal_scale` is deprecated and will be removed in version 4.30.0 of Transformers",
FutureWarning,
)
if reduction_factor != 2.0:
warnings.warn(
"The argument `reduction_factor` is deprecated and will be removed in version 4.30.0 of Transformers",
FutureWarning,
)
@staticmethod
def zero_mean_unit_var_norm(
input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
) -> List[np.ndarray]:
"""
Every array in the list is normalized to have zero mean and unit variance
"""
if attention_mask is not None:
attention_mask = np.array(attention_mask, np.int32)
normed_input_values = []
for vector, length in zip(input_values, attention_mask.sum(-1)):
normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
if length < normed_slice.shape[0]:
normed_slice[length:] = padding_value
normed_input_values.append(normed_slice)
else:
normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
return normed_input_values
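A tiny numeric check of the masked normalization: the mean and variance are computed only over the unpadded prefix, and the padded tail is reset to `padding_value`. The values below are arbitrary.

```python
import numpy as np

vector = np.array([1.0, 2.0, 3.0, 0.0, 0.0])  # last two entries are padding
length = 3                                    # taken from attention_mask.sum(-1)

normed = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
normed[length:] = 0.0                         # padding_value
print(normed.round(3))                        # [-1.225  0.     1.225  0.     0.   ]
```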
def _extract_mel_features(
self,
one_waveform: np.ndarray,
) -> np.ndarray:
"""
Extracts log-mel filterbank features for one waveform array (unbatched).
"""
log_mel_spec = spectrogram(
one_waveform,
window=self.window,
frame_length=self.sample_size,
hop_length=self.sample_stride,
fft_length=self.n_fft,
mel_filters=self.mel_filters,
mel_floor=self.mel_floor,
log_mel="log10",
)
return log_mel_spec.T
def __call__(
self,
audio: Optional[Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]] = None,
audio_target: Optional[Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]] = None,
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
**kwargs,
):
"""
Process audio input according to specified parameters.
"""
def _process_audio(
self,
speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
is_target: bool = False,
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
"""
Internal method to process audio data.
"""
def to_dict(self) -> Dict[str, Any]:
"""
Convert the object's properties to a dictionary representation.
"""
output = super().to_dict()
names = ["window", "mel_filters", "sample_size", "sample_stride", "n_fft", "n_freqs"]
for name in names:
if name in output:
del output[name]
return output
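A minimal usage sketch of the feature extractor on one second of dummy audio (assuming transformers and torch are installed): with `audio=...` the extractor returns the (optionally normalized) raw waveform as `input_values` for the speech encoder prenet, whereas `audio_target=...` would instead go through `_extract_mel_features`.

```python
import numpy as np
from transformers import SpeechT5FeatureExtractor

extractor = SpeechT5FeatureExtractor()                 # defaults: 16 kHz, 80 mel bins, 16 ms hop, 64 ms window
waveform = np.random.randn(16000).astype(np.float32)   # 1 s of dummy audio, not a real recording

inputs = extractor(audio=waveform, sampling_rate=16000, return_tensors="pt")
print(inputs["input_values"].shape)                    # torch.Size([1, 16000]): raw waveform for the encoder prenet
```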
.\models\speecht5\modeling_speecht5.py
""" PyTorch SpeechT5 model."""
import math
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqSpectrogramOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "SpeechT5Config"
SPEECHT5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/speecht5_asr",
"microsoft/speecht5_tts",
"microsoft/speecht5_vc",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
def shift_spectrograms_right(input_values: torch.Tensor, reduction_factor: int = 1):
"""
Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
"""
if reduction_factor > 1:
input_values = input_values[:, reduction_factor - 1 :: reduction_factor]
shifted_input_values = input_values.new_zeros(input_values.shape)
shifted_input_values[:, 1:] = input_values[:, :-1].clone()
shifted_input_values.masked_fill_(shifted_input_values == -100.0, 0.0)
return shifted_input_values
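Two small worked examples of the shifting helpers defined above. `shift_tokens_right` builds decoder input ids from labels by prepending the decoder start token, and `shift_spectrograms_right` additionally keeps only every `reduction_factor`-th frame. The values are arbitrary.

```python
import torch

# Text labels -> decoder inputs: prepend decoder_start_token_id, replace any -100 with the pad token.
labels = torch.tensor([[5, 6, 7, -100]])
print(shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2))
# tensor([[2, 5, 6, 7]])

# Spectrogram targets -> decoder inputs: keep frames 1, 3, ... (reduction_factor=2), then shift right.
frames = torch.arange(1.0, 9.0).view(1, 4, 2)  # 4 frames with 2 mel bins each
print(shift_spectrograms_right(frames, reduction_factor=2))
# tensor([[[0., 0.],
#          [3., 4.]]])
```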
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
batch_size, sequence_length = shape
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
num_masked_span = max(num_masked_span, min_masks)
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths:
num_masked_span = compute_num_masked_span(input_length)
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
if len(spec_aug_mask_idx) == 0:
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
return spec_aug_mask
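An illustrative call to the mask computation defined above (it is stochastic, so the exact positions vary between runs; the seed is only there to make the sketch repeatable):

```python
import numpy as np

np.random.seed(0)  # just to make this sketch repeatable
mask = _compute_mask_indices(shape=(2, 20), mask_prob=0.3, mask_length=4, min_masks=1)
print(mask.shape)         # (2, 20) boolean array, True where time steps are masked
print(mask.sum(axis=-1))  # number of masked time steps per batch row
```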
class SpeechT5NoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
class SpeechT5LayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = hidden_states.transpose(-2, -1)
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states.transpose(-2, -1)
hidden_states = self.activation(hidden_states)
return hidden_states
class SpeechT5GroupNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
self.activation = ACT2FN[config.feat_extract_activation]
self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
class SpeechT5SinusoidalPositionalEmbedding(nn.Module):
"""本模块生成任意长度的正弦余弦位置嵌入。"""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build the sinusoidal embedding weight matrix, following the standard sin/cos positional-encoding formula, to serve as a fixed set of basis vectors.
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len = input_ids.size()
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
max_pos = self.padding_idx + 1 + seq_len
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
def create_position_ids_from_input_ids(
self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx + 1; padding symbols are ignored. This is a modified version of fairseq's `utils.make_positions`.
Args:
x: torch.Tensor input tensor
Returns:
torch.Tensor tensor containing the position numbers
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
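Inlining the body of `create_position_ids_from_input_ids` with `past_key_values_length = 0` and `padding_idx = 1` (the SpeechT5 pad token id) makes the numbering scheme visible: real tokens get consecutive positions starting at `padding_idx + 1`, while padding positions stay at `padding_idx`.

```python
import torch

input_ids = torch.tensor([[0, 5, 8, 1, 1]])  # two trailing pad tokens (pad_token_id = 1)
padding_idx = 1

mask = input_ids.ne(padding_idx).int()                            # tensor([[1, 1, 1, 0, 0]])
positions = torch.cumsum(mask, dim=1).type_as(mask) * mask + padding_idx
print(positions)                                                  # tensor([[2, 3, 4, 1, 1]])
```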
class SpeechT5PositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
padding=config.num_conv_pos_embeddings // 2,
groups=config.num_conv_pos_embedding_groups,
)
weight_norm = nn.utils.weight_norm
if hasattr(nn.utils.parametrizations, "weight_norm"):
weight_norm = nn.utils.parametrizations.weight_norm
if is_deepspeed_zero3_enabled():
import deepspeed
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
self.conv = weight_norm(self.conv, name="weight", dim=2)
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
self.conv = weight_norm(self.conv, name="weight", dim=2)
self.padding = SpeechT5SamePadLayer(config.num_conv_pos_embeddings)
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.conv(hidden_states)
hidden_states = self.padding(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class SpeechT5ScaledPositionalEncoding(nn.Module):
"""
Scaled positional encoding, see §3.2 in https://arxiv.org/abs/1809.08895
"""
def __init__(self, dropout, dim, max_len=5000):
pe = torch.zeros(max_len, dim)
position = torch.arange(0, max_len).unsqueeze(1)
div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / dim)))
pe[:, 0::2] = torch.sin(position.float() * div_term)
pe[:, 1::2] = torch.cos(position.float() * div_term)
pe = pe.unsqueeze(0)
super().__init__()
self.register_buffer("pe", pe, persistent=False)
self.dropout = nn.Dropout(p=dropout)
self.dim = dim
self.alpha = torch.nn.Parameter(torch.tensor(1.0))
def forward(self, emb):
emb = emb + self.alpha * self.pe[:, : emb.size(1)]
emb = self.dropout(emb)
return emb
class SpeechT5RelativePositionalEncoding(torch.nn.Module):
def __init__(self, dim, max_length=1000):
super().__init__()
self.dim = dim
self.max_length = max_length
self.pe_k = torch.nn.Embedding(2 * max_length, dim)
def forward(self, hidden_states):
seq_len = hidden_states.shape[1]
pos_seq = torch.arange(0, seq_len).long().to(hidden_states.device)
pos_seq = pos_seq[:, None] - pos_seq[None, :]
pos_seq[pos_seq < -self.max_length] = -self.max_length
pos_seq[pos_seq >= self.max_length] = self.max_length - 1
pos_seq = pos_seq + self.max_length
return self.pe_k(pos_seq)
class SpeechT5SamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, :-self.num_pad_remove]
return hidden_states
class SpeechT5FeatureEncoder(nn.Module):
"""从原始音频波形构建特征"""
def __init__(self, config):
super().__init__()
if config.feat_extract_norm == "group":
conv_layers = [SpeechT5GroupNormConvLayer(config, layer_id=0)] + [
SpeechT5NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
conv_layers = [
SpeechT5LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
]
else:
raise ValueError(
f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
)
self.conv_layers = nn.ModuleList(conv_layers)
self.gradient_checkpointing = False
self._requires_grad = True
def _freeze_parameters(self):
for param in self.parameters():
param.requires_grad = False
self._requires_grad = False
def forward(self, input_values):
hidden_states = input_values[:, None]
if self._requires_grad and self.training:
hidden_states.requires_grad = True
for conv_layer in self.conv_layers:
if self._requires_grad and self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__,
hidden_states,
)
else:
hidden_states = conv_layer(hidden_states)
return hidden_states
class SpeechT5FeatureProjection(nn.Module):
def __init__(self, config):
super().__init__()
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
self.dropout = nn.Dropout(config.feat_proj_dropout)
def forward(self, hidden_states):
norm_hidden_states = self.layer_norm(hidden_states)
hidden_states = self.projection(norm_hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states, norm_hidden_states
class SpeechT5SpeechEncoderPrenet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.feature_encoder = SpeechT5FeatureEncoder(config)
self.feature_projection = SpeechT5FeatureProjection(config)
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
self.pos_conv_embed = SpeechT5PositionalConvEmbedding(config)
self.pos_sinusoidal_embed = SpeechT5SinusoidalPositionalEmbedding(
config.max_speech_positions + config.pad_token_id + 1,
config.hidden_size,
config.pad_token_id,
)
def freeze_feature_encoder(self):
self.feature_encoder._freeze_parameters()
def forward(
self,
input_values: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
):
extract_features = self.feature_encoder(input_values)
extract_features = extract_features.transpose(1, 2)
if attention_mask is not None:
attention_mask = self._get_feature_vector_attention_mask(
extract_features.shape[1],
attention_mask,
)
hidden_states, extract_features = self.feature_projection(extract_features)
hidden_states = self._mask_hidden_states(
hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
)
positional_conv_embedding = self.pos_conv_embed(hidden_states)
hidden_states = hidden_states + positional_conv_embedding
if attention_mask is not None:
padding_mask = attention_mask.ne(1).long()
else:
padding_mask = torch.zeros(hidden_states.shape[:2], dtype=torch.long, device=hidden_states.device)
positional_sinusoidal_embeddings = self.pos_sinusoidal_embed(padding_mask)
hidden_states = hidden_states + positional_sinusoidal_embeddings
return hidden_states, attention_mask
def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
batch_size = attention_mask.shape[0]
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
return attention_mask
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the convolutional layers
"""
def _conv_out_length(input_length, kernel_size, stride):
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
return input_lengths
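With the default `conv_kernel=(10, 3, 3, 3, 3, 2, 2)` and `conv_stride=(5, 2, 2, 2, 2, 2, 2)`, the formula above maps one second of 16 kHz audio to 49 feature-encoder frames; the arithmetic can be checked with plain integers:

```python
conv_kernel = (10, 3, 3, 3, 3, 2, 2)  # default SpeechT5 feature-encoder kernels
conv_stride = (5, 2, 2, 2, 2, 2, 2)   # default strides

length = 16000                        # one second of 16 kHz audio
for kernel_size, stride in zip(conv_kernel, conv_stride):
    length = (length - kernel_size) // stride + 1
print(length)                         # 49 output frames
```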
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
class SpeechT5SpeechDecoderPrenet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layers = nn.ModuleList(
[
nn.Linear(
config.num_mel_bins if i == 0 else config.speech_decoder_prenet_units,
config.speech_decoder_prenet_units,
)
for i in range(config.speech_decoder_prenet_layers)
]
)
self.final_layer = nn.Linear(config.speech_decoder_prenet_units, config.hidden_size)
self.encode_positions = SpeechT5ScaledPositionalEncoding(
config.positional_dropout,
config.hidden_size,
config.max_speech_positions,
)
self.speaker_embeds_layer = nn.Linear(config.speaker_embedding_dim + config.hidden_size, config.hidden_size)
def _consistent_dropout(self, inputs_embeds, p):
mask = torch.bernoulli(inputs_embeds[0], p=p)
all_masks = mask.unsqueeze(0).repeat(inputs_embeds.size(0), 1, 1)
return torch.where(all_masks == 1, inputs_embeds, 0) * 1 / (1 - p)
def forward(
self,
input_values: torch.Tensor,
speaker_embeddings: Optional[torch.Tensor] = None,
):
inputs_embeds = input_values
for layer in self.layers:
inputs_embeds = nn.functional.relu(layer(inputs_embeds))
inputs_embeds = self._consistent_dropout(inputs_embeds, self.config.speech_decoder_prenet_dropout)
inputs_embeds = self.final_layer(inputs_embeds)
inputs_embeds = self.encode_positions(inputs_embeds)
if speaker_embeddings is not None:
speaker_embeddings = nn.functional.normalize(speaker_embeddings)
speaker_embeddings = speaker_embeddings.unsqueeze(1).expand(-1, inputs_embeds.size(1), -1)
inputs_embeds = torch.cat([inputs_embeds, speaker_embeddings], dim=-1)
inputs_embeds = nn.functional.relu(self.speaker_embeds_layer(inputs_embeds))
return inputs_embeds
class SpeechT5BatchNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
if layer_id == 0:
in_conv_dim = config.num_mel_bins
else:
in_conv_dim = config.speech_decoder_postnet_units
if layer_id == config.speech_decoder_postnet_layers - 1:
out_conv_dim = config.num_mel_bins
else:
out_conv_dim = config.speech_decoder_postnet_units
self.conv = nn.Conv1d(
in_conv_dim,
out_conv_dim,
kernel_size=config.speech_decoder_postnet_kernel,
stride=1,
padding=(config.speech_decoder_postnet_kernel - 1) // 2,
bias=False,
)
self.batch_norm = nn.BatchNorm1d(out_conv_dim)
if layer_id < config.speech_decoder_postnet_layers - 1:
self.activation = nn.Tanh()
else:
self.activation = None
self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.batch_norm(hidden_states)
if self.activation is not None:
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class SpeechT5SpeechDecoderPostnet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
self.prob_out = nn.Linear(config.hidden_size, config.reduction_factor)
self.layers = nn.ModuleList(
[SpeechT5BatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
)
def forward(self, hidden_states: torch.Tensor):
outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
outputs_after_postnet = self.postnet(outputs_before_postnet)
logits = self.prob_out(hidden_states).view(hidden_states.size(0), -1)
return outputs_before_postnet, outputs_after_postnet, logits
def postnet(self, hidden_states: torch.Tensor):
layer_output = hidden_states.transpose(1, 2)
for layer in self.layers:
layer_output = layer(layer_output)
return hidden_states + layer_output.transpose(1, 2)
class SpeechT5TextEncoderPrenet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.encode_positions = SpeechT5ScaledPositionalEncoding(
config.positional_dropout,
config.hidden_size,
config.max_text_positions,
)
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(self, input_ids: torch.Tensor):
inputs_embeds = self.embed_tokens(input_ids)
inputs_embeds = self.encode_positions(inputs_embeds)
return inputs_embeds
class SpeechT5TextDecoderPrenet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.dropout = nn.Dropout(config.positional_dropout)
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.embed_positions = SpeechT5SinusoidalPositionalEmbedding(
config.max_text_positions + config.pad_token_id + 1,
config.hidden_size,
config.pad_token_id,
)
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
):
if input_ids is not None:
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
else:
raise ValueError("You have to specify `decoder_input_ids`")
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
positions = self.embed_positions(input_ids, past_key_values_length)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
inputs_embeds += positions
inputs_embeds = self.dropout(inputs_embeds)
return inputs_embeds, attention_mask
class SpeechT5TextDecoderPostnet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
def forward(self, hidden_states: torch.Tensor):
return self.lm_head(hidden_states)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
class SpeechT5Attention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
https://aclanthology.org/N18-2074.pdf)
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
position_bias: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
raise NotImplementedError
class SpeechT5FeedForward(nn.Module):
def __init__(self, config, intermediate_size):
super().__init__()
self.intermediate_dropout = nn.Dropout(config.activation_dropout)
self.intermediate_dense = nn.Linear(config.hidden_size, intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.output_dense = nn.Linear(intermediate_size, config.hidden_size)
self.output_dropout = nn.Dropout(config.hidden_dropout)
def forward(self, hidden_states):
hidden_states = self.intermediate_dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.output_dense(hidden_states)
hidden_states = self.output_dropout(hidden_states)
return hidden_states
class SpeechT5EncoderLayer(nn.Module):
def __init__(self, config: SpeechT5Config):
super().__init__()
self.attention = SpeechT5Attention(
embed_dim=config.hidden_size,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
self.dropout = nn.Dropout(config.hidden_dropout)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.feed_forward = SpeechT5FeedForward(config, config.encoder_ffn_dim)
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
position_bias: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
"""
Args:
hidden_states (`torch.FloatTensor`):
Hidden states input to the layer, of shape `(batch, seq_len, hidden_size)`
attention_mask (`torch.FloatTensor`):
Attention mask of shape `(batch, 1, tgt_len, src_len)`, where padding elements are indicated by very large negative values
layer_head_mask (`torch.FloatTensor`):
Mask for the attention heads of a given layer, of shape `(config.encoder_attention_heads,)`
position_bias (`torch.FloatTensor`):
Relative position embeddings of shape `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
output_attentions (`bool`, *optional*):
Whether or not to return the attention tensors of all attention layers. See `attentions` under returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.attention(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
position_bias=position_bias,
output_attentions=output_attentions,
)
hidden_states = self.dropout(hidden_states)
hidden_states = residual + hidden_states
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states + self.feed_forward(hidden_states)
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class SpeechT5DecoderLayer(nn.Module):
def __init__(self, config: SpeechT5Config):
super().__init__()
self.self_attn = SpeechT5Attention(
embed_dim=config.hidden_size,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = nn.Dropout(config.hidden_dropout)
self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.encoder_attn = SpeechT5Attention(
config.hidden_size,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.feed_forward = SpeechT5FeedForward(config, config.decoder_ffn_dim)
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
class SpeechT5PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SpeechT5Config
base_model_prefix = "speecht5"
main_input_name = "input_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, SpeechT5PositionalConvEmbedding):
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, SpeechT5FeatureProjection):
k = math.sqrt(1 / module.projection.in_features)
nn.init.uniform_(module.projection.weight, a=-k, b=k)
nn.init.uniform_(module.projection.bias, a=-k, b=k)
elif isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-k, b=k)
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
class SpeechT5Encoder(SpeechT5PreTrainedModel):
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
"""
Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
hidden features.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.prenet = SpeechT5SpeechEncoderPrenet(config)
self.wrapped_encoder = SpeechT5Encoder(config)
self.post_init()
def forward(
self,
input_values: torch.FloatTensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
hidden_states, attention_mask = self.prenet(input_values, attention_mask)
outputs = self.wrapped_encoder(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
"""
Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.prenet = SpeechT5TextEncoderPrenet(config)
self.wrapped_encoder = SpeechT5Encoder(config)
self.post_init()
def get_input_embeddings(self):
return self.prenet.get_input_embeddings()
def set_input_embeddings(self, value):
self.prenet.set_input_embeddings(value)
def forward(
self,
input_values: torch.FloatTensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
hidden_states = self.prenet(input_values)
outputs = self.wrapped_encoder(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
[`SpeechT5Model`].
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.wrapped_encoder = SpeechT5Encoder(config)
self.post_init()
def forward(
self,
input_values: torch.FloatTensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
return self.wrapped_encoder(
hidden_states=input_values,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class SpeechT5Decoder(SpeechT5PreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`].
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.layerdrop = config.decoder_layerdrop
self.layers = nn.ModuleList([SpeechT5DecoderLayer(config) for _ in range(config.decoder_layers)])
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
hidden_states: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
"""
Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
features.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.prenet = SpeechT5SpeechDecoderPrenet(config)
self.wrapped_decoder = SpeechT5Decoder(config)
self.post_init()
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
speaker_embeddings: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
decoder_hidden_states = self.prenet(input_values, speaker_embeddings)
outputs = self.wrapped_decoder(
hidden_states=decoder_hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
"""
Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.prenet = SpeechT5TextDecoderPrenet(config)
self.wrapped_decoder = SpeechT5Decoder(config)
self.post_init()
def get_input_embeddings(self):
return self.prenet.get_input_embeddings()
def set_input_embeddings(self, value):
self.prenet.set_input_embeddings(value)
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)
outputs = self.wrapped_decoder(
hidden_states=decoder_hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
[`SpeechT5Model`].
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.wrapped_decoder = SpeechT5Decoder(config)
self.post_init()
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
outputs = self.wrapped_decoder(
hidden_states=input_values,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5GuidedMultiheadAttentionLoss(nn.Module):
"""
Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
Networks with Guided Attention](https://arxiv.org/abs/1710.08969), adapted for multi-head attention.
"""
def __init__(self, config: SpeechT5Config):
super().__init__()
self.sigma = config.guided_attention_loss_sigma
self.scale = config.guided_attention_loss_scale
def forward(
self, attentions: torch.FloatTensor, input_masks: torch.BoolTensor, output_masks: torch.BoolTensor
) -> torch.Tensor:
"""
Compute the attention loss.
Args:
attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
Batch of multi-head attention weights
input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
Input attention mask as booleans.
output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
Target attention mask as booleans.
Returns:
`torch.Tensor` with the loss value
"""
guided_attn_masks = self._make_guided_attention_masks(input_masks, output_masks, attentions.device)
masks = output_masks.unsqueeze(-1) & input_masks.unsqueeze(-2)
masks = masks.to(attentions.device).unsqueeze(1)
losses = guided_attn_masks * attentions
loss = torch.mean(losses.masked_select(masks))
return self.scale * loss
def _make_guided_attention_masks(self, input_masks, output_masks, device):
input_lengths = input_masks.sum(-1)
output_lengths = output_masks.sum(-1)
guided_attn_masks = torch.zeros((len(input_masks), output_masks.shape[1], input_masks.shape[1]), device=device)
for idx, (ilen, olen) in enumerate(zip(input_lengths, output_lengths)):
guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma, device)
return guided_attn_masks.unsqueeze(1)
@staticmethod
def _make_guided_attention_mask(input_length, output_length, sigma, device):
grid_y, grid_x = torch.meshgrid(
torch.arange(input_length, device=device),
torch.arange(output_length, device=device),
indexing="xy",
)
grid_x = grid_x.float() / output_length
grid_y = grid_y.float() / input_length
return 1.0 - torch.exp(-((grid_y - grid_x) ** 2) / (2 * (sigma**2)))
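The guided-attention penalty keeps attention mass close to the (length-scaled) diagonal: at output step y and input step x the weight is 1 - exp(-((x/T - y/N)^2) / (2 * sigma^2)), which is exactly 0 on the diagonal when the two lengths match. A tiny numeric sketch with illustrative values (assuming the class above is importable):
import torch
from transformers.models.speecht5.modeling_speecht5 import SpeechT5GuidedMultiheadAttentionLoss
mask = SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_mask(
input_length=3, output_length=3, sigma=0.4, device=torch.device("cpu")
)
print(mask.shape)       # torch.Size([3, 3]): (output_length, input_length)
print(mask.diagonal())  # zeros: no penalty when attention follows the diagonal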
class SpeechT5SpectrogramLoss(nn.Module):
def __init__(self, config: SpeechT5Config):
super().__init__()
self.use_guided_attention_loss = config.use_guided_attention_loss
self.guided_attention_loss_num_heads = config.guided_attention_loss_num_heads
self.reduction_factor = config.reduction_factor
self.l1_criterion = L1Loss()
self.bce_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(5.0))
if self.use_guided_attention_loss:
self.attn_criterion = SpeechT5GuidedMultiheadAttentionLoss(config)
def forward(
self,
attention_mask: torch.LongTensor,
outputs_before_postnet: torch.FloatTensor,
outputs_after_postnet: torch.FloatTensor,
logits: torch.FloatTensor,
labels: torch.FloatTensor,
cross_attentions: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
padding_mask = labels != -100.0
labels = labels.masked_select(padding_mask)
outputs_before_postnet = outputs_before_postnet.masked_select(padding_mask)
outputs_after_postnet = outputs_after_postnet.masked_select(padding_mask)
l1_loss = self.l1_criterion(outputs_after_postnet, labels) + self.l1_criterion(outputs_before_postnet, labels)
masks = padding_mask[:, :, 0]
stop_labels = torch.cat([~masks * 1.0, torch.ones(masks.size(0), 1).to(masks.device)], dim=1)
stop_labels = stop_labels[:, 1:].masked_select(masks)
logits = logits.masked_select(masks)
bce_loss = self.bce_criterion(logits, stop_labels)
loss = l1_loss + bce_loss
if self.use_guided_attention_loss:
attn = torch.cat([x[:, : self.guided_attention_loss_num_heads] for x in cross_attentions], dim=1)
input_masks = attention_mask == 1
output_masks = padding_mask[:, :, 0]
if self.reduction_factor > 1:
output_masks = output_masks[:, self.reduction_factor - 1 :: self.reduction_factor]
attn_loss = self.attn_criterion(attn, input_masks, output_masks)
loss += attn_loss
return loss
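The stop-token targets are built by shifting the valid-frame mask one step: every valid frame gets target 0 except the last valid frame of each sequence, which gets 1 (and the BCE loss up-weights that positive frame via `pos_weight=5.0`). A small hedged illustration with made-up values:
import torch
masks = torch.tensor([[True, True, True, False, False]])          # 3 valid frames out of 5
stop = torch.cat([~masks * 1.0, torch.ones(1, 1)], dim=1)[:, 1:]  # shift the "is padding" flags left by one
print(stop)                        # tensor([[0., 0., 1., 1., 1.]])
print(stop.masked_select(masks))   # tensor([0., 0., 1.]): BCE target per valid frame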
SPEECHT5_BASE_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SpeechT5Config`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`):
The Transformer encoder module that applies the appropriate speech or text encoder prenet. If `None`,
[`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states.
decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`):
The Transformer decoder module that applies the appropriate speech or text decoder prenet. If `None`,
[`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden
states.
"""
SPEECHT5_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SpeechT5Config`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SPEECHT5_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.",
SPEECHT5_BASE_START_DOCSTRING,
)
class SpeechT5Model(SpeechT5PreTrainedModel):
def __init__(
self,
config: SpeechT5Config,
encoder: Optional[nn.Module] = None,
decoder: Optional[nn.Module] = None,
):
super().__init__(config)
self.config = config
self.encoder = SpeechT5EncoderWithoutPrenet(config) if encoder is None else encoder
self.decoder = SpeechT5DecoderWithoutPrenet(config) if decoder is None else decoder
self.post_init()
def get_input_embeddings(self):
if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
return self.encoder.get_input_embeddings()
if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
return self.decoder.get_input_embeddings()
return None
def set_input_embeddings(self, value):
if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
self.encoder.set_input_embeddings(value)
if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
self.decoder.set_input_embeddings(value)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
if isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
self.encoder.prenet.freeze_feature_encoder()
@add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_values: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
speaker_embeddings: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""SpeechT5 Model with a speech encoder and a text decoder.""",
SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel):
_tied_weights_keys = ["text_decoder_postnet.lm_head.weight"]
def __init__(self, config: SpeechT5Config):
super().__init__(config)
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
" vocabulary size of the language model head. Please instantiate the model as follows:"
" `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
" your model's configuration."
)
speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
text_decoder = SpeechT5DecoderWithTextPrenet(config)
self.speecht5 = SpeechT5Model(config, speech_encoder, text_decoder)
self.text_decoder_postnet = SpeechT5TextDecoderPostnet(config)
self.post_init()
def get_encoder(self):
return self.speecht5.get_encoder()
def get_decoder(self):
return self.speecht5.get_decoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.get_encoder().prenet.freeze_feature_encoder()
def get_output_embeddings(self):
return self.text_decoder_postnet.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
self.text_decoder_postnet.set_output_embeddings(new_embeddings)
@add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
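During beam search `_reorder_cache` realigns every cached key/value tensor with the surviving beams by index-selecting along the beam axis. A toy sketch with made-up tensors (assuming the class above is importable):
import torch
from transformers.models.speecht5.modeling_speecht5 import SpeechT5ForSpeechToText
layer_past = (torch.arange(6.0).view(3, 1, 2, 1),)  # one cached tensor: (beams=3, heads=1, seq=2, head_dim=1)
beam_idx = torch.tensor([2, 0, 1])                   # beam 0 continues from old beam 2, and so on
reordered = SpeechT5ForSpeechToText._reorder_cache((layer_past,), beam_idx)
print(reordered[0][0][:, 0, :, 0])  # rows now follow the new beam order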
# Module-level helper implementing the autoregressive spectrogram-generation loop used by `generate_speech`.
def _generate_speech(model, input_values, speaker_embeddings=None, attention_mask=None, threshold=0.5, minlenratio=0.0, maxlenratio=20.0, vocoder=None, output_cross_attentions=False, return_output_lengths=False):
if speaker_embeddings is None:
raise ValueError(
"""`speaker_embeddings` must be specified. For example, you can use speaker embeddings by following
the code snippet provided in this link:
https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
"""
)
if attention_mask is None:
encoder_attention_mask = 1 - (input_values == model.config.pad_token_id).int()
else:
encoder_attention_mask = attention_mask
bsz = input_values.size(0)
encoder_out = model.speecht5.encoder(
input_values=input_values,
attention_mask=encoder_attention_mask,
return_dict=True,
)
encoder_last_hidden_state = encoder_out.last_hidden_state
if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet):
encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask(
encoder_out[0].shape[1], encoder_attention_mask
)
maxlen = int(encoder_last_hidden_state.size(1) * maxlenratio / model.config.reduction_factor)
minlen = int(encoder_last_hidden_state.size(1) * minlenratio / model.config.reduction_factor)
output_sequence = encoder_last_hidden_state.new_zeros(bsz, 1, model.config.num_mel_bins)
spectrogram = []
cross_attentions = []
past_key_values = None
idx = 0
result_spectrogram = {}
while True:
idx += 1
decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings)
decoder_out = model.speecht5.decoder.wrapped_decoder(
hidden_states=decoder_hidden_states[:, -1:],
attention_mask=None,
encoder_hidden_states=encoder_last_hidden_state,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=True,
output_attentions=output_cross_attentions,
return_dict=True,
)
if output_cross_attentions:
cross_attentions.append(torch.cat(decoder_out.cross_attentions, dim=0))
last_decoder_output = decoder_out.last_hidden_state.squeeze(1)
past_key_values = decoder_out.past_key_values
spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output)
spectrum = spectrum.view(bsz, model.config.reduction_factor, model.config.num_mel_bins)
spectrogram.append(spectrum)
new_spectrogram = spectrum[:, -1, :].view(bsz, 1, model.config.num_mel_bins)
output_sequence = torch.cat((output_sequence, new_spectrogram), dim=1)
prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output))
if idx < minlen:
continue
else:
if idx < maxlen:
meet_thresholds = torch.sum(prob, dim=-1) >= threshold
meet_indexes = torch.where(meet_thresholds)[0].tolist()
else:
meet_indexes = range(len(prob))
meet_indexes = [i for i in meet_indexes if i not in result_spectrogram]
if len(meet_indexes) > 0:
spectrograms = torch.stack(spectrogram)
spectrograms = spectrograms.transpose(0, 1).flatten(1, 2)
spectrograms = model.speech_decoder_postnet.postnet(spectrograms)
for meet_index in meet_indexes:
result_spectrogram[meet_index] = spectrograms[meet_index]
if len(result_spectrogram) >= bsz:
break
spectrograms = [result_spectrogram[i] for i in range(len(result_spectrogram))]
if not return_output_lengths:
spectrogram = spectrograms[0] if bsz == 1 else torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
if vocoder is not None:
outputs = vocoder(spectrogram)
else:
outputs = spectrogram
if output_cross_attentions:
cross_attentions = torch.cat(cross_attentions, dim=2)
if bsz > 1:
cross_attentions = cross_attentions.view(
bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
)
outputs = (outputs, cross_attentions)
else:
spectrogram_lengths = []
for i in range(bsz):
spectrogram_lengths.append(spectrograms[i].size(0))
if vocoder is None:
spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
outputs = (spectrograms, spectrogram_lengths)
else:
waveforms = vocoder(spectrograms)
waveform_lengths = [int(waveforms.size(1) / max(spectrogram_lengths)) * i for i in spectrogram_lengths]
outputs = (waveforms, waveform_lengths)
if output_cross_attentions:
cross_attentions = torch.cat(cross_attentions, dim=2)
cross_attentions = cross_attentions.view(
bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
)
outputs = (*outputs, cross_attentions)
return outputs
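For context, here is a hedged end-to-end usage sketch of the public API that wraps this generation loop. The checkpoint names and the 512-dimensional x-vector shape are assumptions based on the released SpeechT5 TTS checkpoints, and the random speaker embedding is only a placeholder (real x-vectors can be taken from a dataset such as cmu-arctic-xvectors):
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
inputs = processor(text="Hello, my dog is cute.", return_tensors="pt")
speaker_embeddings = torch.randn(1, 512)  # placeholder x-vector
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
print(speech.shape)  # 1-D waveform tensor at 16 kHz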
@add_start_docstrings(
"""SpeechT5 Model with a text encoder and a speech decoder.""",
SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
main_input_name = "input_ids"
def __init__(self, config: SpeechT5Config):
super().__init__(config)
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
" vocabulary size of the language model head. Please instantiate the model as follows:"
" `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
" your model's configuration."
)
text_encoder = SpeechT5EncoderWithTextPrenet(config)
speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
self.speecht5 = SpeechT5Model(config, text_encoder, speech_decoder)
self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)
self.post_init()
def get_encoder(self):
return self.speecht5.get_encoder()
def get_decoder(self):
return self.speecht5.get_decoder()
@add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqSpectrogramOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_values: Optional[torch.FloatTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
speaker_embeddings: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
stop_labels: Optional[torch.Tensor] = None,
):
pass
@torch.no_grad()
def generate(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
speaker_embeddings: Optional[torch.FloatTensor] = None,
threshold: float = 0.5,
minlenratio: float = 0.0,
maxlenratio: float = 20.0,
vocoder: Optional[nn.Module] = None,
output_cross_attentions: bool = False,
return_output_lengths: bool = False,
**kwargs,
):
pass
def generate_speech(
self,
input_ids: torch.LongTensor,
speaker_embeddings: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
threshold: float = 0.5,
minlenratio: float = 0.0,
maxlenratio: float = 20.0,
vocoder: Optional[nn.Module] = None,
output_cross_attentions: bool = False,
return_output_lengths: bool = False,
@add_start_docstrings(
"""SpeechT5 Model with a speech encoder and a speech decoder.""",
SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
"""
SpeechT5ForSpeechToSpeech is a specialized model for speech-to-speech tasks, incorporating
both an encoder and a decoder for processing speech data.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
self.speecht5 = SpeechT5Model(config, speech_encoder, speech_decoder)
self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)
self.post_init()
def get_encoder(self):
"""
Returns the speech encoder from the SpeechT5 model.
"""
return self.speecht5.get_encoder()
def get_decoder(self):
"""
Returns the speech decoder from the SpeechT5 model.
"""
return self.speecht5.get_decoder()
def freeze_feature_encoder(self):
"""
Calling this function will freeze the gradient computation for the feature encoder,
preventing its parameters from being updated during training.
"""
self.get_encoder().prenet.freeze_feature_encoder()
@add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqSpectrogramOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_values: Optional[torch.FloatTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
speaker_embeddings: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
stop_labels: Optional[torch.Tensor] = None,
):
"""
Forward pass of the SpeechT5 model for speech-to-speech conversion tasks.
"""
@torch.no_grad()
def generate_speech(
self,
input_values: torch.FloatTensor,
speaker_embeddings: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
threshold: float = 0.5,
minlenratio: float = 0.0,
maxlenratio: float = 20.0,
vocoder: Optional[nn.Module] = None,
output_cross_attentions: bool = False,
return_output_lengths: bool = False,
):
"""
Generates speech output based on input values, optionally using speaker embeddings.
"""
This code defines the SpeechT5ForSpeechToSpeech class for speech-to-speech conversion tasks. It inherits from SpeechT5PreTrainedModel and combines a speech encoder with prenet, a speech decoder with prenet, and the speech decoder postnet, exposing a forward pass and a `generate_speech` method.
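A usage sketch for voice conversion, parallel to the text-to-speech example earlier. The checkpoint names are assumptions based on the released SpeechT5 voice-conversion model, `waveform` is assumed to be a pre-loaded 16 kHz mono float array, and the random target-speaker x-vector is only a placeholder:
import torch
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
inputs = processor(audio=waveform, sampling_rate=16000, return_tensors="pt")
speaker_embeddings = torch.randn(1, 512)  # placeholder target-speaker x-vector
converted = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)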
class HifiGanResidualBlock(nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
super().__init__()
self.leaky_relu_slope = leaky_relu_slope
self.convs1 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=dilation[i],
padding=self.get_padding(kernel_size, dilation[i]),
)
for i in range(len(dilation))
]
)
self.convs2 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=self.get_padding(kernel_size, 1),
)
for _ in range(len(dilation))
]
)
def get_padding(self, kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
def apply_weight_norm(self):
for layer in self.convs1:
nn.utils.weight_norm(layer)
for layer in self.convs2:
nn.utils.weight_norm(layer)
def remove_weight_norm(self):
for layer in self.convs1:
nn.utils.remove_weight_norm(layer)
for layer in self.convs2:
nn.utils.remove_weight_norm(layer)
def forward(self, hidden_states):
for conv1, conv2 in zip(self.convs1, self.convs2):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv1(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv2(hidden_states)
hidden_states = hidden_states + residual
return hidden_states
@add_start_docstrings(
"""HiFi-GAN vocoder.""",
HIFIGAN_START_DOCSTRING,
)
class SpeechT5HifiGan(PreTrainedModel):
config_class = SpeechT5HifiGanConfig
main_input_name = "spectrogram"
def __init__(self, config: SpeechT5HifiGanConfig):
super().__init__(config)
self.num_kernels = len(config.resblock_kernel_sizes)
self.num_upsamples = len(config.upsample_rates)
self.conv_pre = nn.Conv1d(
config.model_in_dim,
config.upsample_initial_channel,
kernel_size=7,
stride=1,
padding=3,
)
self.upsampler = nn.ModuleList()
for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
self.upsampler.append(
nn.ConvTranspose1d(
config.upsample_initial_channel // (2**i),
config.upsample_initial_channel // (2 ** (i + 1)),
kernel_size=kernel_size,
stride=upsample_rate,
padding=(kernel_size - upsample_rate) // 2,
)
)
self.resblocks = nn.ModuleList()
for i in range(len(self.upsampler)):
channels = config.upsample_initial_channel // (2 ** (i + 1))
for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
self.register_buffer("mean", torch.zeros(config.model_in_dim))
self.register_buffer("scale", torch.ones(config.model_in_dim))
self.post_init()
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
def apply_weight_norm(self):
nn.utils.weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.weight_norm(layer)
for layer in self.resblocks:
layer.apply_weight_norm()
nn.utils.weight_norm(self.conv_post)
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.remove_weight_norm(layer)
for layer in self.resblocks:
layer.remove_weight_norm()
nn.utils.remove_weight_norm(self.conv_post)
def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
r"""
Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
waveform.
Args:
spectrogram (`torch.FloatTensor`):
Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.
Returns:
`torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
"""
if self.config.normalize_before:
spectrogram = (spectrogram - self.mean) / self.scale
is_batched = spectrogram.dim() == 3
if not is_batched:
spectrogram = spectrogram.unsqueeze(0)
hidden_states = spectrogram.transpose(2, 1)
hidden_states = self.conv_pre(hidden_states)
for i in range(self.num_upsamples):
hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
hidden_states = self.upsampler[i](hidden_states)
res_state = self.resblocks[i * self.num_kernels](hidden_states)
for j in range(1, self.num_kernels):
res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
hidden_states = res_state / self.num_kernels
hidden_states = nn.functional.leaky_relu(hidden_states)
hidden_states = self.conv_post(hidden_states)
hidden_states = torch.tanh(hidden_states)
if not is_batched:
waveform = hidden_states.squeeze(0).transpose(1, 0).view(-1)
else:
waveform = hidden_states.squeeze(1)
return waveform
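A minimal shape sketch of the vocoder contract described in the docstring above, assuming the default `SpeechT5HifiGanConfig` (80 mel bins, 4x4x4x4 upsampling). With randomly initialized weights the output is just noise, but the shapes show how frames map to samples:
import torch
from transformers import SpeechT5HifiGan, SpeechT5HifiGanConfig
config = SpeechT5HifiGanConfig()
vocoder = SpeechT5HifiGan(config)
spectrogram = torch.randn(100, config.model_in_dim)  # un-batched: (num_frames, 80)
waveform = vocoder(spectrogram)
print(waveform.shape)  # torch.Size([25600]): 100 frames * 256x total upsampling with the assumed defaults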
.\models\speecht5\number_normalizer.py
"""Number Normalizer class for SpeechT5."""
import re
class EnglishNumberNormalizer:
def __init__(self):
self.ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
self.teens = [
"",
"eleven",
"twelve",
"thirteen",
"fourteen",
"fifteen",
"sixteen",
"seventeen",
"eighteen",
"nineteen",
]
self.tens = ["", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
self.thousands = [
"",
"thousand",
"million",
"billion",
"trillion",
"quadrillion",
"quintillion",
"sextillion",
"septillion",
"octillion",
"nonillion",
"decillion",
]
self.currency_symbols = {
"$": " dollars",
"€": " euros",
"£": " pounds",
"¢": " cents",
"¥": " japanese yen",
"﷼": " saudi riyal",
"₹": " indian rupees",
"₽": " russian rubles",
"฿": " thai baht",
"₺": " turkish liras",
"₴": " ukrainian hryvnia",
"₣": " swiss francs",
"₡": " costa rican colon",
"₱": " philippine peso",
"₪": " israeli shekels",
"₮": " mongolian tögrög",
"₩": " south korean won",
"₦": " nigerian naira",
"₫": " vietnamese Đồng",
}
def spell_number(self, num):
if num == 0:
return "zero"
parts = []
for i in range(0, len(self.thousands)):
if num % 1000 != 0:
part = ""
hundreds = num % 1000 // 100
tens_units = num % 100
if hundreds > 0:
part += self.ones[hundreds] + " hundred"
if tens_units > 0:
part += " and "
if tens_units >= 11 and tens_units <= 19:
part += self.teens[tens_units - 10]
else:
tens_digit = self.tens[tens_units // 10]
ones_digit = self.ones[tens_units % 10]
if tens_digit:
part += tens_digit
if ones_digit:
if tens_digit:
part += " "
part += ones_digit
parts.append(part)
num //= 1000
return " ".join(reversed(parts))
def convert(self, number):
"""
Converts an individual number passed in string form to spelt-out form
"""
if "." in number:
integer_part, decimal_part = number.split(".")
else:
integer_part, decimal_part = number, "00"
currency_symbol = ""
for symbol, name in self.currency_symbols.items():
if integer_part.startswith(symbol):
currency_symbol = name
integer_part = integer_part[len(symbol) :]
break
if integer_part.startswith("-"):
if integer_part[1:].startswith(symbol):
currency_symbol = name
integer_part = "-" + integer_part[len(symbol) + 1 :]
break
minus_prefix = ""
if integer_part.startswith("-"):
minus_prefix = "minus "
integer_part = integer_part[1:]
elif integer_part.startswith("minus"):
minus_prefix = "minus "
integer_part = integer_part[len("minus") :]
percent_suffix = ""
if "%" in integer_part or "%" in decimal_part:
percent_suffix = " percent"
integer_part = integer_part.replace("%", "")
decimal_part = decimal_part.replace("%", "")
integer_part = integer_part.zfill(3 * ((len(integer_part) - 1) // 3 + 1))
parts = []
for i in range(0, len(integer_part), 3):
chunk = int(integer_part[i : i + 3])
if chunk > 0:
part = self.spell_number(chunk)
unit = self.thousands[len(integer_part[i:]) // 3 - 1]
if unit:
part += " " + unit
parts.append(part)
spelled_integer = " ".join(parts)
if decimal_part == "00":
return (
f"{minus_prefix}{spelled_integer}{percent_suffix}{currency_symbol}"
if minus_prefix or currency_symbol
else f"{spelled_integer}{percent_suffix}"
)
else:
spelled_decimal = " ".join([self.spell_number(int(digit)) for digit in decimal_part])
return (
f"{minus_prefix}{spelled_integer} point {spelled_decimal}{percent_suffix}{currency_symbol}"
if minus_prefix or currency_symbol
else f"{minus_prefix}{spelled_integer} point {spelled_decimal}{percent_suffix}"
)
def __call__(self, text):
"""
Convert numbers / number-like quantities in a string to their spelt-out counterparts
"""
pattern = r"(?<!\w)(-?\$?\€?\£?\¢?\¥?\₹?\₽?\฿?\₺?\₴?\₣?\₡?\₱?\₪?\₮?\₩?\₦?\₫?\﷼?\d+(?:\.\d{1,2})?%?)(?!\w)"
text = re.sub(r"(\d+,\d+)", lambda match: match.group(1).replace(",", ""), text)
converted_text = re.sub(pattern, lambda match: self.convert(match.group(1)), text)
converted_text = re.sub(" +", " ", converted_text)
return converted_text
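A short illustration of the normalizer in use; the expected outputs, shown as comments, follow from the rules above (assuming this class is importable from `transformers.models.speecht5.number_normalizer`):
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
normalizer = EnglishNumberNormalizer()
print(normalizer("I paid $20 for 3 books"))  # "I paid twenty dollars for three books"
print(normalizer("-2.5%"))                   # "minus two point five percent"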
.\models\speecht5\processing_speecht5.py
from ...processing_utils import ProcessorMixin
class SpeechT5Processor(ProcessorMixin):
"""
Constructs a SpeechT5 processor which wraps a feature extractor and a tokenizer into a single processor.
[`SpeechT5Processor`] offers all the functionalities of [`SpeechT5FeatureExtractor`] and [`SpeechT5Tokenizer`]. See
the docstrings of [`~SpeechT5Processor.__call__`] and [`~SpeechT5Processor.decode`] for more information.
Args:
feature_extractor (`SpeechT5FeatureExtractor`):
An instance of [`SpeechT5FeatureExtractor`]. The feature extractor is a required input.
tokenizer (`SpeechT5Tokenizer`):
An instance of [`SpeechT5Tokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "SpeechT5FeatureExtractor"
tokenizer_class = "SpeechT5Tokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def __call__(self, *args, **kwargs):
"""
Processes audio and text input, as well as audio and text targets.
You can process audio by using the argument `audio`, or process audio targets by using the argument
`audio_target`. This forwards the arguments to SpeechT5FeatureExtractor's
[`~SpeechT5FeatureExtractor.__call__`].
You can process text by using the argument `text`, or process text labels by using the argument `text_target`.
This forwards the arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.__call__`].
Valid input combinations are:
- `text` only
- `audio` only
- `text_target` only
- `audio_target` only
- `text` and `audio_target`
- `audio` and `audio_target`
- `text` and `text_target`
- `audio` and `text_target`
Please refer to the docstring of the above two methods for more information.
"""
audio = kwargs.pop("audio", None)
text = kwargs.pop("text", None)
text_target = kwargs.pop("text_target", None)
audio_target = kwargs.pop("audio_target", None)
sampling_rate = kwargs.pop("sampling_rate", None)
if audio is not None and text is not None:
raise ValueError(
"Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?"
)
if audio_target is not None and text_target is not None:
raise ValueError(
"Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?"
)
if audio is None and audio_target is None and text is None and text_target is None:
raise ValueError(
"You need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process."
)
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
elif text is not None:
inputs = self.tokenizer(text, **kwargs)
else:
inputs = None
if audio_target is not None:
targets = self.feature_extractor(audio_target=audio_target, *args, sampling_rate=sampling_rate, **kwargs)
labels = targets["input_values"]
elif text_target is not None:
targets = self.tokenizer(text_target, **kwargs)
labels = targets["input_ids"]
else:
targets = None
if inputs is None:
return targets
if targets is not None:
inputs["labels"] = labels
decoder_attention_mask = targets.get("attention_mask")
if decoder_attention_mask is not None:
inputs["decoder_attention_mask"] = decoder_attention_mask
return inputs
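A hedged usage sketch of the processor for an ASR-style batch; the checkpoint name and the pre-loaded 16 kHz `waveform` and transcript are assumptions used only for illustration:
from transformers import SpeechT5Processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
batch = processor(audio=waveform, text_target="the transcript", sampling_rate=16000, return_tensors="pt")
print(batch["input_values"].shape)  # padded waveform features
print(batch["labels"].shape)        # tokenized transcript ids used as labels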
def pad(self, *args, **kwargs):
"""
Collates the audio and text inputs, as well as their targets, into a padded batch.
Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded
by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`].
Valid input combinations are:
- `input_ids` only
- `input_values` only
- `labels` only, either log-mel spectrograms or text tokens
- `input_ids` and log-mel spectrogram `labels`
- `input_values` and text `labels`
Please refer to the docstring of the above two methods for more information.
"""
input_values = kwargs.pop("input_values", None)
input_ids = kwargs.pop("input_ids", None)
labels = kwargs.pop("labels", None)
if input_values is not None and input_ids is not None:
raise ValueError("Cannot process both `input_values` and `input_ids` inputs.")
if input_values is None and input_ids is None and labels is None:
raise ValueError(
"You need to specify either an `input_values`, `input_ids`, or `labels` input to be padded."
)
if input_values is not None:
inputs = self.feature_extractor.pad(input_values, *args, **kwargs)
elif input_ids is not None:
inputs = self.tokenizer.pad(input_ids, **kwargs)
else:
inputs = None
if labels is not None:
if "input_ids" in labels or (isinstance(labels, list) and "input_ids" in labels[0]):
targets = self.tokenizer.pad(labels, **kwargs)
labels = targets["input_ids"]
else:
feature_size_hack = self.feature_extractor.feature_size
self.feature_extractor.feature_size = self.feature_extractor.num_mel_bins
targets = self.feature_extractor.pad(labels, *args, **kwargs)
self.feature_extractor.feature_size = feature_size_hack
labels = targets["input_values"]
else:
targets = None
if inputs is None:
return targets
if targets is not None:
inputs["labels"] = labels
decoder_attention_mask = targets.get("attention_mask")
if decoder_attention_mask is not None:
inputs["decoder_attention_mask"] = decoder_attention_mask
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
.\models\speecht5\tokenization_speecht5.py
"""Tokenization class for SpeechT5."""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from .number_normalizer import EnglishNumberNormalizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spm_char.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/speecht5_asr": "https://huggingface.co/microsoft/speecht5_asr/resolve/main/spm_char.model",
"microsoft/speecht5_tts": "https://huggingface.co/microsoft/speecht5_tts/resolve/main/spm_char.model",
"microsoft/speecht5_vc": "https://huggingface.co/microsoft/speecht5_vc/resolve/main/spm_char.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/speecht5_asr": 1024,
"microsoft/speecht5_tts": 1024,
"microsoft/speecht5_vc": 1024,
}
class SpeechT5Tokenizer(PreTrainedTokenizer):
"""
Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the [SentencePiece](https://github.com/google/sentencepiece) file (generally with a *.spm* extension) that contains the vocabulary needed to instantiate the tokenizer.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning-of-sequence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end-of-sequence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary is converted to this token.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
normalize (`bool`, *optional*, defaults to `False`):
Whether to convert numeric quantities in the text to their spelt-out English counterparts.
sp_model_kwargs (`dict`, *optional*):
Arguments passed to the `SentencePieceProcessor.__init__()` method, which can be used to set SentencePiece options such as subword regularization (`enable_sampling`), the `nbest_size` parameter, etc.
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameter for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: Samples from the nbest_size results.
- `nbest_size < 0`: Assumes nbest_size is infinite and samples from all hypotheses (lattice) using the forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and the dropout probability of merge operations for BPE-dropout.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor used for every conversion (string, tokens and IDs).
"""
# Class-level constants and mappings
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
normalize=False,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
# Instantiates a new SentencePiece-based tokenizer
) -> None:
# Store the constructor arguments and load the SentencePiece model
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs  # default to an empty dict when not provided
self.vocab_file = vocab_file  # path to the vocabulary file
self.normalize = normalize  # whether to normalize the input text
self._normalizer = None  # the normalizer is created lazily
# Initialize the SentencePieceProcessor with the given arguments
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)  # load the vocabulary file
# Call the parent class constructor with the relevant arguments
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
normalize=normalize,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
# Prepare the text before tokenization
normalize = kwargs.pop("normalize", self.normalize)  # fall back to the instance setting when `normalize` is not passed
if is_split_into_words:
text = " " + text  # prepend a space when the text is already split into words
if normalize:
text = self.normalizer(text)  # spell out numbers when normalization is requested
return (text, kwargs)  # return the processed text and the remaining kwargs
@property
def vocab_size(self):
# The vocabulary size is the number of pieces in the SentencePiece model
return self.sp_model.get_piece_size()
@property
def normalizer(self):
# Return the normalizer, lazily creating an English number normalizer on first use
if self._normalizer is None:
self._normalizer = EnglishNumberNormalizer()
return self._normalizer
@normalizer.setter
def normalizer(self, value):
# Set the normalizer object
self._normalizer = value
def get_vocab(self):
# Build the vocabulary dict mapping each token (string) to its ID
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)  # also include the added special tokens
return vocab
def __getstate__(self):
# Get the object state for serialization
state = self.__dict__.copy()
state["sp_model"] = None  # drop the SentencePieceProcessor, which is not directly picklable
return state
def __setstate__(self, d):
# Restore the object state when deserializing
self.__dict__ = d
# For backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
# Re-create the SentencePieceProcessor and reload the vocabulary file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
# Tokenize the text with SentencePiece and return the pieces as a list of strings
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# Convert a token (string) to its ID
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
# Convert an ID back to its token (string)
token = self.sp_model.IdToPiece(index)
return token
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) into a single string."""
# Buffer holding the current run of non-special sub-tokens
current_sub_tokens = []
# Output string being built
out_string = ""
# Tracks whether the previous token was a special token
prev_is_special = False
# Walk through the token sequence
for token in tokens:
# Check whether the current token is a special token
if token in self.all_special_tokens:
# Add a separating space if the previous token was not a special token
if not prev_is_special:
out_string += " "
# Decode the buffered sub-tokens and append the special token verbatim
out_string += self.sp_model.decode(current_sub_tokens) + token
# Remember that the current token was a special token
prev_is_special = True
# Reset the sub-token buffer
current_sub_tokens = []
else:
# Accumulate the current token in the sub-token buffer
current_sub_tokens.append(token)
# Remember that the current token was not a special token
prev_is_special = False
# Decode whatever is left in the buffer and append it
out_string += self.sp_model.decode(current_sub_tokens)
# Return the output string without leading/trailing whitespace
return out_string.strip()
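Illustrative sketch of the decoding loop above (the pieces are whatever the tokenizer produced; the exact output depends on the SentencePiece model): non-special pieces are decoded by `sp_model.decode`, while special tokens such as `<pad>` are passed through verbatim with a separating space.
```
pieces = tokenizer.tokenize("hello") + ["<pad>"] + tokenizer.tokenize("world")
print(tokenizer.convert_tokens_to_string(pieces))
```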
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
"""Build model inputs from a sequence by appending eos_token_id."""
# With a single sequence, simply append eos_token_id
if token_ids_1 is None:
return token_ids_0 + [self.eos_token_id]
# With a pair of sequences, concatenate them and append eos_token_id
return token_ids_0 + token_ids_1 + [self.eos_token_id]
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
# If the input already contains special tokens, defer to the parent implementation
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# The appended eos token is marked with 1
suffix_ones = [1]
# Single sequence: zeros for every input token plus the suffix
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + suffix_ones
# Sequence pair: zeros for both sequences plus the suffix
return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
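A small sketch with made-up token IDs showing the two methods above: only the appended `eos` counts as a special token.
```
print(tokenizer.build_inputs_with_special_tokens([10, 11, 12]))  # [10, 11, 12, eos_token_id]
print(tokenizer.get_special_tokens_mask([10, 11, 12]))           # [0, 0, 0, 1]
```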
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# If the save directory does not exist, log an error and bail out
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Copy the current vocabulary file to the output path if it exists and differs from the target
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Otherwise, if the original vocabulary file is gone, write the serialized SentencePiece model instead
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
# Return the output vocabulary file path as a tuple
return (out_vocab_file,)
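Usage sketch (the target directory is arbitrary and must already exist, otherwise the method only logs an error); `save_vocabulary` is normally invoked indirectly via `save_pretrained`.
```
import os

os.makedirs("./speecht5_tok", exist_ok=True)
print(tokenizer.save_vocabulary("./speecht5_tok"))  # tuple containing the written vocabulary file path
```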
.\models\speecht5\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_torch_available,
)
_import_structure = {
"configuration_speecht5": [
"SPEECHT5_PRETRAINED_CONFIG_ARCHIVE_MAP",
"SPEECHT5_PRETRAINED_HIFIGAN_CONFIG_ARCHIVE_MAP",
"SpeechT5Config",
"SpeechT5HifiGanConfig",
],
"feature_extraction_speecht5": ["SpeechT5FeatureExtractor"],
"processing_speecht5": ["SpeechT5Processor"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_speecht5"] = ["SpeechT5Tokenizer"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_speecht5"] = [
"SPEECHT5_PRETRAINED_MODEL_ARCHIVE_LIST",
"SpeechT5ForSpeechToText",
"SpeechT5ForSpeechToSpeech",
"SpeechT5ForTextToSpeech",
"SpeechT5Model",
"SpeechT5PreTrainedModel",
"SpeechT5HifiGan",
]
if TYPE_CHECKING:
from .configuration_speecht5 import (
SPEECHT5_PRETRAINED_CONFIG_ARCHIVE_MAP,
SPEECHT5_PRETRAINED_HIFIGAN_CONFIG_ARCHIVE_MAP,
SpeechT5Config,
SpeechT5HifiGanConfig,
)
from .feature_extraction_speecht5 import SpeechT5FeatureExtractor
from .processing_speecht5 import SpeechT5Processor
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_speecht5 import SpeechT5Tokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_speecht5 import (
SPEECHT5_PRETRAINED_MODEL_ARCHIVE_LIST,
SpeechT5ForSpeechToSpeech,
SpeechT5ForSpeechToText,
SpeechT5ForTextToSpeech,
SpeechT5HifiGan,
SpeechT5Model,
SpeechT5PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\speech_encoder_decoder\configuration_speech_encoder_decoder.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import AutoConfig
logger = logging.get_logger(__name__)
class SpeechEncoderDecoderConfig(PretrainedConfig):
r"""
[`SpeechEncoderDecoderConfig`] is the configuration class to store the configuration of a [`SpeechEncoderDecoderModel`].
It is used to instantiate an Encoder-Decoder model according to the specified arguments, defining the encoder and decoder configs.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
kwargs (*optional*):
Dictionary of keyword arguments. Notably:
- **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines the encoder config.
- **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines the decoder config.
Examples:
```
>>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel
>>> # Initializing a Wav2Vec2 & BERT style configuration
>>> config_encoder = Wav2Vec2Config()
>>> config_decoder = BertConfig()
>>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> # Initializing a Wav2Vec2Bert model from Wav2Vec2 & google-bert/bert-base-uncased style configurations
>>> model = SpeechEncoderDecoderModel(config=config)
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
>>> # Set the decoder config to causal lm
>>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True
>>> # Saving the model, including its configuration
>>> model.save_pretrained("my-model")
>>> # Loading the model and config from the pretrained folder
>>> encoder_decoder_config = SpeechEncoderDecoderConfig.from_pretrained("my-model")
>>> model = SpeechEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
```"""
model_type = "speech-encoder-decoder"
is_composition = True
def __init__(self, **kwargs):
super().__init__(**kwargs)
if "encoder" not in kwargs or "decoder" not in kwargs:
raise ValueError(
f"A configuraton of type {self.model_type} cannot be instantiated because not both `encoder` and"
f" `decoder` sub-configurations are passed, but only {kwargs}"
)
encoder_config = kwargs.pop("encoder")
encoder_model_type = encoder_config.pop("model_type")
decoder_config = kwargs.pop("decoder")
decoder_model_type = decoder_config.pop("model_type")
self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
self.is_encoder_decoder = True
@classmethod
def from_encoder_decoder_configs(
cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
) -> PretrainedConfig:
"""
Instantiate a [`SpeechEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder configuration and a decoder configuration.
Returns:
[`SpeechEncoderDecoderConfig`]: An instance of a configuration object
"""
logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
decoder_config.is_decoder = True
decoder_config.add_cross_attention = True
return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
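A minimal sketch of the constructor contract described above: the plain constructor requires both sub-configurations (as dicts), while `from_encoder_decoder_configs` additionally flips the decoder flags.
```
from transformers import BertConfig, SpeechEncoderDecoderConfig, Wav2Vec2Config

config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(Wav2Vec2Config(), BertConfig())
print(config.decoder.is_decoder, config.decoder.add_cross_attention)  # True True

# Omitting either sub-configuration raises the ValueError shown in __init__:
# SpeechEncoderDecoderConfig()  # ValueError: A configuration of type speech-encoder-decoder ...
```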
.\models\speech_encoder_decoder\convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py
import argparse
import fairseq
import torch
from torch import nn
from transformers import (
MBart50Tokenizer,
MBartConfig,
MBartForCausalLM,
SpeechEncoderDecoderConfig,
SpeechEncoderDecoderModel,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
Wav2Vec2Model,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"quantizer.weight_proj": "quantizer.weight_proj",
"quantizer.vars": "quantizer.codevectors",
"project_q": "project_q",
"final_proj": "project_hid",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
TOP_LEVEL_KEYS = [
"lm_head",
"quantizer.weight_proj",
"quantizer.codevectors",
"project_q",
"project_hid",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights_wav2vec2(fairseq_model, hf_model):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.feature_extractor
adapter = hf_model.adapter
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
elif any(x in name for x in ["adaptor", "w2v_encoder.proj.", "w2v_proj_ln."]):
load_adapter(name, value, adapter, unused_weights)
is_used = True
else:
for key, mapped_key in MAPPING.items():
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
weight_type = "weight"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def load_adapter(full_name, value, adapter, unused_weights):
name = full_name.split("adaptor.")[-1]
items = name.split(".")
if items[1].isdigit():
layer_id = int(items[1])
else:
layer_id = None
if "adaptor" not in full_name:
if "proj_ln" in full_name:
if "bias" in name:
assert (
value.shape == adapter.proj_layer_norm.bias.data.shape
), f"{full_name} has size {value.shape}, but {adapter.proj_layer_norm.bias.data.shape} was found."
adapter.proj_layer_norm.bias.data = value
logger.info(f"Adapter proj layer norm bias was initialized from {full_name}.")
if "weight" in name:
assert (
value.shape == adapter.proj_layer_norm.weight.data.shape
), f"{full_name} has size {value.shape}, but {adapter.proj_layer_norm.weight.data.shape} was found."
adapter.proj_layer_norm.weight.data = value
else:
if "bias" in name:
assert (
value.shape == adapter.proj.bias.data.shape
), f"{full_name} has size {value.shape}, but {adapter.proj.bias.data.shape} was found."
adapter.proj.bias.data = value
logger.info(f"Adapter proj layer bias was initialized from {full_name}.")
if "weight" in name:
assert (
value.shape == adapter.proj.weight.data.shape
), f"{full_name} has size {value.shape}, but {adapter.proj.weight.data.shape} was found."
adapter.proj.weight.data = value
elif isinstance(layer_id, int):
if "bias" in name:
assert (
value.shape == adapter.layers[layer_id].conv.bias.data.shape
), f"{full_name} has size {value.shape}, but {adapter.layers[layer_id].conv.bias.data.shape} was found."
adapter.layers[layer_id].conv.bias.data = value
logger.info(f"Adapter layer {layer_id} bias was initialized from {full_name}.")
elif "weight" in name:
assert (
value.shape == adapter.layers[layer_id].conv.weight.data.shape
), f"{full_name} has size {value.shape}, but {adapter.layers[layer_id].conv.weight.data.shape} was found."
adapter.layers[layer_id].conv.weight.data = value
logger.info(f"Adapter layer {layer_id} bias was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
@torch.no_grad()
def convert_wav2vec2_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
dict_path,
config_yaml_path,
encoder_config_path,
decoder_config_path,
add_adapter,
adapter_kernel_size,
adapter_stride,
decoder_start_token_id,
encoder_output_dim,
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
encoder_config = Wav2Vec2Config.from_pretrained(
encoder_config_path,
add_adapter=True,
adapter_stride=adapter_stride,
adapter_kernel_size=adapter_kernel_size,
token=True,
output_hidden_size=encoder_output_dim,
)
decoder_config = MBartConfig.from_pretrained(decoder_config_path)
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path],
arg_overrides={
"config_yaml": config_yaml_path,
"data": "/".join(dict_path.split("/")[:-1]),
"w2v_path": checkpoint_path,
"load_pretrained_decoder_from": None,
},
)
model = model[0].eval()
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(encoder_config_path, token=True)
hf_encoder = Wav2Vec2Model(encoder_config)
recursively_load_weights_wav2vec2(model.encoder, hf_encoder)
hf_decoder = MBartForCausalLM(decoder_config)
missing_keys, unexpected_keys = hf_decoder.model.decoder.load_state_dict(model.decoder.state_dict(), strict=False)
logger.warning(f"The following keys are missing when loading the decoder weights: {missing_keys}")
logger.warning(f"The following keys are unexpected when loading the decoder weights: {unexpected_keys}")
hf_wav2vec = SpeechEncoderDecoderModel(encoder=hf_encoder, decoder=hf_decoder)
hf_wav2vec.config.tie_word_embeddings = False
tokenizer = MBart50Tokenizer(dict_path)
tokenizer.save_pretrained(pytorch_dump_folder_path)
config = hf_wav2vec.config.to_dict()
config["pad_token_id"] = tokenizer.pad_token_id
config["bos_token_id"] = tokenizer.bos_token_id
config["eos_token_id"] = tokenizer.eos_token_id
config["tokenizer_class"] = "mbart50"
config["feature_extractor_type"] = "wav2vec2"
config["decoder_start_token_id"] = tokenizer.eos_token_id
config["forced_bos_token_id"] = 250004
config["forced_eos_token_id"] = tokenizer.eos_token_id
hf_wav2vec.config = SpeechEncoderDecoderConfig.from_dict(config)
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_yaml_path", default=None, type=str, help="Path to yaml file of fine-tuned model")
parser.add_argument(
"--encoder_config_path",
default="facebook/wav2vec2-xls-r-1b",
type=str,
help="Path to hf encoder wav2vec2 checkpoint config",
)
parser.add_argument(
"--decoder_config_path",
default="facebook/mbart-large-50-one-to-many-mmt",
type=str,
help="Path to hf decoder checkpoint config",
)
parser.add_argument("--add_adapter", default=True, type=bool, help="whether to add model adapter layers")
parser.add_argument("--adapter_stride", default=2, type=int, help="stride of adapter layers")
parser.add_argument("--adapter_kernel_size", default=3, type=int, help="kernel size of adapter layers")
parser.add_argument("--encoder_output_dim", default=1024, type=int, help="encoder output dim")
parser.add_argument("--start_token_id", default=250004, type=int, help="`decoder_start_token_id` of model config")
args = parser.parse_args()
convert_wav2vec2_checkpoint(
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.dict_path,
args.config_yaml_path,
encoder_config_path=args.encoder_config_path,
decoder_config_path=args.decoder_config_path,
add_adapter=args.add_adapter,
adapter_kernel_size=args.adapter_kernel_size,
adapter_stride=args.adapter_stride,
decoder_start_token_id=args.start_token_id,
encoder_output_dim=args.encoder_output_dim,
)
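A hedged example of driving the conversion directly from Python (every path below is a placeholder for locally available fairseq artifacts; the remaining values mirror the argparse defaults above):
```
convert_wav2vec2_checkpoint(
    checkpoint_path="checkpoint_best.pt",
    pytorch_dump_folder_path="./wav2vec2-mbart50",
    dict_path="dict.mbart50.txt",
    config_yaml_path="config.yaml",
    encoder_config_path="facebook/wav2vec2-xls-r-1b",
    decoder_config_path="facebook/mbart-large-50-one-to-many-mmt",
    add_adapter=True,
    adapter_kernel_size=3,
    adapter_stride=2,
    decoder_start_token_id=250004,
    encoder_output_dim=1024,
)
```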