Transformers Source Code Analysis (Part 48)
.\models\falcon\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_falcon": ["FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP", "FalconConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_falcon"] = [
"FALCON_PRETRAINED_MODEL_ARCHIVE_LIST",
"FalconForCausalLM",
"FalconModel",
"FalconPreTrainedModel",
"FalconForSequenceClassification",
"FalconForTokenClassification",
"FalconForQuestionAnswering",
]
if TYPE_CHECKING:
from .configuration_falcon import FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP, FalconConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_falcon import (
FALCON_PRETRAINED_MODEL_ARCHIVE_LIST,
FalconForCausalLM,
FalconForQuestionAnswering,
FalconForSequenceClassification,
FalconForTokenClassification,
FalconModel,
FalconPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
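The `_LazyModule` shell above keeps `import transformers.models.falcon` cheap: `configuration_falcon` and `modeling_falcon` are only executed once a name registered in `_import_structure` is actually accessed. A minimal, illustrative sketch of that behavior (not part of the source file; it assumes a `transformers` install with torch support):

```python
# Illustrative only: attribute access, not the import itself, loads the heavy
# submodules registered in _import_structure above.
import importlib

falcon = importlib.import_module("transformers.models.falcon")

config_cls = falcon.FalconConfig        # first access triggers the configuration_falcon import
model_cls = falcon.FalconForCausalLM    # triggers modeling_falcon (requires torch)

print(config_cls.model_type)            # "falcon"
```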
.\models\fastspeech2_conformer\configuration_fastspeech2_conformer.py
""" FastSpeech2Conformer model configuration"""
from typing import Dict
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer": "https://huggingface.co/espnet/fastspeech2_conformer/raw/main/config.json",
}
FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_hifigan/raw/main/config.json",
}
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer_with_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_with_hifigan/raw/main/config.json",
}
class FastSpeech2ConformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerModel`]. It is used to
instantiate a FastSpeech2Conformer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2Conformer [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerConfig
>>> # Initializing a FastSpeech2Conformer style configuration
>>> configuration = FastSpeech2ConformerConfig()
>>> # Initializing a model from the FastSpeech2Conformer style configuration
>>> model = FastSpeech2ConformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "fastspeech2_conformer"
attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
def __init__(
self,
hidden_size=384,
vocab_size=78,
num_mel_bins=80,
encoder_num_attention_heads=2,
encoder_layers=4,
encoder_linear_units=1536,
decoder_layers=4,
decoder_num_attention_heads=2,
decoder_linear_units=1536,
speech_decoder_postnet_layers=5,
speech_decoder_postnet_units=256,
speech_decoder_postnet_kernel=5,
positionwise_conv_kernel_size=3,
encoder_normalize_before=False,
decoder_normalize_before=False,
encoder_concat_after=False,
decoder_concat_after=False,
reduction_factor=1,
speaking_speed=1.0,
use_macaron_style_in_conformer=True,
use_cnn_in_conformer=True,
encoder_kernel_size=7,
decoder_kernel_size=31,
duration_predictor_layers=2,
duration_predictor_channels=256,
duration_predictor_kernel_size=3,
energy_predictor_layers=2,
energy_predictor_channels=256,
energy_predictor_kernel_size=3,
energy_predictor_dropout=0.5,
energy_embed_kernel_size=1,
energy_embed_dropout=0.0,
stop_gradient_from_energy_predictor=False,
pitch_predictor_layers=5,
pitch_predictor_channels=256,
pitch_predictor_kernel_size=5,
pitch_predictor_dropout=0.5,
pitch_embed_kernel_size=1,
pitch_embed_dropout=0.0,
stop_gradient_from_pitch_predictor=True,
encoder_dropout_rate=0.2,
encoder_positional_dropout_rate=0.2,
encoder_attention_dropout_rate=0.2,
decoder_dropout_rate=0.2,
decoder_positional_dropout_rate=0.2,
decoder_attention_dropout_rate=0.2,
duration_predictor_dropout_rate=0.2,
speech_decoder_postnet_dropout=0.5,
max_source_positions=5000,
use_masking=True,
use_weighted_masking=False,
num_speakers=None,
num_languages=None,
speaker_embed_dim=None,
is_encoder_decoder=True,
**kwargs,
):
...  # __init__ body not shown in this excerpt: the arguments above are stored as attributes of the same name and the parent `PretrainedConfig.__init__` is called with `**kwargs`
class FastSpeech2ConformerHifiGanConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerHifiGanModel`]. It is used to
instantiate a FastSpeech2Conformer HiFi-GAN vocoder model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2Conformer
[espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_in_dim (`int`, *optional*, defaults to 80):
The number of frequency bins in the input log-mel spectrogram.
upsample_initial_channel (`int`, *optional*, defaults to 512):
The number of input channels into the upsampling network.
upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 2, 2]`):
A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
length of *upsample_rates* defines the number of convolutional layers and has to match the length of
*upsample_kernel_sizes*.
upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[16, 16, 4, 4]`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
*upsample_rates*.
resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
fusion (MRF) module.
resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
multi-receptive field fusion (MRF) module.
initializer_range (`float`, *optional*, defaults to 0.01):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
leaky_relu_slope (`float`, *optional*, defaults to 0.1):
The angle of the negative slope used by the leaky ReLU activation.
normalize_before (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.
Example:
```
>>> from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig
>>> # Initializing a FastSpeech2ConformerHifiGan configuration
>>> configuration = FastSpeech2ConformerHifiGanConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = FastSpeech2ConformerHifiGan(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "hifigan"
def __init__(
self,
model_in_dim=80,
upsample_initial_channel=512,
upsample_rates=[8, 8, 2, 2],
upsample_kernel_sizes=[16, 16, 4, 4],
resblock_kernel_sizes=[3, 7, 11],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
initializer_range=0.01,
leaky_relu_slope=0.1,
normalize_before=True,
**kwargs,
):
self.model_in_dim = model_in_dim
self.upsample_initial_channel = upsample_initial_channel
self.upsample_rates = upsample_rates
self.upsample_kernel_sizes = upsample_kernel_sizes
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.initializer_range = initializer_range
self.leaky_relu_slope = leaky_relu_slope
self.normalize_before = normalize_before
super().__init__(**kwargs)
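One useful property of these defaults: the product of `upsample_rates` is the number of waveform samples the vocoder produces per input spectrogram frame, i.e. 8 · 8 · 2 · 2 = 256. A small sketch checking this (illustrative; assumes `FastSpeech2ConformerHifiGanConfig` is importable from `transformers`):

```python
import math

from transformers import FastSpeech2ConformerHifiGanConfig

config = FastSpeech2ConformerHifiGanConfig()
# Each mel frame is expanded by the product of the upsampling strides.
print(math.prod(config.upsample_rates))  # 256

# upsample_rates and upsample_kernel_sizes must have the same length, since each
# (rate, kernel) pair defines one transposed-convolution layer in the vocoder.
assert len(config.upsample_rates) == len(config.upsample_kernel_sizes)
```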
class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerWithHifiGan`]. It is used to
instantiate a `FastSpeech2ConformerWithHifiGanModel` model according to the specified sub-models configurations,
defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2ConformerModel [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) and
FastSpeech2ConformerHifiGan
[espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architectures.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_config (`typing.Dict`, *optional*):
Configuration of the text-to-speech model.
vocoder_config (`typing.Dict`, *optional*):
Configuration of the vocoder model.
model_config ([`FastSpeech2ConformerConfig`], *optional*):
Configuration of the text-to-speech model.
vocoder_config ([`FastSpeech2ConformerHiFiGanConfig`], *optional*):
Configuration of the vocoder model.
Example:
```
>>> from transformers import (
... FastSpeech2ConformerConfig,
... FastSpeech2ConformerHifiGanConfig,
... FastSpeech2ConformerWithHifiGanConfig,
... FastSpeech2ConformerWithHifiGan,
... )
>>> # Initializing FastSpeech2ConformerWithHifiGan sub-modules configurations.
>>> model_config = FastSpeech2ConformerConfig()
>>> vocoder_config = FastSpeech2ConformerHifiGanConfig()
>>> # Initializing a FastSpeech2ConformerWithHifiGan module style configuration
>>> configuration = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict())
>>> # Initializing a model (with random weights)
>>> model = FastSpeech2ConformerWithHifiGan(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "fastspeech2_conformer_with_hifigan"
is_composition = True
def __init__(
self,
model_config: Dict = None,
vocoder_config: Dict = None,
**kwargs,
):
if model_config is None:
model_config = {}
logger.info("model_config is None. initializing the model with default values.")
if vocoder_config is None:
vocoder_config = {}
logger.info("vocoder_config is None. initializing the coarse model with default values.")
self.model_config = FastSpeech2ConformerConfig(**model_config)
self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config)
super().__init__(**kwargs)
.\models\fastspeech2_conformer\convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
import re
from pathlib import Path
from tempfile import TemporaryDirectory
import torch
import yaml
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerModel,
FastSpeech2ConformerTokenizer,
logging
)
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
CONFIG_MAPPING = {
"adim": "hidden_size",
"aheads": "num_attention_heads",
"conformer_dec_kernel_size": "decoder_kernel_size",
"conformer_enc_kernel_size": "encoder_kernel_size",
"decoder_normalize_before": "decoder_normalize_before",
"dlayers": "decoder_layers",
"dunits": "decoder_linear_units",
"duration_predictor_chans": "duration_predictor_channels",
"duration_predictor_kernel_size": "duration_predictor_kernel_size",
"duration_predictor_layers": "duration_predictor_layers",
"elayers": "encoder_layers",
"encoder_normalize_before": "encoder_normalize_before",
"energy_embed_dropout": "energy_embed_dropout",
"energy_embed_kernel_size": "energy_embed_kernel_size",
"energy_predictor_chans": "energy_predictor_channels",
"energy_predictor_dropout": "energy_predictor_dropout",
"energy_predictor_kernel_size": "energy_predictor_kernel_size",
"energy_predictor_layers": "energy_predictor_layers",
"eunits": "encoder_linear_units",
"pitch_embed_dropout": "pitch_embed_dropout",
"pitch_embed_kernel_size": "pitch_embed_kernel_size",
"pitch_predictor_chans": "pitch_predictor_channels",
"pitch_predictor_dropout": "pitch_predictor_dropout",
"pitch_predictor_kernel_size": "pitch_predictor_kernel_size",
"pitch_predictor_layers": "pitch_predictor_layers",
"positionwise_conv_kernel_size": "positionwise_conv_kernel_size",
"postnet_chans": "speech_decoder_postnet_units",
"postnet_filts": "speech_decoder_postnet_kernel",
"postnet_layers": "speech_decoder_postnet_layers",
"reduction_factor": "reduction_factor",
"stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor",
"stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor",
"transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate",
"transformer_dec_dropout_rate": "decoder_dropout_rate",
"transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate",
"transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate",
"transformer_enc_dropout_rate": "encoder_dropout_rate",
"transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate",
"use_cnn_in_conformer": "use_cnn_in_conformer",
"use_macaron_style_in_conformer": "use_macaron_style_in_conformer",
"use_masking": "use_masking",
"use_weighted_masking": "use_weighted_masking",
"idim": "input_dim",
"odim": "num_mel_bins",
"spk_embed_dim": "speaker_embed_dim",
"langs": "num_languages",
"spks": "num_speakers",
}
def remap_model_yaml_config(yaml_config_path):
with Path(yaml_config_path).open("r", encoding="utf-8") as f:
args = yaml.safe_load(f)
args = argparse.Namespace(**args)
remapped_config = {}
model_params = args.tts_conf["text2mel_params"]
for espnet_config_key, hf_config_key in CONFIG_MAPPING.items():
if espnet_config_key in model_params:
remapped_config[hf_config_key] = model_params[espnet_config_key]
return remapped_config, args.g2p, args.token_list
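`remap_model_yaml_config` only copies the keys of `tts_conf["text2mel_params"]` that appear in `CONFIG_MAPPING`, renaming them to the HF argument names, and additionally returns the `g2p` tokenizer name and the token list. A hedged sketch with a hand-written stand-in for an ESPnet config (the YAML content below is invented for illustration and far smaller than a real `espnet/fastspeech2_conformer` config; it assumes the function defined above is in scope):

```python
import tempfile

import yaml

# Invented, minimal stand-in for an ESPnet training config.
fake_espnet_config = {
    "tts_conf": {"text2mel_params": {"adim": 384, "aheads": 2, "elayers": 4, "eunits": 1536}},
    "g2p": "g2p_en_no_space",
    "token_list": ["<blank>", "<unk>", "AH0", "T", "<sos/eos>"],
}

with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    yaml.safe_dump(fake_espnet_config, f)
    yaml_path = f.name

remapped, g2p, token_list = remap_model_yaml_config(yaml_path)
print(remapped)
# {'hidden_size': 384, 'num_attention_heads': 2, 'encoder_layers': 4, 'encoder_linear_units': 1536}
```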
def convert_espnet_state_dict_to_hf(state_dict):
new_state_dict = {}
for key in state_dict:
if "tts.generator.text2mel." in key:
new_key = key.replace("tts.generator.text2mel.", "")
if "postnet" in key:
new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers")
new_key = new_key.replace(".0.weight", ".conv.weight")
new_key = new_key.replace(".1.weight", ".batch_norm.weight")
new_key = new_key.replace(".1.bias", ".batch_norm.bias")
new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean")
new_key = new_key.replace(".1.running_var", ".batch_norm.running_var")
new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked")
if "feat_out" in key:
if "weight" in key:
new_key = "speech_decoder_postnet.feat_out.weight"
if "bias" in key:
new_key = "speech_decoder_postnet.feat_out.bias"
if "encoder.embed.0.weight" in key:
new_key = new_key.replace("0.", "")
if "w_1" in key:
new_key = new_key.replace("w_1", "conv1")
if "w_2" in key:
new_key = new_key.replace("w_2", "conv2")
if "predictor.conv" in key:
new_key = new_key.replace(".conv", ".conv_layers")
pattern = r"(\d)\.(\d)"
replacement = (
r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm"
)
new_key = re.sub(pattern, replacement, new_key)
if "pitch_embed" in key or "energy_embed" in key:
new_key = new_key.replace("0", "conv")
if "encoders" in key:
new_key = new_key.replace("encoders", "conformer_layers")
new_key = new_key.replace("norm_final", "final_layer_norm")
new_key = new_key.replace("norm_mha", "self_attn_layer_norm")
new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm")
new_key = new_key.replace("norm_ff", "ff_layer_norm")
new_key = new_key.replace("norm_conv", "conv_layer_norm")
if "lid_emb" in key:
new_key = new_key.replace("lid_emb", "language_id_embedding")
if "sid_emb" in key:
new_key = new_key.replace("sid_emb", "speaker_id_embedding")
new_state_dict[new_key] = state_dict[key]
return new_state_dict
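A quick way to see what `convert_espnet_state_dict_to_hf` does is to run it on a couple of representative keys; the tensors are dummies and the two keys below were picked to exercise the `encoders` and `predictor.conv` branches (illustrative; assumes the function above is in scope):

```python
import torch

dummy = torch.zeros(1)
espnet_state_dict = {
    "tts.generator.text2mel.encoder.encoders.0.norm_mha.weight": dummy,
    "tts.generator.text2mel.duration_predictor.conv.0.0.weight": dummy,
}
converted = convert_espnet_state_dict_to_hf(espnet_state_dict)
print(list(converted))
# ['encoder.conformer_layers.0.self_attn_layer_norm.weight',
#  'duration_predictor.conv_layers.0.conv.weight']
```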
@torch.no_grad()
def convert_FastSpeech2ConformerModel_checkpoint(
checkpoint_path,
yaml_config_path,
pytorch_dump_folder_path,
repo_id=None,
):
model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path)
config = FastSpeech2ConformerConfig(**model_params)
model = FastSpeech2ConformerModel(config)
espnet_checkpoint = torch.load(checkpoint_path)
hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
model.load_state_dict(hf_compatible_state_dict)
model.save_pretrained(pytorch_dump_folder_path)
with TemporaryDirectory() as tempdir:
vocab = {token: id for id, token in enumerate(vocab)}
vocab_file = Path(tempdir) / "vocab.json"
with open(vocab_file, "w") as f:
json.dump(vocab, f)
should_strip_spaces = "no_space" in tokenizer_name
tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces)
tokenizer.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert")
parser.add_argument("--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub.")
args = parser.parse_args()
convert_FastSpeech2ConformerModel_checkpoint(
args.checkpoint_path,
args.yaml_config_path,
args.pytorch_dump_folder_path,
args.push_to_hub,
)
.\models\fastspeech2_conformer\convert_hifigan.py
"""将 FastSpeech2Conformer HiFi-GAN 的检查点转换为模型。"""
import argparse
from pathlib import Path
import torch
import yaml
from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
def load_weights(checkpoint, hf_model, config):
"""加载权重到模型中。
Args:
checkpoint (dict): 检查点中的权重字典
hf_model (FastSpeech2ConformerHifiGan): 需要加载权重的模型实例
config (FastSpeech2ConformerHifiGanConfig): 模型的配置信息
"""
vocoder_key_prefix = "tts.generator.vocoder."
checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k}
hf_model.apply_weight_norm()
hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"]
hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"]
hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"]
for i in range(len(config.upsample_rates)):
hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"]
hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"]
hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"]
for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)):
for j in range(len(config.resblock_dilation_sizes)):
hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"]
hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"]
hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"]
hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"]
hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"]
hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"]
hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"]
hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"]
hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"]
hf_model.remove_weight_norm()
def remap_hifigan_yaml_config(yaml_config_path):
"""重新映射 HiFi-GAN 的 YAML 配置。
Args:
yaml_config_path (str): YAML 配置文件的路径
"""
with Path(yaml_config_path).open("r", encoding="utf-8") as f:
args = yaml.safe_load(f)
args = argparse.Namespace(**args)
vocoder_type = args.tts_conf["vocoder_type"]
if vocoder_type != "hifigan_generator":
raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}")
remapped_dict = {}
vocoder_params = args.tts_conf["vocoder_params"]
key_mappings = {
"channels": "upsample_initial_channel",
"in_channels": "model_in_dim",
"resblock_dilations": "resblock_dilation_sizes",
"resblock_kernel_sizes": "resblock_kernel_sizes",
"upsample_kernel_sizes": "upsample_kernel_sizes",
"upsample_scales": "upsample_rates",
}
for espnet_config_key, hf_config_key in key_mappings.items():
remapped_dict[hf_config_key] = vocoder_params[espnet_config_key]
remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"]
remapped_dict["normalize_before"] = False
remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"]
return remapped_dict
@torch.no_grad()
def convert_hifigan_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
yaml_config_path=None,
repo_id=None,
):
if yaml_config_path is not None:
config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
else:
config = FastSpeech2ConformerHifiGanConfig()
model = FastSpeech2ConformerHifiGan(config)
orig_checkpoint = torch.load(checkpoint_path)
load_weights(orig_checkpoint, model, config)
model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_hifigan_checkpoint(
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.yaml_config_path,
args.push_to_hub,
)
.\models\fastspeech2_conformer\convert_model_with_hifigan.py
import argparse
import torch
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGan,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerModel,
FastSpeech2ConformerWithHifiGan,
FastSpeech2ConformerWithHifiGanConfig,
logging,
)
from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import (
convert_espnet_state_dict_to_hf,
remap_model_yaml_config,
)
from .convert_hifigan import load_weights, remap_hifigan_yaml_config
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
def convert_FastSpeech2ConformerWithHifiGan_checkpoint(
checkpoint_path,
yaml_config_path,
pytorch_dump_folder_path,
repo_id=None,
):
model_params, *_ = remap_model_yaml_config(yaml_config_path)
model_config = FastSpeech2ConformerConfig(**model_params)
model = FastSpeech2ConformerModel(model_config)
espnet_checkpoint = torch.load(checkpoint_path)
hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
model.load_state_dict(hf_compatible_state_dict)
config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
vocoder = FastSpeech2ConformerHifiGan(vocoder_config)
load_weights(espnet_checkpoint, vocoder, vocoder_config)
config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config)
with_hifigan_model = FastSpeech2ConformerWithHifiGan(config)
with_hifigan_model.model = model
with_hifigan_model.vocoder = vocoder
with_hifigan_model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
with_hifigan_model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Script for converting FastSpeech2Conformer with HifiGAN Model"
)
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path",
required=True,
default=None,
type=str,
help="Path to the output `FastSpeech2ConformerModel` PyTorch model.",
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_FastSpeech2ConformerWithHifiGan_checkpoint(
args.checkpoint_path,
args.yaml_config_path,
args.pytorch_dump_folder_path,
args.push_to_hub,
)
.\models\fastspeech2_conformer\modeling_fastspeech2_conformer.py
""" PyTorch FastSpeech2Conformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
from torch import nn
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, add_start_docstrings, logging, replace_return_docstrings
from .configuration_fastspeech2_conformer import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerWithHifiGanConfig,
)
logger = logging.get_logger(__name__)
FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"espnet/fastspeech2_conformer",
]
@dataclass
class FastSpeech2ConformerModelOutput(ModelOutput):
"""
Output type of [`FastSpeech2ConformerModel`].
"""
loss: Optional[torch.FloatTensor] = None
spectrogram: torch.FloatTensor = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
duration_outputs: torch.LongTensor = None
pitch_outputs: torch.FloatTensor = None
energy_outputs: torch.FloatTensor = None
@dataclass
class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
"""
Output type of [`FastSpeech2ConformerWithHifiGan`].
"""
waveform: torch.FloatTensor = None
_CONFIG_FOR_DOC = "FastSpeech2ConformerConfig"
FASTSPEECH2_CONFORMER_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerWithHifiGanConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
def length_regulator(encoded_embeddings, duration_labels, speaking_speed=1.0):
"""
Length regulator for feed-forward Transformer.
This is the length regulator module described in `FastSpeech: Fast, Robust and Controllable Text to Speech`
https://arxiv.org/pdf/1905.09263.pdf. The length regulator expands char or phoneme-level embedding features to
frame-level by repeating each feature based on the corresponding predicted durations.
Args:
encoded_embeddings (`torch.Tensor` of shape `(batch_size, max_text_length, embedding_dim)`):
Batch of sequences of char or phoneme embeddings.
duration_labels (`torch.LongTensor` of shape `(batch_size, time)`):
Batch of durations of each frame.
speaking_speed (`float`, *optional*, defaults to 1.0):
Value to control speed of speech.
Returns:
`torch.Tensor`:
Replicated input tensor based on durations (batch_size, time*, embedding_dim).
"""
if speaking_speed <= 0:
raise ValueError("`speaking_speed` must be greater than 0.")
elif speaking_speed != 1.0:
duration_labels = torch.round(duration_labels.float() * speaking_speed).long()
if duration_labels.sum() == 0:
duration_labels[duration_labels.sum(dim=1).eq(0)] = 1
max_len = torch.sum(duration_labels, dim=1).max()
hidden_states = torch.zeros(
(encoded_embeddings.size(0), max_len, encoded_embeddings.size(2)),
dtype=torch.float,
device=encoded_embeddings.device,
)
for i, (encoded_embedding, target_duration) in enumerate(zip(encoded_embeddings, duration_labels)):
repeated = torch.repeat_interleave(encoded_embedding, target_duration, dim=0)
hidden_states[i, : repeated.size(0)] = repeated
return hidden_states
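As a concrete illustration of the expansion, here is a toy call with three phoneme embeddings and durations `[1, 2, 0]`; the third phoneme is dropped and the second one repeated twice (assumes the `length_regulator` defined above is in scope):

```python
import torch

# (batch=1, max_text_length=3, embedding_dim=2): three toy phoneme embeddings
encoded = torch.tensor([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]])
durations = torch.tensor([[1, 2, 0]])  # phoneme 0 -> 1 frame, 1 -> 2 frames, 2 -> 0 frames

frames = length_regulator(encoded, durations)
print(frames)
# tensor([[[1., 1.],
#          [2., 2.],
#          [2., 2.]]])  -> shape (1, sum(durations), 2)
```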
class FastSpeech2ConformerDurationPredictor(nn.Module):
"""
Duration predictor module.
This is a module of duration predictor described in the paper 'FastSpeech: Fast, Robust and Controllable Text to
Speech' https://arxiv.org/pdf/1905.09263.pdf The duration predictor predicts a duration of each frame in log domain
from the hidden embeddings of encoder.
Note:
The calculation domain of outputs is different between in `forward` and in `inference`. In `forward`, the
outputs are calculated in log domain but in `inference`, those are calculated in linear domain.
"""
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__()
self.conv_layers = nn.ModuleList()
self.log_domain_offset = 1.0
for layer_idx in range(config.duration_predictor_layers):
num_chans = config.duration_predictor_channels
input_channels = config.hidden_size if layer_idx == 0 else num_chans
layer = FastSpeech2ConformerPredictorLayer(
input_channels,
num_chans,
config.duration_predictor_kernel_size,
config.duration_predictor_dropout_rate,
)
self.conv_layers.append(layer)
self.linear = nn.Linear(config.duration_predictor_channels, 1)
def forward(self, encoder_hidden_states):
"""
Args:
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
Batch of input sequences, where `input_dim` is the feature dimension at each time step.
Returns:
`torch.Tensor`: Batch of predicted durations in log domain, of shape `(batch_size, max_text_length)`.
"""
hidden_states = encoder_hidden_states.transpose(1, -1)
for layer in self.conv_layers:
hidden_states = layer(hidden_states)
hidden_states = self.linear(hidden_states.transpose(1, -1)).squeeze(-1)
if not self.training:
hidden_states = torch.clamp(torch.round(hidden_states.exp() - self.log_domain_offset), min=0).long()
return hidden_states
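At inference time the predictor converts its log-domain output back to integer frame counts via `clamp(round(exp(x) - offset), min=0)`. A tiny numeric sketch of that conversion (the values are made up):

```python
import torch

log_domain_offset = 1.0
log_durations = torch.tensor([-2.0, 0.0, 1.5])  # pretend raw predictor outputs (log domain)
durations = torch.clamp(torch.round(log_durations.exp() - log_domain_offset), min=0).long()
print(durations)  # tensor([0, 0, 3]); e.g. exp(1.5) - 1 ≈ 3.48, rounded to 3
```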
class FastSpeech2ConformerBatchNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
if layer_id == 0:
in_conv_dim = config.num_mel_bins
else:
in_conv_dim = config.speech_decoder_postnet_units
if layer_id == config.speech_decoder_postnet_layers - 1:
out_conv_dim = config.num_mel_bins
else:
out_conv_dim = config.speech_decoder_postnet_units
self.conv = nn.Conv1d(
in_conv_dim,
out_conv_dim,
kernel_size=config.speech_decoder_postnet_kernel,
stride=1,
padding=(config.speech_decoder_postnet_kernel - 1) // 2,
bias=False,
)
self.batch_norm = nn.BatchNorm1d(out_conv_dim)
if layer_id < config.speech_decoder_postnet_layers - 1:
self.activation = nn.Tanh()
else:
self.activation = None
self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.batch_norm(hidden_states)
if self.activation is not None:
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class FastSpeech2ConformerSpeechDecoderPostnet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
self.layers = nn.ModuleList(
[FastSpeech2ConformerBatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
)
def forward(self, hidden_states: torch.Tensor):
outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
layer_output = outputs_before_postnet.transpose(1, 2)
for layer in self.layers:
layer_output = layer(layer_output)
outputs_after_postnet = outputs_before_postnet + layer_output.transpose(1, 2)
return outputs_before_postnet, outputs_after_postnet
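Shape-wise, `feat_out` maps each decoder hidden state to `reduction_factor` mel frames and the `view` unfolds them along the time axis, while the convolutional postnet predicts a residual refinement of the same shape. A shape-only sketch with random inputs (illustrative; assumes `FastSpeech2ConformerConfig` and the postnet class above can be imported, e.g. from `transformers.models.fastspeech2_conformer.modeling_fastspeech2_conformer`):

```python
import torch

config = FastSpeech2ConformerConfig()  # hidden_size=384, num_mel_bins=80, reduction_factor=1
postnet = FastSpeech2ConformerSpeechDecoderPostnet(config)

decoder_hidden = torch.randn(2, 50, config.hidden_size)  # (batch, frames, hidden_size)
before, after = postnet(decoder_hidden)
print(before.shape, after.shape)
# torch.Size([2, 50, 80]) torch.Size([2, 50, 80]) with reduction_factor == 1
```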
class FastSpeech2ConformerPredictorLayer(nn.Module):
def __init__(self, input_channels, num_chans, kernel_size, dropout_rate):
super().__init__()
self.conv = nn.Conv1d(
input_channels,
num_chans,
kernel_size,
stride=1,
padding=(kernel_size - 1) // 2,
)
self.activation = nn.ReLU()
self.layer_norm = nn.LayerNorm(num_chans)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = hidden_states.transpose(1, -1)
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states.transpose(1, -1)
hidden_states = self.dropout(hidden_states)
return hidden_states
class FastSpeech2ConformerVariancePredictor(nn.Module):
def __init__(
self,
config: FastSpeech2ConformerConfig,
num_layers=2,
num_chans=384,
kernel_size=3,
dropout_rate=0.5,
):
"""
Initilize variance predictor module.
Args:
config (`FastSpeech2ConformerConfig`): Configuration object for the model.
num_layers (`int`, *optional*, defaults to 2): Number of convolutional layers.
num_chans (`int`, *optional*, defaults to 384): Number of channels of convolutional layers.
kernel_size (`int`, *optional*, defaults to 3): Kernel size of convolutional layers.
dropout_rate (`float`, *optional*, defaults to 0.5): Dropout rate.
"""
super().__init__()
self.conv_layers = nn.ModuleList()
for idx in range(num_layers):
input_channels = config.hidden_size if idx == 0 else num_chans
layer = FastSpeech2ConformerPredictorLayer(input_channels, num_chans, kernel_size, dropout_rate)
self.conv_layers.append(layer)
self.linear = nn.Linear(num_chans, 1)
def forward(self, encoder_hidden_states, padding_masks=None):
"""
Calculate forward propagation.
Args:
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
Batch of input sequences.
padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*):
Batch of masks indicating padded part.
Returns:
Tensor: Batch of predicted sequences `(batch_size, max_text_length, 1)`.
"""
hidden_states = encoder_hidden_states.transpose(1, -1)
for layer in self.conv_layers:
hidden_states = layer(hidden_states)
hidden_states = self.linear(hidden_states.transpose(1, 2))
if padding_masks is not None:
hidden_states = hidden_states.masked_fill(padding_masks, 0.0)
return hidden_states
class FastSpeech2ConformerVarianceEmbedding(nn.Module):
def __init__(
self,
in_channels=1,
out_channels=384,
kernel_size=1,
padding=0,
dropout_rate=0.0,
):
"""
Initialize variance embedding module.
Args:
in_channels (`int`, *optional*, defaults to 1): Number of input channels.
out_channels (`int`, *optional*, defaults to 384): Number of output channels.
kernel_size (`int`, *optional*, defaults to 1): Kernel size of the convolutional layer.
padding (`int`, *optional*, defaults to 0): Padding size of the convolutional layer.
dropout_rate (`float`, *optional*, defaults to 0.0): Dropout rate.
"""
super().__init__()
self.conv = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
padding=padding,
)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, hidden_states):
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.conv(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class FastSpeech2ConformerAttention(nn.Module):
"""
Multi-Head attention layer with relative position encoding. Details can be found in
"""
"""
https://github.com/espnet/espnet/pull/2816. Paper: https://arxiv.org/abs/1901.02860.
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""Construct an FastSpeech2ConformerAttention object."""
super().__init__()
self.num_heads = module_config["num_attention_heads"]
self.hidden_size = config.hidden_size
self.dim_key = self.hidden_size // self.num_heads
self.head_dim = self.hidden_size // self.num_heads
self.linear_q = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_k = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_v = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_out = nn.Linear(self.hidden_size, self.hidden_size)
self.dropout = nn.Dropout(p=module_config["attention_dropout_rate"])
self.linear_pos = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
def shift_relative_position_tensor(self, pos_tensor):
"""
Args:
pos_tensor (torch.Tensor of shape (batch_size, head, time1, 2*time1-1)): Input tensor.
"""
zero_pad = torch.zeros((*pos_tensor.size()[:3], 1), device=pos_tensor.device, dtype=pos_tensor.dtype)
pos_tensor_padded = torch.cat([zero_pad, pos_tensor], dim=-1)
pos_tensor_padded = pos_tensor_padded.view(*pos_tensor.size()[:2], pos_tensor.size(3) + 1, pos_tensor.size(2))
pos_tensor = pos_tensor_padded[:, :, 1:].view_as(pos_tensor)[:, :, :, : pos_tensor.size(-1) // 2 + 1]
return pos_tensor
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
pos_emb: Optional[torch.Tensor] = None,
output_attentions: Optional[torch.Tensor] = False,
):
...  # forward body not shown in this excerpt
class FastSpeech2ConformerConvolutionModule(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
super().__init__()
channels = config.hidden_size
kernel_size = module_config["kernel_size"]
self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=True)
self.depthwise_conv = nn.Conv1d(
channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=True
)
self.norm = nn.BatchNorm1d(channels)
self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=True)
def forward(self, hidden_states):
"""
Compute convolution module.
Args:
hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
Returns:
`torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
"""
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.pointwise_conv1(hidden_states)
hidden_states = nn.functional.glu(hidden_states, dim=1)
hidden_states = self.depthwise_conv(hidden_states)
hidden_states = self.norm(hidden_states)
hidden_states = hidden_states * torch.sigmoid(hidden_states)
hidden_states = self.pointwise_conv2(hidden_states)
return hidden_states.transpose(1, 2)
class FastSpeech2ConformerEncoderLayer(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
super().__init__()
self.self_attn = FastSpeech2ConformerAttention(config, module_config)
self.feed_forward = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
self.macaron_style = config.use_macaron_style_in_conformer
if self.macaron_style:
self.feed_forward_macaron = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
self.ff_macaron_layer_norm = nn.LayerNorm(config.hidden_size)
self.ff_scale = 0.5
else:
self.ff_scale = 1.0
self.use_cnn_module = config.use_cnn_in_conformer
if self.use_cnn_module:
self.conv_module = FastSpeech2ConformerConvolutionModule(config, module_config)
self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
self.final_layer_norm = nn.LayerNorm(config.hidden_size)
self.ff_layer_norm = nn.LayerNorm(config.hidden_size)
self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)
self.dropout = nn.Dropout(module_config["dropout_rate"])
self.size = config.hidden_size
self.normalize_before = module_config["normalize_before"]
self.concat_after = module_config["concat_after"]
if self.concat_after:
self.concat_linear = nn.Linear(config.hidden_size + config.hidden_size, config.hidden_size)
def forward(
self,
hidden_states: torch.Tensor,
pos_emb: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[torch.Tensor] = False,
):
...  # forward body not shown in this excerpt
class FastSpeech2ConformerMultiLayeredConv1d(nn.Module):
"""
Multi-layered conv1d for Transformer block.
This is a module of multi-layered conv1d designed to replace positionwise feed-forward network in Transformer
block, which is introduced in 'FastSpeech: Fast, Robust and Controllable Text to Speech'
https://arxiv.org/pdf/1905.09263.pdf
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""
Initialize FastSpeech2ConformerMultiLayeredConv1d module.
Args:
config (`FastSpeech2ConformerConfig`): Configuration object containing model parameters.
module_config (`dict`): Dictionary containing specific module configurations.
"""
super().__init__()
input_channels = config.hidden_size
hidden_channels = module_config["linear_units"]
kernel_size = config.positionwise_conv_kernel_size
self.conv1 = nn.Conv1d(input_channels, hidden_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
self.conv2 = nn.Conv1d(hidden_channels, input_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
self.dropout = nn.Dropout(module_config["dropout_rate"])
def forward(self, hidden_states):
"""
Perform forward propagation through the module.
Args:
hidden_states (torch.Tensor): Input tensor of shape (batch_size, time, input_channels).
Returns:
torch.Tensor: Output tensor of shape (batch_size, time, hidden_channels).
"""
hidden_states = hidden_states.transpose(-1, 1)
hidden_states = self.conv1(hidden_states)
hidden_states = torch.relu(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = hidden_states.transpose(-1, 1)
return hidden_states
class FastSpeech2ConformerRelPositionalEncoding(nn.Module):
"""
Relative positional encoding module (new implementation).
Args:
config (`FastSpeech2ConformerConfig`): Configuration object containing model parameters.
module_config (`dict`): Dictionary containing specific module configurations.
Details can be found in https://github.com/espnet/espnet/pull/2816. See : Appendix Batch in https://arxiv.org/abs/1901.02860
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""
Construct a FastSpeech2ConformerRelPositionalEncoding object.
Args:
config (`FastSpeech2ConformerConfig`): Configuration object containing model parameters.
module_config (`dict`): Dictionary containing specific module configurations.
"""
super().__init__()
self.embed_dim = config.hidden_size
self.input_scale = math.sqrt(self.embed_dim)
self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"])
self.pos_enc = None
self.max_len = 5000
self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len))
def extend_pos_enc(self, x):
"""Reset the positional encodings."""
if self.pos_enc is not None:
if self.pos_enc.size(1) >= x.size(1) * 2 - 1:
if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device:
self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device)
return
pos_enc_positive = torch.zeros(x.size(1), self.embed_dim)
pos_enc_negative = torch.zeros(x.size(1), self.embed_dim)
position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.embed_dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.embed_dim)
)
pos_enc_positive[:, 0::2] = torch.sin(position * div_term)
pos_enc_positive[:, 1::2] = torch.cos(position * div_term)
pos_enc_negative[:, 0::2] = torch.sin(-1 * position * div_term)
pos_enc_negative[:, 1::2] = torch.cos(-1 * position * div_term)
pos_enc_positive = torch.flip(pos_enc_positive, [0]).unsqueeze(0)
pos_enc_negative = pos_enc_negative[1:].unsqueeze(0)
pos_enc = torch.cat([pos_enc_positive, pos_enc_negative], dim=1)
self.pos_enc = pos_enc.to(device=x.device, dtype=x.dtype)
def forward(self, feature_representation):
"""
Args:
feature_representation (`torch.Tensor` of shape (batch_size, time, `*`)):
Input tensor.
Returns:
`torch.Tensor`: Encoded tensor (batch_size, time, `*`).
"""
self.extend_pos_enc(feature_representation)
hidden_states = feature_representation * self.input_scale
center_idx = self.pos_enc.size(1) // 2
pos_emb = self.pos_enc[:, center_idx - hidden_states.size(1) + 1 : center_idx + hidden_states.size(1)]
return self.dropout(hidden_states), self.dropout(pos_emb)
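The slice taken in `forward` is centered on the "distance 0" entry of the table and always spans `2 * time - 1` relative positions, which is what the shifted relative attention in `FastSpeech2ConformerAttention` consumes. The index arithmetic, using the module defaults:

```python
# Pure index arithmetic for the relative-position window; no model required.
max_len = 5000
pos_enc_len = 2 * max_len - 1      # positive part (max_len) + negative part (max_len - 1) = 9999
center_idx = pos_enc_len // 2      # 4999, the "relative distance 0" position

time = 4                           # length of the current input sequence
start, stop = center_idx - time + 1, center_idx + time
print(stop - start)                # 7 == 2 * time - 1 relative positions
```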
class FastSpeech2ConformerEncoder(nn.Module):
"""
FastSpeech2ConformerEncoder encoder module.
Args:
config (`FastSpeech2ConformerConfig`):
FastSpeech2ConformerConfig instance.
module_config (`dict`):
Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
use_encoder_input_layer (`bool`, *optional*, defaults to `False`):
Input layer type (whether to use an embedding layer for the encoder input).
"""
def __init__(
self,
config: FastSpeech2ConformerConfig,
module_config,
use_encoder_input_layer=False,
):
super().__init__()
self.embed = None
if use_encoder_input_layer:
self.embed = nn.Embedding(
num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, padding_idx=0
)
self.pos_enc = FastSpeech2ConformerRelPositionalEncoding(config, module_config)
self.conformer_layers = nn.ModuleList(
[FastSpeech2ConformerEncoderLayer(config, module_config) for _ in range(module_config["layers"])]
)
def forward(
self,
input_tensor: torch.LongTensor,
attention_mask: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = False,
return_dict: Optional[bool] = None,
):
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Returns:
`torch.Tensor`:
Output tensor of shape `(batch, time, attention_dim)`.
"""
feature_representation = input_tensor
if self.embed is not None:
feature_representation = self.embed(feature_representation)
hidden_states, pos_emb = self.pos_enc(feature_representation)
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for conformer_layer in self.conformer_layers:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = conformer_layer(hidden_states, pos_emb, attention_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
)
class FastSpeech2ConformerLoss(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__()
use_masking = config.use_masking
use_weighted_masking = config.use_weighted_masking
if use_masking and use_weighted_masking:
raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.duration_criterion = nn.MSELoss(reduction=reduction)
self.log_domain_offset = 1.0
def forward(
self,
outputs_after_postnet,
outputs_before_postnet,
duration_outputs,
pitch_outputs,
energy_outputs,
spectrogram_labels,
duration_labels,
pitch_labels,
energy_labels,
duration_mask,
spectrogram_mask,
):
...  # forward body not shown in this excerpt
class FastSpeech2ConformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = FastSpeech2ConformerConfig
base_model_prefix = "fastspeech2_conformer"
main_input_name = "input_ids"
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, (nn.LayerNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
key = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-key, b=key)
elif isinstance(module, nn.Embedding):
module.weight.data.normal_()
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, FastSpeech2ConformerAttention):
nn.init.xavier_uniform_(module.pos_bias_u)
nn.init.xavier_uniform_(module.pos_bias_v)
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, FastSpeech2ConformerEncoder):
module.gradient_checkpointing = value
@add_start_docstrings(
"""FastSpeech2Conformer Model.""",
FASTSPEECH2_CONFORMER_START_DOCSTRING,
)
class FastSpeech2ConformerModel(FastSpeech2ConformerPreTrainedModel):
"""
FastSpeech 2 module.
This is a module of FastSpeech 2 described in 'FastSpeech 2: Fast and High-Quality End-to-End Text to Speech'
https://arxiv.org/abs/2006.04558. Instead of quantized pitch and energy, we use token-averaged value introduced in
"""
FastPitch: Parallel Text-to-speech with Pitch Prediction. The encoder and decoder are Conformers instead of regular
Transformers.
"""
@replace_return_docstrings(output_type=FastSpeech2ConformerModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
spectrogram_labels: Optional[torch.FloatTensor] = None,
duration_labels: Optional[torch.LongTensor] = None,
pitch_labels: Optional[torch.FloatTensor] = None,
energy_labels: Optional[torch.FloatTensor] = None,
speaker_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
speaker_embedding: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
):
...  # forward body not shown in this excerpt
# Residual block copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
class HifiGanResidualBlock(nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
super().__init__()
self.leaky_relu_slope = leaky_relu_slope
# First set of convolutions, one per dilation rate
self.convs1 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=dilation[i],
padding=self.get_padding(kernel_size, dilation[i]),
)
for i in range(len(dilation))
]
)
# Second set of convolutions, all with dilation 1, using the same padding helper
self.convs2 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=self.get_padding(kernel_size, 1),
)
for _ in range(len(dilation))
]
)
# Compute the padding that keeps the sequence length unchanged
def get_padding(self, kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
# Apply weight normalization to all convolution layers
def apply_weight_norm(self):
for layer in self.convs1:
nn.utils.weight_norm(layer)
for layer in self.convs2:
nn.utils.weight_norm(layer)
# Remove weight normalization from all convolution layers
def remove_weight_norm(self):
for layer in self.convs1:
nn.utils.remove_weight_norm(layer)
for layer in self.convs2:
nn.utils.remove_weight_norm(layer)
# Forward pass
def forward(self, hidden_states):
for conv1, conv2 in zip(self.convs1, self.convs2):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)  # apply LeakyReLU activation
hidden_states = conv1(hidden_states)  # first (dilated) convolution
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)  # apply LeakyReLU again
hidden_states = conv2(hidden_states)  # second convolution (dilation 1)
hidden_states = hidden_states + residual  # add the residual connection
return hidden_states
# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5HifiGan with SpeechT5->FastSpeech2Conformer
@add_start_docstrings(
"""HiFi-GAN vocoder.""",
HIFIGAN_START_DOCSTRING,
)
class FastSpeech2ConformerHifiGan(PreTrainedModel):
config_class = FastSpeech2ConformerHifiGanConfig
main_input_name = "spectrogram"
def __init__(self, config: FastSpeech2ConformerHifiGanConfig):
# Call the parent initializer with the configuration
super().__init__(config)
# Number of residual-block kernel sizes and of upsampling stages
self.num_kernels = len(config.resblock_kernel_sizes)
self.num_upsamples = len(config.upsample_rates)
# Convolution that pre-processes the input spectrogram features
self.conv_pre = nn.Conv1d(
config.model_in_dim,
config.upsample_initial_channel,
kernel_size=7,
stride=1,
padding=3,
)
# Upsampling stack: one transposed convolution per (rate, kernel size) pair from the config
self.upsampler = nn.ModuleList()
for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
self.upsampler.append(
nn.ConvTranspose1d(
config.upsample_initial_channel // (2**i),
config.upsample_initial_channel // (2 ** (i + 1)),
kernel_size=kernel_size,
stride=upsample_rate,
padding=(kernel_size - upsample_rate) // 2,
)
)
# Residual blocks: one per (kernel size, dilation) pair, for every upsampling stage
self.resblocks = nn.ModuleList()
for i in range(len(self.upsampler)):
channels = config.upsample_initial_channel // (2 ** (i + 1))
for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
# Final convolution that projects the features down to a single waveform channel
self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
# Buffers holding the mean and scale used to normalize input spectrograms
self.register_buffer("mean", torch.zeros(config.model_in_dim))
self.register_buffer("scale", torch.ones(config.model_in_dim))
# Initialize weights and apply final processing
self.post_init()
def _init_weights(self, module):
"""初始化权重的方法."""
if isinstance(module, (nn.Linear, nn.Conv1d)):
# 对线性层和卷积层的权重进行正态分布初始化
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
# 如果存在偏置项,则将偏置项初始化为零
module.bias.data.zero_()
def apply_weight_norm(self):
# Apply weight normalization to the pre-convolution and every upsampling layer
nn.utils.weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.weight_norm(layer)
# Apply weight normalization inside every residual block
for layer in self.resblocks:
layer.apply_weight_norm()
# Apply weight normalization to the post-convolution
nn.utils.weight_norm(self.conv_post)
def remove_weight_norm(self):
# Remove weight normalization from the pre-convolution and every upsampling layer
nn.utils.remove_weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.remove_weight_norm(layer)
# Remove weight normalization from every residual block
for layer in self.resblocks:
layer.remove_weight_norm()
# Remove weight normalization from the post-convolution
nn.utils.remove_weight_norm(self.conv_post)
def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
r"""
Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
waveform.
Args:
spectrogram (`torch.FloatTensor`):
Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.
Returns:
`torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
"""
# Optionally normalize the input spectrogram before vocoding
if self.config.normalize_before:
spectrogram = (spectrogram - self.mean) / self.scale
# Detect whether the input is batched
is_batched = spectrogram.dim() == 3
if not is_batched:
# Add a batch dimension for un-batched inputs
spectrogram = spectrogram.unsqueeze(0)
# Move the feature dimension in front of time, as expected by Conv1d
hidden_states = spectrogram.transpose(2, 1)
# Pre-convolution
hidden_states = self.conv_pre(hidden_states)
# Upsampling stages
for i in range(self.num_upsamples):
# LeakyReLU activation
hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
# Transposed-convolution upsampling
hidden_states = self.upsampler[i](hidden_states)
# Average the outputs of the residual blocks attached to this stage
res_state = self.resblocks[i * self.num_kernels](hidden_states)
for j in range(1, self.num_kernels):
res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
hidden_states = res_state / self.num_kernels
# Final LeakyReLU activation
hidden_states = nn.functional.leaky_relu(hidden_states)
# Post-convolution
hidden_states = self.conv_post(hidden_states)
# Tanh squashes the output into [-1, 1]
hidden_states = torch.tanh(hidden_states)
if not is_batched:
# Remove the batch dimension and flatten into a 1-D waveform
waveform = hidden_states.squeeze(0).transpose(1, 0).view(-1)
else:
# Remove the channel dimension for batched outputs
waveform = hidden_states.squeeze(1)
return waveform
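A minimal sketch of the vocoder's forward pass with a randomly initialized model, assuming the default `FastSpeech2ConformerHifiGanConfig` values; it only checks shapes and does not produce intelligible audio:
```
import torch
from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig

config = FastSpeech2ConformerHifiGanConfig()          # default config; model_in_dim is the number of mel bins
vocoder = FastSpeech2ConformerHifiGan(config)          # randomly initialized, shape check only
spectrogram = torch.randn(50, config.model_in_dim)     # un-batched: (num_frames, model_in_dim)
waveform = vocoder(spectrogram)                        # 1-D tensor of audio samples
print(waveform.shape)
```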
# Adds a docstring describing FastSpeech2ConformerWithHifiGan as a text-to-speech model that generates waveforms with a FastSpeech2ConformerHifiGan vocoder head.
@add_start_docstrings(
"The FastSpeech2ConformerModel with a FastSpeech2ConformerHifiGan vocoder head that performs text-to-speech (waveform).",
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING,
)
class FastSpeech2ConformerWithHifiGan(PreTrainedModel):
# Composite configuration class for this model
config_class = FastSpeech2ConformerWithHifiGanConfig
def __init__(self, config: FastSpeech2ConformerWithHifiGanConfig):
super().__init__(config)
# Acoustic model built from config.model_config
self.model = FastSpeech2ConformerModel(config.model_config)
# Vocoder built from config.vocoder_config
self.vocoder = FastSpeech2ConformerHifiGan(config.vocoder_config)
self.config = config
# Document the forward return type as FastSpeech2ConformerWithHifiGanOutput
@replace_return_docstrings(
output_type=FastSpeech2ConformerWithHifiGanOutput, config_class=FastSpeech2ConformerWithHifiGanConfig
)
# Forward pass; all inputs are torch tensors and most are optional
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
spectrogram_labels: Optional[torch.FloatTensor] = None,
duration_labels: Optional[torch.LongTensor] = None,
pitch_labels: Optional[torch.FloatTensor] = None,
energy_labels: Optional[torch.FloatTensor] = None,
speaker_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
speaker_embedding: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
# Any additional keyword arguments are passed through as needed
**kwargs,
):
# The actual forward logic is omitted here; see the full implementation in the source file
pass
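A hedged end-to-end sketch using the checkpoints referenced in the archive maps above; it requires network access and the `g2p-en` package, and assumes the output class exposes a `waveform` field as its name suggests:
```
from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGan

tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")

inputs = tokenizer("Hello, my dog is cute.", return_tensors="pt")
outputs = model(input_ids=inputs["input_ids"], return_dict=True)
waveform = outputs.waveform   # (batch_size, num_samples), ready to be written to a .wav file
```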
.\models\fastspeech2_conformer\tokenization_fastspeech2_conformer.py
"""
Tokenization classes for FastSpeech2Conformer.
"""
import json
import os
from typing import Optional, Tuple
import regex
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging, requires_backends
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"espnet/fastspeech2_conformer": "https://huggingface.co/espnet/fastspeech2_conformer/raw/main/vocab.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"espnet/fastspeech2_conformer": 4096,
}
class FastSpeech2ConformerTokenizer(PreTrainedTokenizer):
"""
Construct a FastSpeech2Conformer tokenizer.
Args:
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
The begin of sequence token. Note that for FastSpeech2, it is the same as the `eos_token`.
eos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
The end of sequence token. Note that for FastSpeech2, it is the same as the `bos_token`.
pad_token (`str`, *optional*, defaults to `"<blank>"`):
The token used for padding, for example when batching sequences of different lengths.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
should_strip_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to strip the spaces from the list of tokens.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
bos_token="<sos/eos>",
eos_token="<sos/eos>",
pad_token="<blank>",
unk_token="<unk>",
should_strip_spaces=False,
**kwargs,
):
requires_backends(self, "g2p_en")
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
import g2p_en
self.g2p = g2p_en.G2p()
self.decoder = {v: k for k, v in self.encoder.items()}
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
should_strip_spaces=should_strip_spaces,
**kwargs,
)
self.should_strip_spaces = should_strip_spaces
@property
def vocab_size(self):
return len(self.decoder)
def get_vocab(self):
"Returns vocab as a dict"
return dict(self.encoder, **self.added_tokens_encoder)
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
text = regex.sub(";", ",", text)
text = regex.sub(":", ",", text)
text = regex.sub("-", " ", text)
text = regex.sub("&", "and", text)
text = regex.sub(r"[\(\)\[\]\<\>\"]+", "", text)
text = regex.sub(r"\s+", " ", text)
text = text.upper()
return text, kwargs
def _tokenize(self, text):
"""Returns a tokenized string."""
tokens = self.g2p(text)
if self.should_strip_spaces:
tokens = list(filter(lambda s: s != " ", tokens))
tokens.append(self.eos_token)
return tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def decode(self, token_ids, **kwargs):
logger.warning(
"Phonemes cannot be reliably converted to a string due to the one-many mapping, converting to tokens instead."
)
return self.convert_ids_to_tokens(token_ids)
def convert_tokens_to_string(self, tokens, **kwargs):
logger.warning(
"Phonemes cannot be reliably converted to a string due to the one-many mapping, returning the tokens."
)
return tokens
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.get_vocab(), ensure_ascii=False))
return (vocab_file,)
def __getstate__(self):
state = self.__dict__.copy()
state["g2p"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import g2p_en
self.g2p = g2p_en.G2p()
except ImportError:
raise ImportError(
"You need to install g2p-en to use FastSpeech2ConformerTokenizer. "
"See https://pypi.org/project/g2p-en/ for installation."
)
.\models\fastspeech2_conformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_fastspeech2_conformer": [
"FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP",
"FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP",
"FastSpeech2ConformerConfig",
"FastSpeech2ConformerHifiGanConfig",
"FastSpeech2ConformerWithHifiGanConfig",
],
"tokenization_fastspeech2_conformer": ["FastSpeech2ConformerTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_fastspeech2_conformer"] = [
"FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"FastSpeech2ConformerWithHifiGan",
"FastSpeech2ConformerHifiGan",
"FastSpeech2ConformerModel",
"FastSpeech2ConformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_fastspeech2_conformer import (
FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP,
FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP,
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerWithHifiGanConfig,
)
from .tokenization_fastspeech2_conformer import FastSpeech2ConformerTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_fastspeech2_conformer import (
FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
FastSpeech2ConformerHifiGan,
FastSpeech2ConformerModel,
FastSpeech2ConformerPreTrainedModel,
FastSpeech2ConformerWithHifiGan,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
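As with the other `__init__.py` files in this series, `_LazyModule` defers the heavy imports until an attribute is first accessed; a quick way to see this:
```
import transformers.models.fastspeech2_conformer as fs2c

# The first attribute access imports configuration_fastspeech2_conformer behind the scenes
print(fs2c.FastSpeech2ConformerConfig)
# The tokenizer class resolves too; g2p_en is only needed once it is instantiated
print(fs2c.FastSpeech2ConformerTokenizer)
```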
.\models\flaubert\configuration_flaubert.py
""" Flaubert configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"flaubert/flaubert_small_cased": "https://huggingface.co/flaubert/flaubert_small_cased/resolve/main/config.json",
"flaubert/flaubert_base_uncased": "https://huggingface.co/flaubert/flaubert_base_uncased/resolve/main/config.json",
"flaubert/flaubert_base_cased": "https://huggingface.co/flaubert/flaubert_base_cased/resolve/main/config.json",
"flaubert/flaubert_large_cased": "https://huggingface.co/flaubert/flaubert_large_cased/resolve/main/config.json",
}
class FlaubertConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`FlaubertModel`] or a [`TFFlaubertModel`]. It is
used to instantiate a FlauBERT model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the FlauBERT
[flaubert/flaubert_base_uncased](https://huggingface.co/flaubert/flaubert_base_uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "flaubert"
attribute_map = {
"hidden_size": "emb_dim",
"num_attention_heads": "n_heads",
"num_hidden_layers": "n_layers",
"n_words": "vocab_size",
}
def __init__(
self,
pre_norm=False,
layerdrop=0.0,
vocab_size=30145,
emb_dim=2048,
n_layers=12,
n_heads=16,
dropout=0.1,
attention_dropout=0.1,
gelu_activation=True,
sinusoidal_embeddings=False,
causal=False,
asm=False,
n_langs=1,
use_lang_emb=True,
max_position_embeddings=512,
embed_init_std=2048**-0.5,
layer_norm_eps=1e-12,
init_std=0.02,
bos_index=0,
eos_index=1,
pad_index=2,
unk_index=3,
mask_index=5,
is_encoder=True,
summary_type="first",
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
start_n_top=5,
end_n_top=5,
mask_token_id=0,
lang_id=0,
pad_token_id=2,
bos_token_id=0,
**kwargs,
):
"""Constructs FlaubertConfig."""
self.pre_norm = pre_norm
self.layerdrop = layerdrop
self.vocab_size = vocab_size
self.emb_dim = emb_dim
self.n_layers = n_layers
self.n_heads = n_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.gelu_activation = gelu_activation
self.sinusoidal_embeddings = sinusoidal_embeddings
self.causal = causal
self.asm = asm
self.n_langs = n_langs
self.use_lang_emb = use_lang_emb
self.layer_norm_eps = layer_norm_eps
self.bos_index = bos_index
self.eos_index = eos_index
self.pad_index = pad_index
self.unk_index = unk_index
self.mask_index = mask_index
self.is_encoder = is_encoder
self.max_position_embeddings = max_position_embeddings
self.embed_init_std = embed_init_std
self.init_std = init_std
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_proj_to_labels = summary_proj_to_labels
self.summary_first_dropout = summary_first_dropout
self.start_n_top = start_n_top
self.end_n_top = end_n_top
self.mask_token_id = mask_token_id
self.lang_id = lang_id
if "n_words" in kwargs:
self.n_words = kwargs["n_words"]
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
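The `attribute_map` above lets the generic configuration names resolve to the Flaubert-specific ones; a short check:
```
from transformers import FlaubertConfig

config = FlaubertConfig(emb_dim=512, n_layers=6, n_heads=8)
print(config.hidden_size)          # 512, resolved to emb_dim through attribute_map
print(config.num_hidden_layers)    # 6, resolved to n_layers
print(config.num_attention_heads)  # 8, resolved to n_heads
```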
class FlaubertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
.\models\flaubert\modeling_flaubert.py
""" PyTorch Flaubert model, based on XLM."""
import itertools
import math
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import gelu
from ...modeling_outputs import (
BaseModelOutput,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_flaubert import FlaubertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased"
_CONFIG_FOR_DOC = "FlaubertConfig"
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"flaubert/flaubert_small_cased",
"flaubert/flaubert_base_uncased",
"flaubert/flaubert_base_cased",
"flaubert/flaubert_large_cased",
]
def create_sinusoidal_embeddings(n_pos, dim, out):
position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
out.requires_grad = False
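A small sketch of how `create_sinusoidal_embeddings` fills a position table in place (the sizes are arbitrary):
```
import torch

out = torch.empty(512, 64)
create_sinusoidal_embeddings(n_pos=512, dim=64, out=out)
print(out[0, :4])          # position 0: tensor([0., 1., 0., 1.]) - sin terms are 0, cos terms are 1
print(out.requires_grad)   # False, the table is fixed
```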
def get_masks(slen, lengths, causal, padding_mask=None):
"""
Generate hidden states mask, and optionally an attention mask.
"""
alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
if padding_mask is not None:
mask = padding_mask
else:
assert lengths.max().item() <= slen
mask = alen < lengths[:, None]
bs = lengths.size(0)
if causal:
attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
else:
attn_mask = mask
assert mask.size() == (bs, slen)
assert causal is False or attn_mask.size() == (bs, slen, slen)
return mask, attn_mask
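A quick look at the shapes `get_masks` produces for a padded batch of two sequences:
```
import torch

lengths = torch.tensor([5, 3])
mask, attn_mask = get_masks(slen=5, lengths=lengths, causal=True)
print(mask.shape)       # torch.Size([2, 5]), True where the position holds a real token
print(attn_mask.shape)  # torch.Size([2, 5, 5]), lower-triangular causal pattern per sequence
print(mask[1])          # tensor([ True,  True,  True, False, False])
```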
class MultiHeadAttention(nn.Module):
NEW_ID = itertools.count()
def __init__(self, n_heads, dim, config):
super().__init__()
self.layer_id = next(MultiHeadAttention.NEW_ID)
self.dim = dim
self.n_heads = n_heads
self.dropout = config.attention_dropout
assert self.dim % self.n_heads == 0
self.q_lin = nn.Linear(dim, dim)
self.k_lin = nn.Linear(dim, dim)
self.v_lin = nn.Linear(dim, dim)
self.out_lin = nn.Linear(dim, dim)
self.pruned_heads = set()
def prune_heads(self, heads):
attention_head_size = self.dim // self.n_heads
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads)
self.q_lin = prune_linear_layer(self.q_lin, index)
self.k_lin = prune_linear_layer(self.k_lin, index)
self.v_lin = prune_linear_layer(self.v_lin, index)
self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.dim = attention_head_size * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, input, mask, kv=None, cache=None, head_mask=None, output_attentions=False):
"""
Self-attention (if kv is None) or attention over source sentence (provided by kv).
"""
bs, qlen, dim = input.size()
if kv is None:
klen = qlen if cache is None else cache["slen"] + qlen
else:
klen = kv.size(1)
n_heads = self.n_heads
dim_per_head = self.dim // n_heads
mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
def shape(x):
"""对输入张量进行线性投影"""
return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
def unshape(x):
"""计算上下文信息"""
return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
q = shape(self.q_lin(input))
if kv is None:
k = shape(self.k_lin(input))
v = shape(self.v_lin(input))
elif cache is None or self.layer_id not in cache:
k = v = kv
k = shape(self.k_lin(k))
v = shape(self.v_lin(v))
if cache is not None:
if self.layer_id in cache:
if kv is None:
k_, v_ = cache[self.layer_id]
k = torch.cat([k_, k], dim=2)
v = torch.cat([v_, v], dim=2)
else:
k, v = cache[self.layer_id]
cache[self.layer_id] = (k, v)
q = q / math.sqrt(dim_per_head)
scores = torch.matmul(q, k.transpose(2, 3))
mask = (mask == 0).view(mask_reshape).expand_as(scores)
scores.masked_fill_(mask, torch.finfo(scores.dtype).min)
weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
weights = nn.functional.dropout(weights, p=self.dropout, training=self.training)
if head_mask is not None:
weights = weights * head_mask
context = torch.matmul(weights, v)
context = unshape(context)
outputs = (self.out_lin(context),)
if output_attentions:
outputs = outputs + (weights,)
return outputs
class TransformerFFN(nn.Module):
def __init__(self, in_dim, dim_hidden, out_dim, config):
super().__init__()
self.dropout = config.dropout
self.lin1 = nn.Linear(in_dim, dim_hidden)
self.lin2 = nn.Linear(dim_hidden, out_dim)
self.act = gelu if config.gelu_activation else nn.functional.relu
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
def forward(self, input):
return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
def ff_chunk(self, input):
x = self.lin1(input)
x = self.act(x)
x = self.lin2(x)
x = nn.functional.dropout(x, p=self.dropout, training=self.training)
return x
FLAUBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
FLAUBERT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
FLAUBERT_START_DOCSTRING,
)
class FlaubertPredLayer(nn.Module):
"""
Prediction layer (cross_entropy or adaptive_softmax).
"""
def __init__(self, config):
super().__init__()
self.asm = config.asm
self.n_words = config.n_words
self.pad_index = config.pad_index
dim = config.emb_dim
if config.asm is False:
self.proj = nn.Linear(dim, config.n_words, bias=True)
else:
self.proj = nn.AdaptiveLogSoftmaxWithLoss(
in_features=dim,
n_classes=config.n_words,
cutoffs=config.asm_cutoffs,
div_value=config.asm_div_value,
head_bias=True,
)
def forward(self, x, y=None):
"""Compute the loss, and optionally the scores."""
outputs = ()
if self.asm is False:
scores = self.proj(x)
outputs = (scores,) + outputs
if y is not None:
loss = nn.functional.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="mean")
outputs = (loss,) + outputs
else:
scores = self.proj.log_prob(x)
outputs = (scores,) + outputs
if y is not None:
_, loss = self.proj(x, y)
outputs = (loss,) + outputs
return outputs
class FlaubertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = FlaubertConfig
load_tf_weights = None
base_model_prefix = "transformer"
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
@property
def dummy_inputs(self):
inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
if self.config.use_lang_emb and self.config.n_langs > 1:
langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
else:
langs_list = None
return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
def _init_weights(self, module):
"""初始化模型的权重。"""
if isinstance(module, nn.Embedding):
if self.config is not None and self.config.embed_init_std is not None:
nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
if isinstance(module, nn.Linear):
if self.config is not None and self.config.init_std is not None:
nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
if module.bias is not None:
nn.init.constant_(module.bias, 0.0)
if isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class FlaubertModel(FlaubertPreTrainedModel):
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads)
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
lengths: Optional[torch.LongTensor] = None,
cache: Optional[Dict[str, torch.FloatTensor]] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
FLAUBERT_START_DOCSTRING,
)
class FlaubertWithLMHeadModel(FlaubertPreTrainedModel):
_tied_weights_keys = ["pred_layer.proj.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.pred_layer = FlaubertPredLayer(config)
self.post_init()
def get_output_embeddings(self):
return self.pred_layer.proj
def set_output_embeddings(self, new_embeddings):
self.pred_layer.proj = new_embeddings
def prepare_inputs_for_generation(self, input_ids, **kwargs):
mask_token_id = self.config.mask_token_id
lang_id = self.config.lang_id
effective_batch_size = input_ids.shape[0]
mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
input_ids = torch.cat([input_ids, mask_token], dim=1)
if lang_id is not None:
langs = torch.full_like(input_ids, lang_id)
else:
langs = None
return {"input_ids": input_ids, "langs": langs}
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<special1>",
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
outputs = self.pred_layer(output, labels)
if not return_dict:
return outputs + transformer_outputs[1:]
return MaskedLMOutput(
loss=outputs[0] if labels is not None else None,
logits=outputs[0] if labels is None else outputs[1],
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
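A hedged usage sketch for the LM head model above, predicting the `<special1>` mask token referenced in the docstring decorator (requires downloading the `flaubert/flaubert_base_cased` checkpoint and the tokenizer's `sacremoses` dependency):
```
from transformers import FlaubertTokenizer, FlaubertWithLMHeadModel

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = FlaubertWithLMHeadModel.from_pretrained("flaubert/flaubert_base_cased")

inputs = tokenizer("Le camembert est <special1>.", return_tensors="pt")
logits = model(**inputs).logits
mask_positions = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_id = logits[0, mask_positions[0]].argmax(-1).item()
print(tokenizer.decode([predicted_id]))
```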
@add_start_docstrings(
"""
Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
e.g. for GLUE tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class FlaubertForSequenceClassification(FlaubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.transformer = FlaubertModel(config)
self.sequence_summary = SequenceSummary(config)
self.post_init()
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
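A tiny randomly initialized configuration is enough to exercise the `problem_type` branch above; the sizes below are arbitrary and chosen only to keep the model small:
```
import torch
from transformers import FlaubertConfig, FlaubertForSequenceClassification

tiny = FlaubertConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=2, num_labels=3)
model = FlaubertForSequenceClassification(tiny)   # randomly initialized
out = model(input_ids=torch.tensor([[0, 7, 6, 1]]), labels=torch.tensor([2]))
print(tiny.problem_type)   # "single_label_classification", inferred from the integer labels
print(out.loss)            # cross-entropy loss over the 3 classes
```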
@add_start_docstrings(
"""
Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class FlaubertForTokenClassification(FlaubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = FlaubertModel(config)
self.dropout = nn.Dropout(config.dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAUBERT_START_DOCSTRING,
)
class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks like
SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAUBERT_START_DOCSTRING,
)
@dataclass
class FlaubertForQuestionAnsweringOutput(ModelOutput):
"""
Base class for outputs of question answering models using a `SquadHead`.
"""
class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
分类损失,作为开始标记、结束标记(如果提供的话还包括is_impossible)分类损失的总和。
start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
开始标记的前config.start_n_top个可能性的对数概率(beam-search)。
start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
开始标记的前config.start_n_top个可能性的索引(beam-search)。
end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
结束标记的前config.start_n_top * config.end_n_top个可能性的对数概率(beam-search)。
end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
结束标记的前config.start_n_top * config.end_n_top个可能性的索引(beam-search)。
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
答案的`is_impossible`标签的对数概率。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态的元组,包括每层的输出和初始嵌入的输出,形状为 `(batch_size, sequence_length, hidden_size)`。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重的元组,每层一个,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,用于计算自注意力头中的加权平均值。
class FlaubertForQuestionAnswering(FlaubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.qa_outputs = SQuADHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=FlaubertForQuestionAnsweringOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
is_impossible: Optional[torch.Tensor] = None,
cls_index: Optional[torch.Tensor] = None,
p_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass for FlaubertForQuestionAnswering model.
Args:
input_ids (torch.Tensor, optional): Indices of input sequence tokens.
attention_mask (torch.Tensor, optional): Mask to avoid performing attention on padding tokens.
langs (torch.Tensor, optional): Language IDs for multi-lingual models like XLM.
token_type_ids (torch.Tensor, optional): Segment token indices to indicate first and second portions of the inputs.
position_ids (torch.Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings.
lengths (torch.Tensor, optional): Lengths of each sequence to handle padded inputs.
cache (Dict[str, torch.Tensor], optional): Dictionary with precomputed hidden states.
head_mask (torch.Tensor, optional): Mask to nullify selected heads of the self-attention modules.
inputs_embeds (torch.Tensor, optional): Embedded representation of the inputs.
start_positions (torch.Tensor, optional): Start position for the answer span.
end_positions (torch.Tensor, optional): End position for the answer span.
is_impossible (torch.Tensor, optional): Whether the question has no possible answer.
cls_index (torch.Tensor, optional): Position of the classification token in the input sequence.
p_mask (torch.Tensor, optional): Mask of tokens which can't be in the answer.
output_attentions (bool, optional): Whether to output attentions weights.
output_hidden_states (bool, optional): Whether to output all hidden-states.
return_dict (bool, optional): Whether to return a single dictionary instead of a tuple of outputs.
Returns:
Union[FlaubertForQuestionAnsweringOutput, Tuple[torch.Tensor]]:
Depending on `return_dict`, either a dictionary with main outputs or a tuple of outputs.
"""
pass
class FlaubertForMultipleChoice(FlaubertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = FlaubertModel(config)
self.sequence_summary = SequenceSummary(config)
self.logits_proj = nn.Linear(config.num_labels, 1)
self.post_init()
@add_start_docstrings_to_model_forward(
FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass for FlaubertForMultipleChoice model.
Args:
input_ids (torch.Tensor, optional): Indices of input sequence tokens.
attention_mask (torch.Tensor, optional): Mask to avoid performing attention on padding tokens.
langs (torch.Tensor, optional): Language IDs for multi-lingual models like XLM.
token_type_ids (torch.Tensor, optional): Segment token indices to indicate first and second portions of the inputs.
position_ids (torch.Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings.
lengths (torch.Tensor, optional): Lengths of each sequence to handle padded inputs.
cache (Dict[str, torch.Tensor], optional): Dictionary with precomputed hidden states.
head_mask (torch.Tensor, optional): Mask to nullify selected heads of the self-attention modules.
inputs_embeds (torch.Tensor, optional): Embedded representation of the inputs.
labels (torch.Tensor, optional): Labels for computing the multiple choice classification loss.
output_attentions (bool, optional): Whether to output attentions weights.
output_hidden_states (bool, optional): Whether to output all hidden-states.
return_dict (bool, optional): Whether to return a single dictionary instead of a tuple of outputs.
Returns:
Union[MultipleChoiceModelOutput, Tuple[torch.Tensor]]:
Depending on `return_dict`, either a dictionary with main outputs or a tuple of outputs.
"""
pass
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
langs = langs.view(-1, langs.size(-1)) if langs is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
if lengths is not None:
logger.warning(
"The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
"attention mask instead."
)
lengths = None
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
.\models\flaubert\modeling_tf_flaubert.py
import itertools
import random
import warnings
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFSequenceSummary,
TFSharedEmbeddings,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import (
check_embeddings_within_bounds,
shape_list,
stable_softmax,
)
from ...utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_flaubert import FlaubertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased"
_CONFIG_FOR_DOC = "FlaubertConfig"
TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
FLAUBERT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
</Tip>
Parameters:
config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
FLAUBERT_INPUTS_DOCSTRING = r"""
"""
def get_masks(slen, lengths, causal, padding_mask=None):
"""
Generate hidden states mask, and optionally an attention mask.
"""
bs = shape_list(lengths)[0]
if padding_mask is not None:
mask = padding_mask
else:
alen = tf.range(slen, dtype=lengths.dtype)
mask = alen < tf.expand_dims(lengths, axis=1)
if causal:
attn_mask = tf.less_equal(
tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
)
else:
attn_mask = mask
tf.debugging.assert_equal(shape_list(mask), [bs, slen])
if causal:
tf.debugging.assert_equal(shape_list(attn_mask), [bs, slen, slen])
return mask, attn_mask
class TFFlaubertPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = FlaubertConfig
base_model_prefix = "transformer"
@property
def dummy_inputs(self):
inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]], dtype=tf.int32)
attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32)
if self.config.use_lang_emb and self.config.n_langs > 1:
return {
"input_ids": inputs_list,
"attention_mask": attns_list,
"langs": tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32),
}
else:
return {"input_ids": inputs_list, "attention_mask": attns_list}
@add_start_docstrings(
"The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertModel(TFFlaubertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: np.ndarray | tf.Tensor | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutput]:
outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
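The TensorFlow counterpart mirrors the PyTorch usage; a hedged sketch (requires TensorFlow, and `from_pt=True` may be needed if only PyTorch weights are published for the checkpoint):
```
from transformers import FlaubertTokenizer, TFFlaubertModel

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertModel.from_pretrained("flaubert/flaubert_base_cased")  # add from_pt=True if no TF weights exist
inputs = tokenizer("Bonjour le monde !", return_tensors="tf")
outputs = model(inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, emb_dim)
```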
class TFFlaubertMultiHeadAttention(keras.layers.Layer):
NEW_ID = itertools.count()
def __init__(self, n_heads, dim, config, **kwargs):
super().__init__(**kwargs)
self.layer_id = next(TFFlaubertMultiHeadAttention.NEW_ID)
self.dim = dim
self.n_heads = n_heads
self.output_attentions = config.output_attentions
assert self.dim % self.n_heads == 0
self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin")
self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin")
self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
self.dropout = keras.layers.Dropout(config.attention_dropout)
self.pruned_heads = set()
self.dim = dim
def prune_heads(self, heads):
raise NotImplementedError
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_lin", None) is not None:
with tf.name_scope(self.q_lin.name):
self.q_lin.build([None, None, self.dim])
if getattr(self, "k_lin", None) is not None:
with tf.name_scope(self.k_lin.name):
self.k_lin.build([None, None, self.dim])
if getattr(self, "v_lin", None) is not None:
with tf.name_scope(self.v_lin.name):
self.v_lin.build([None, None, self.dim])
if getattr(self, "out_lin", None) is not None:
with tf.name_scope(self.out_lin.name):
self.out_lin.build([None, None, self.dim])
class TFFlaubertTransformerFFN(keras.layers.Layer):
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
super().__init__(**kwargs)
self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu")
self.dropout = keras.layers.Dropout(config.dropout)
self.in_dim = in_dim
self.dim_hidden = dim_hidden
def call(self, input, training=False):
x = self.lin1(input)
x = self.act(x)
x = self.lin2(x)
x = self.dropout(x, training=training)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.in_dim])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.dim_hidden])
@keras_serializable
class TFFlaubertMainLayer(keras.layers.Layer):
config_class = FlaubertConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.n_heads = config.n_heads
self.n_langs = config.n_langs
self.dim = config.emb_dim
self.hidden_dim = self.dim * 4
self.n_words = config.n_words
self.pad_index = config.pad_index
self.causal = config.causal
self.n_layers = config.n_layers
self.use_lang_emb = config.use_lang_emb
self.layerdrop = getattr(config, "layerdrop", 0.0)
self.pre_norm = getattr(config, "pre_norm", False)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.max_position_embeddings = config.max_position_embeddings
self.embed_init_std = config.embed_init_std
self.dropout = keras.layers.Dropout(config.dropout)
self.embeddings = TFSharedEmbeddings(
self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings"
)
self.layer_norm_emb = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb")
self.attentions = []
self.layer_norm1 = []
self.ffns = []
self.layer_norm2 = []
for i in range(self.n_layers):
self.attentions.append(
TFFlaubertMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}")
)
self.layer_norm1.append(
keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}")
)
self.ffns.append(
TFFlaubertTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}")
)
self.layer_norm2.append(
keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}")
)
def build(self, input_shape=None):
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.dim],
initializer=get_initializer(self.embed_init_std),
)
if self.n_langs > 1 and self.use_lang_emb:
with tf.name_scope("lang_embeddings"):
self.lang_embeddings = self.add_weight(
name="embeddings",
shape=[self.n_langs, self.dim],
initializer=get_initializer(self.embed_init_std),
)
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "layer_norm_emb", None) is not None:
with tf.name_scope(self.layer_norm_emb.name):
self.layer_norm_emb.build([None, None, self.dim])
for layer in self.attentions:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm1:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
for layer in self.ffns:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm2:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, value):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
@unpack_inputs
def call(
self,
input_ids: np.ndarray | tf.Tensor | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutput]:
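The body of `TFFlaubertMainLayer.call` is not reproduced in this excerpt. As a hedged sketch of how the position (and optional language) embedding tables created in `build()` are typically combined with the token embeddings in XLM/Flaubert-style models, before layer norm, dropout and the attention blocks (an approximation, not the verbatim implementation):
```
import tensorflow as tf

def combine_embeddings(token_emb, position_table, position_ids, lang_table=None, lang_ids=None):
    # token_emb: (batch, seq_len, dim); position_table: (max_position_embeddings, dim)
    hidden = token_emb + tf.gather(position_table, position_ids)
    if lang_table is not None and lang_ids is not None:
        hidden = hidden + tf.gather(lang_table, lang_ids)
    return hidden  # layer norm and dropout would be applied next
```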
class TFFlaubertPredLayer(keras.layers.Layer):
"""
Prediction layer (cross_entropy or adaptive_softmax).
"""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.asm = config.asm
self.n_words = config.n_words
self.pad_index = config.pad_index
if config.asm is False:
self.input_embeddings = input_embeddings
else:
raise NotImplementedError
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def get_output_embeddings(self):
return self.input_embeddings
def set_output_embeddings(self, value):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
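The prediction layer ties the output projection to the input embedding matrix: `mode="linear"` multiplies the hidden states by the transposed embedding table, and a per-token bias is added. A hedged sketch of that computation with made-up shapes:
```
import tensorflow as tf

hidden = tf.random.normal((2, 5, 16))           # (batch, seq_len, dim)
embedding_matrix = tf.random.normal((100, 16))  # (n_words, dim), shared with the input embeddings
bias = tf.zeros((100,))
logits = tf.matmul(hidden, embedding_matrix, transpose_b=True) + bias
print(logits.shape)                             # (2, 5, 100)
```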
@dataclass
class TFFlaubertWithLMHeadModelOutput(ModelOutput):
"""
Base class for [`TFFlaubertWithLMHeadModel`] outputs.
Args:
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
"""
The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
"""
@add_start_docstrings(
"""
The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertWithLMHeadModel(TFFlaubertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.pred_layer = TFFlaubertPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
self.supports_xla_generation = False
def get_lm_head(self):
return self.pred_layer
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.pred_layer.name
def prepare_inputs_for_generation(self, inputs, **kwargs):
mask_token_id = self.config.mask_token_id
lang_id = self.config.lang_id
effective_batch_size = inputs.shape[0]
mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id
inputs = tf.concat([inputs, mask_token], axis=1)
if lang_id is not None:
langs = tf.ones_like(inputs) * lang_id
else:
langs = None
return {"input_ids": inputs, "langs": langs}
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFFlaubertWithLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: np.ndarray | tf.Tensor | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFFlaubertWithLMHeadModelOutput]:
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
outputs = self.pred_layer(output)
if not return_dict:
return (outputs,) + transformer_outputs[1:]
return TFFlaubertWithLMHeadModelOutput(
logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "pred_layer", None) is not None:
with tf.name_scope(self.pred_layer.name):
self.pred_layer.build(None)
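A hedged usage sketch of the LM-head model defined above. The checkpoint name is an assumption (any Flaubert checkpoint on the Hub should work) and loading it requires network access:
```
from transformers import FlaubertTokenizer, TFFlaubertWithLMHeadModel

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertWithLMHeadModel.from_pretrained("flaubert/flaubert_base_cased")  # add from_pt=True if only PyTorch weights exist

inputs = tokenizer("Le chat mange une pomme.", return_tensors="tf")
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)
```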
@add_start_docstrings(
"""
Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
e.g. for GLUE tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForSequenceClassification(TFFlaubertPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
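A hedged sketch of sequence classification with the class above; the checkpoint name and `num_labels=2` are assumptions, and the classification head is freshly initialized, so the logits are only illustrative:
```
import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertForSequenceClassification

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertForSequenceClassification.from_pretrained("flaubert/flaubert_base_cased", num_labels=2)

inputs = tokenizer("Ce film est excellent.", return_tensors="tf")
outputs = model(**inputs, labels=tf.constant([1]))
print(outputs.loss, outputs.logits.shape)  # scalar loss, (1, 2)
```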
@add_start_docstrings(
"""
Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForQuestionAnsweringSimple(TFFlaubertPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = transformer_outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
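A hedged sketch of extractive QA with the span logits produced above. The base checkpoint is not fine-tuned for SQuAD-style QA, so the decoded span is only illustrative:
```
import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertForQuestionAnsweringSimple

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertForQuestionAnsweringSimple.from_pretrained("flaubert/flaubert_base_cased")

inputs = tokenizer("Qui a écrit Les Misérables ?", "Victor Hugo a écrit Les Misérables.", return_tensors="tf")
outputs = model(**inputs)
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))
```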
@add_start_docstrings(
"""
Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForTokenClassification(TFFlaubertPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.dropout = keras.layers.Dropout(config.dropout)
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = transformer_outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
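A hedged sketch of token classification (NER-style tagging) with the class above; `num_labels=5` is an arbitrary assumption and the head is untrained:
```
import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertForTokenClassification

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertForTokenClassification.from_pretrained("flaubert/flaubert_base_cased", num_labels=5)

inputs = tokenizer("Emmanuel Macron visite Lyon.", return_tensors="tf")
logits = model(**inputs).logits
print(logits.shape)                       # (1, sequence_length, 5)
predictions = tf.argmax(logits, axis=-1)  # one label id per token
```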
@add_start_docstrings(
"""
Flaubert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForMultipleChoice(TFFlaubertPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
self.logits_proj = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
self.config = config
@property
def dummy_inputs(self):
"""
Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
"""
if self.config.use_lang_emb and self.config.n_langs > 1:
return {
"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
"langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
}
else:
return {
"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
}
@unpack_inputs
@add_start_docstrings_to_model_forward(
FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None
flat_inputs_embeds = (
tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
if lengths is not None:
logger.warning(
"The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
"attention mask instead.",
)
lengths = None
transformer_outputs = self.transformer(
flat_input_ids,
flat_attention_mask,
flat_langs,
flat_token_type_ids,
flat_position_ids,
lengths,
cache,
head_mask,
flat_inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "logits_proj", None) is not None:
with tf.name_scope(self.logits_proj.name):
self.logits_proj.build([None, None, self.config.num_labels])
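A hedged sketch of the multiple-choice input layout expected above: inputs are shaped `(batch_size, num_choices, seq_len)`, which is why the call flattens them before running the transformer. The checkpoint name and the toy prompt/choices are assumptions:
```
import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertForMultipleChoice

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertForMultipleChoice.from_pretrained("flaubert/flaubert_base_cased")

prompt = "Le chat est monté"
choices = ["sur le toit.", "dans la mer."]
enc = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)
inputs = {k: tf.expand_dims(v, 0) for k, v in enc.items()}  # add the num_choices axis
print(model(**inputs).logits.shape)  # (1, 2): one score per choice
```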
.\models\flaubert\tokenization_flaubert.py
def convert_to_unicode(text):
"""
Converts `text` to Unicode (if it's not already), assuming UTF-8 input.
"""
def ensure_text(s, encoding="utf-8", errors="strict"):
if isinstance(s, bytes):
return s.decode(encoding, errors)
elif isinstance(s, str):
return s
else:
raise TypeError(f"not expecting type '{type(s)}'")
return ensure_text(text, encoding="utf-8", errors="ignore")
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
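For instance, calling the helper above on a word that has already been split into symbols:
```
print(get_pairs(("l", "o", "w", "</w>")))
# {('l', 'o'), ('o', 'w'), ('w', '</w>')}  (set order may vary)
```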
def replace_unicode_punct(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
"""
text = text.replace(",", ",")
text = re.sub(r"。\s*", ". ", text)
text = text.replace("、", ",")
text = text.replace("”", '"')
text = text.replace("“", '"')
text = text.replace("∶", ":")
text = text.replace(":", ":")
text = text.replace("?", "?")
text = text.replace("《", '"')
text = text.replace("》", '"')
text = text.replace(")", ")")
text = text.replace("!", "!")
text = text.replace("(", "(")
text = text.replace(";", ";")
text = text.replace("1", "1")
text = text.replace("」", '"')
text = text.replace("「", '"')
text = text.replace("0", "0")
text = text.replace("3", "3")
text = text.replace("2", "2")
text = text.replace("5", "5")
text = text.replace("6", "6")
text = text.replace("9", "9")
text = text.replace("7", "7")
text = text.replace("8", "8")
text = text.replace("4", "4")
text = re.sub(r".\s*", ". ", text)
text = text.replace("~", "~")
text = text.replace("’", "'")
text = text.replace("…", "...")
text = text.replace("━", "-")
text = text.replace("〈", "<")
text = text.replace("〉", ">")
text = text.replace("【", "[")
text = text.replace("】", "]")
text = text.replace("%", "%")
return text
def remove_non_printing_char(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
"""
output = []
for char in text:
cat = unicodedata.category(char)
if cat.startswith("C"):
continue
output.append(char)
return "".join(output)
class FlaubertTokenizer(PreTrainedTokenizer):
"""
Construct a Flaubert tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
- Moses preprocessing and tokenization.
- Normalizing all inputs text.
- The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like
"__classify__") to a vocabulary.
- The argument `do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Vocabulary file. # path to the vocabulary file
merges_file (`str`):
Merges file. # path to the BPE merges file
do_lowercase (`bool`, *optional*, defaults to `False`):
Controls lower casing. # boolean controlling whether the input is lowercased; defaults to False
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. # token used for words missing from the vocabulary; defaults to "<unk>"
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"</s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"<special1>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (`List[str]`, *optional*, defaults to `['<special0>', '<special1>', '<special2>', '<special3>', '<special4>', '<special5>', '<special6>', '<special7>', '<special8>', '<special9>']`):
List of additional special tokens.
lang2id (`Dict[str, int]`, *optional*):
Dictionary mapping languages string identifiers to their IDs.
id2lang (`Dict[int, str]`, *optional*):
Dictionary mapping language IDs to their string identifiers.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
do_lowercase=False,
unk_token="<unk>",
bos_token="<s>",
sep_token="</s>",
pad_token="<pad>",
cls_token="</s>",
mask_token="<special1>",
additional_special_tokens=[
"<special0>",
"<special1>",
"<special2>",
"<special3>",
"<special4>",
"<special5>",
"<special6>",
"<special7>",
"<special8>",
"<special9>",
],
lang2id=None,
id2lang=None,
**kwargs,
):
do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None)
if do_lowercase_and_remove_accent is not None:
logger.warning(
"`do_lowercase_and_remove_accent` is passed as a keyword argument, but this won't do anything."
" `FlaubertTokenizer` will always set it to `False`."
)
self.do_lowercase_and_remove_accent = False
self.do_lowercase = do_lowercase
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use FlaubertTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses
self.cache_moses_punct_normalizer = {}
self.cache_moses_tokenizer = {}
self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
self.lang2id = lang2id
self.id2lang = id2lang
if lang2id is not None and id2lang is not None:
assert len(lang2id) == len(id2lang)
self.ja_word_tokenizer = None
self.zh_word_tokenizer = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
lang2id=lang2id,
id2lang=id2lang,
**kwargs,
)
@property
def do_lower_case(self):
return self.do_lowercase_and_remove_accent
def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
self.cache_moses_punct_normalizer[lang] = punct_normalizer
else:
punct_normalizer = self.cache_moses_punct_normalizer[lang]
return punct_normalizer.normalize(text)
def moses_tokenize(self, text, lang):
if lang not in self.cache_moses_tokenizer:
moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
self.cache_moses_tokenizer[lang] = moses_tokenizer
else:
moses_tokenizer = self.cache_moses_tokenizer[lang]
return moses_tokenizer.tokenize(text, return_str=False, escape=False)
def moses_pipeline(self, text, lang):
text = replace_unicode_punct(text)
text = self.moses_punct_norm(text, lang)
text = remove_non_printing_char(text)
return text
def ja_tokenize(self, text):
if self.ja_word_tokenizer is None:
try:
import Mykytea
self.ja_word_tokenizer = Mykytea.Mykytea(
f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
)
except (AttributeError, ImportError):
logger.error(
"Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
" (https://github.com/chezou/Mykytea-python) with the following steps"
)
logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
logger.error("2. autoreconf -i")
logger.error("3. ./configure --prefix=$HOME/local")
logger.error("4. make && make install")
logger.error("5. pip install kytea")
raise
return list(self.ja_word_tokenizer.getWS(text))
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + "</w>",)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
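A minimal standalone sketch of the greedy merge loop implemented by `bpe` above, using a hypothetical two-rule merge table rather than the real Flaubert merges file:
```
def toy_bpe(token, bpe_ranks):
    word = tuple(token[:-1]) + (token[-1] + "</w>",)
    while True:
        pairs = {(word[i], word[i + 1]) for i in range(len(word) - 1)}
        ranked = [p for p in pairs if p in bpe_ranks]
        if not ranked:
            break
        first, second = min(ranked, key=bpe_ranks.get)
        merged, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
                merged.append(first + second)  # apply the best-ranked merge
                i += 2
            else:
                merged.append(word[i])
                i += 1
        word = tuple(merged)
    return " ".join(word)

print(toy_bpe("low", {("l", "o"): 0, ("lo", "w</w>"): 1}))  # low</w>
```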
def preprocess_text(self, text):
text = text.replace("``", '"').replace("''", '"')
text = convert_to_unicode(text)
text = unicodedata.normalize("NFC", text)
if self.do_lowercase:
text = text.lower()
return text
def _tokenize(self, text, bypass_tokenizer=False):
"""
Tokenize a string given language code using Moses.
Details of tokenization:
- [sacremoses](https://github.com/alvations/sacremoses): port of Moses
- Install with `pip install sacremoses`
Args:
- bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
(bool). If True, we only apply BPE.
Returns:
List of tokens.
"""
lang = "fr"
if lang and self.lang2id and lang not in self.lang2id:
logger.error(
"Supplied language code not found in lang2id mapping. Please check that your language is supported by"
" the loaded pretrained model."
)
if bypass_tokenizer:
text = text.split()
else:
text = self.preprocess_text(text)
text = self.moses_pipeline(text, lang=lang)
text = self.moses_tokenize(text, lang=lang)
split_tokens = []
for token in text:
if token:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
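A hedged end-to-end sketch of `_tokenize`: Moses preprocessing followed by BPE, producing subwords that end in "</w>" at word boundaries. It requires `pip install sacremoses`, and the checkpoint name is an assumption:
```
from transformers import FlaubertTokenizer

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
print(tokenizer.tokenize("Bonjour, comment allez-vous ?"))
ids = tokenizer("Bonjour !")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))  # "<s>" ... "</s>" wrapped around the subwords
```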
def _convert_token_to_id(self, token):
"""Converts a token (str) into an id using the vocabulary."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings) into a single string."""
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. An XLM sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens added.
"""
bos = [self.bos_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return bos + token_ids_0 + sep
return bos + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
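A worked example tying the three special-token helpers above together, with placeholder IDs (assume `bos_token_id == 0` and `sep_token_id == cls_token_id == 1` purely for readability):
```
# With token_ids_0 = [5, 6] and token_ids_1 = [7, 8]:
# build_inputs_with_special_tokens([5, 6], [7, 8])      -> [0, 5, 6, 1, 7, 8, 1]
# get_special_tokens_mask([5, 6], [7, 8])               -> [1, 0, 0, 1, 0, 0, 1]
# create_token_type_ids_from_sequences([5, 6], [7, 8])  -> [0, 0, 0, 0, 1, 1, 1]
```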
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def __getstate__(self):
state = self.__dict__.copy()
state["sm"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses