Transformers 源码解析（一百二十二）

`.\models\wav2vec2\__init__.py`

# 导入必要的模块和函数
from typing import TYPE_CHECKING
# 从内部模块中导入异常类和延迟加载模块
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_torch_available,
)

# Maps each submodule name to the public names it exports; this dict is handed to `_LazyModule`
# at the bottom of the module.
_import_structure = {
    "configuration_wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config"],
    "feature_extraction_wav2vec2": ["Wav2Vec2FeatureExtractor"],
    "processing_wav2vec2": ["Wav2Vec2Processor"],
    "tokenization_wav2vec2": ["Wav2Vec2CTCTokenizer", "Wav2Vec2Tokenizer"],
}

# Register the PyTorch models only when torch is available; otherwise skip them silently.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the PyTorch model classes.
    _import_structure["modeling_wav2vec2"] = [
        "WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Wav2Vec2ForAudioFrameClassification",
        "Wav2Vec2ForCTC",
        "Wav2Vec2ForMaskedLM",
        "Wav2Vec2ForPreTraining",
        "Wav2Vec2ForSequenceClassification",
        "Wav2Vec2ForXVector",
        "Wav2Vec2Model",
        "Wav2Vec2PreTrainedModel",
    ]

# Register the TensorFlow models only when TensorFlow is available; otherwise skip them silently.
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # TensorFlow is available: expose the TF model classes.
    _import_structure["modeling_tf_wav2vec2"] = [
        "TF_WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFWav2Vec2ForCTC",
        "TFWav2Vec2Model",
        "TFWav2Vec2PreTrainedModel",
        "TFWav2Vec2ForSequenceClassification",
    ]

# Register the Flax models only when Flax is available; otherwise skip them silently.
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Flax is available: expose the Flax model classes.
    _import_structure["modeling_flax_wav2vec2"] = [
        "FlaxWav2Vec2ForCTC",
        "FlaxWav2Vec2ForPreTraining",
        "FlaxWav2Vec2Model",
        "FlaxWav2Vec2PreTrainedModel",
    ]

# Under static type checking, import every public name eagerly so checkers can resolve them.
# At runtime, replace this module with a `_LazyModule` built from `_import_structure` instead.
if TYPE_CHECKING:
    from .configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config
    from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
    from .processing_wav2vec2 import Wav2Vec2Processor
    from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2Tokenizer

    # PyTorch classes: only imported when torch is available.
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_wav2vec2 import (
            WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST,
            Wav2Vec2ForAudioFrameClassification,
            Wav2Vec2ForCTC,
            Wav2Vec2ForMaskedLM,
            Wav2Vec2ForPreTraining,
            Wav2Vec2ForSequenceClassification,
            Wav2Vec2ForXVector,
            Wav2Vec2Model,
            Wav2Vec2PreTrainedModel,
        )

    # TensorFlow classes: only imported when TensorFlow is available.
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_wav2vec2 import (
            TF_WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFWav2Vec2ForCTC,
            TFWav2Vec2ForSequenceClassification,
            TFWav2Vec2Model,
            TFWav2Vec2PreTrainedModel,
        )

    # Flax classes: only imported when Flax is available.
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # The Flax classes are registered under "modeling_flax_wav2vec2" in `_import_structure`,
        # so they must be imported from that module (not from the TensorFlow one).
        from .modeling_flax_wav2vec2 import (
            FlaxWav2Vec2ForCTC,
            FlaxWav2Vec2ForPreTraining,
            FlaxWav2Vec2Model,
            FlaxWav2Vec2PreTrainedModel,
        )
else:
    import sys

    # Swap this module's entry in `sys.modules` for a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\wav2vec2_bert\configuration_wav2vec2_bert.py`

# coding=utf-8
# Copyright 2024 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Wav2Vec2Bert model configuration"""


# 导入预训练配置类 PretrainedConfig 和日志工具 logging
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Module-level logger obtained from the library's logging helper.
logger = logging.get_logger(__name__)

# Maps each pretrained model name to the URL of its `config.json`.
WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/w2v-bert-2.0": "https://huggingface.co/facebook/w2v-bert-2.0/resolve/main/config.json",
}

# Configuration class for Wav2Vec2-BERT models; inherits from PretrainedConfig.
class Wav2Vec2BertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Wav2Vec2BertModel`]. It is used to
    instantiate an Wav2Vec2Bert model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2Bert
    [facebook/wav2vec2-bert-rel-pos-large](https://huggingface.co/facebook/wav2vec2-bert-rel-pos-large)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```
    >>> from transformers import Wav2Vec2BertConfig, Wav2Vec2BertModel

    >>> # Initializing a Wav2Vec2Bert facebook/wav2vec2-bert-rel-pos-large style configuration
    >>> configuration = Wav2Vec2BertConfig()

    >>> # Initializing a model (with random weights) from the facebook/wav2vec2-bert-rel-pos-large style configuration
    >>> model = Wav2Vec2BertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "wav2vec2-bert"

    def __init__(
        self,
        vocab_size=None,  # vocabulary size
        hidden_size=1024,  # hidden size
        num_hidden_layers=24,  # number of hidden layers
        num_attention_heads=16,  # number of attention heads
        intermediate_size=4096,  # size of the intermediate (feed-forward) layer
        feature_projection_input_dim=160,  # input dimension of the feature projection
        hidden_act="swish",  # hidden activation function
        hidden_dropout=0.0,  # dropout probability for hidden layers
        activation_dropout=0.0,  # dropout probability applied after the activation
        attention_dropout=0.0,  # dropout probability for attention
        feat_proj_dropout=0.0,  # dropout probability for the feature projection
        final_dropout=0.1,  # dropout probability for the final output
        layerdrop=0.1,  # LayerDrop probability
        initializer_range=0.02,  # weight initializer range
        layer_norm_eps=1e-5,  # epsilon used by layer normalization
        apply_spec_augment=True,  # whether to apply SpecAugment
        mask_time_prob=0.05,  # time-masking probability
        mask_time_length=10,  # time-mask span length
        mask_time_min_masks=2,  # minimum number of time masks
        mask_feature_prob=0.0,  # feature-masking probability
        mask_feature_length=10,  # feature-mask span length
        mask_feature_min_masks=0,  # minimum number of feature masks
        ctc_loss_reduction="sum",  # reduction mode for the CTC loss
        ctc_zero_infinity=False,  # whether infinite CTC losses are zeroed
        use_weighted_layer_sum=False,  # whether to use a weighted sum of layer outputs
        classifier_proj_size=768,  # classifier projection size
        tdnn_dim=(512, 512, 512, 512, 1500),  # TDNN layer dimensions
        tdnn_kernel=(5, 3, 3, 1, 1),  # TDNN kernel sizes
        tdnn_dilation=(1, 2, 3, 1, 1),  # TDNN dilation factors
        xvector_output_dim=512,  # x-vector output dimension
        pad_token_id=0,  # padding token id
        bos_token_id=1,  # beginning-of-sequence token id
        eos_token_id=2,  # end-of-sequence token id
        add_adapter=False,  # whether to add adapter layers
        adapter_kernel_size=3,  # adapter convolution kernel size
        adapter_stride=2,  # adapter convolution stride
        num_adapter_layers=1,  # number of adapter layers
        adapter_act="relu",  # adapter activation function
        use_intermediate_ffn_before_adapter=False,  # whether to add a feed-forward block before the adapter
        output_hidden_size=None,  # output hidden size; falls back to `hidden_size` when None
        position_embeddings_type="relative_key",  # type of position embeddings
        rotary_embedding_base=10000,  # base used by rotary embeddings
        max_source_positions=5000,  # maximum number of source positions
        left_max_position_embeddings=64,  # maximum left-side relative positions
        right_max_position_embeddings=8,  # maximum right-side relative positions
        conv_depthwise_kernel_size=31,  # depthwise convolution kernel size
        conformer_conv_dropout=0.1,  # dropout probability of the Conformer convolution module
        **kwargs,  # forwarded to PretrainedConfig
    ):
        # NOTE(review): the body of `__init__` was missing from this listing (a SyntaxError).
        # It is restored here from the upstream implementation as recalled: every argument is
        # stored as a same-named attribute, with two validations. Verify against upstream
        # `configuration_wav2vec2_bert.py` before relying on the validation rules.
        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.num_attention_heads = num_attention_heads
        self.feature_projection_input_dim = feature_projection_input_dim
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.feat_proj_dropout = feat_proj_dropout
        self.final_dropout = final_dropout
        self.layerdrop = layerdrop
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.vocab_size = vocab_size
        self.use_weighted_layer_sum = use_weighted_layer_sum
        self.max_source_positions = max_source_positions

        # Positional-embedding settings; reject unknown embedding types early.
        valid_position_types = ("rotary", "relative", "relative_key")
        if position_embeddings_type is not None and position_embeddings_type not in valid_position_types:
            raise ValueError(
                "`position_embeddings_type` is not valid. It must be one of the following values: "
                '`["rotary", "relative", "relative_key"]` or left as `None`.'
            )
        self.position_embeddings_type = position_embeddings_type
        self.rotary_embedding_base = rotary_embedding_base
        self.left_max_position_embeddings = left_max_position_embeddings
        self.right_max_position_embeddings = right_max_position_embeddings

        # Conformer convolution module settings.
        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
        self.conformer_conv_dropout = conformer_conv_dropout

        # SpecAugment settings.
        self.apply_spec_augment = apply_spec_augment
        self.mask_time_prob = mask_time_prob
        self.mask_time_length = mask_time_length
        self.mask_time_min_masks = mask_time_min_masks
        self.mask_feature_prob = mask_feature_prob
        self.mask_feature_length = mask_feature_length
        self.mask_feature_min_masks = mask_feature_min_masks

        # CTC loss settings.
        self.ctc_loss_reduction = ctc_loss_reduction
        self.ctc_zero_infinity = ctc_zero_infinity

        # Adapter settings.
        self.add_adapter = add_adapter
        self.adapter_kernel_size = adapter_kernel_size
        self.adapter_stride = adapter_stride
        self.num_adapter_layers = num_adapter_layers
        self.adapter_act = adapter_act
        self.output_hidden_size = output_hidden_size if output_hidden_size is not None else hidden_size
        if use_intermediate_ffn_before_adapter and not add_adapter:
            raise ValueError("`use_intermediate_ffn_before_adapter` is `True` but `add_adapter` is `False`.")
        self.use_intermediate_ffn_before_adapter = use_intermediate_ffn_before_adapter

        # Sequence-classification settings.
        self.classifier_proj_size = classifier_proj_size

        # X-vector settings; tuples are stored as lists.
        self.tdnn_dim = list(tdnn_dim)
        self.tdnn_kernel = list(tdnn_kernel)
        self.tdnn_dilation = list(tdnn_dilation)
        self.xvector_output_dim = xvector_output_dim

    @property
    def inputs_to_logits_ratio(self):
        """
        Ratio between inputs and logits: `feature_projection_input_dim * 2`, additionally multiplied by
        `adapter_stride ** num_adapter_layers` when `add_adapter` is enabled.
        """
        ratio = self.feature_projection_input_dim * 2
        if self.add_adapter:
            ratio = ratio * (self.adapter_stride**self.num_adapter_layers)
        return ratio

`.\models\wav2vec2_bert\convert_wav2vec2_seamless_checkpoint.py`

def param_count(model):
    """Return the total element count of `model`'s named parameters, skipping names containing "final_proj"."""
    total = 0
    for name, parameter in model.named_parameters():
        if "final_proj" in name:
            continue
        total += parameter.numel()
    return total


def _convert_model(
    original_model,
    hf_model,
    convert_list,
):
    """
    Rename the keys of `original_model`'s state dict and load the result into `hf_model`.

    Args:
        original_model: model whose `state_dict()` provides the source weights.
        hf_model: target model that receives the renamed weights.
        convert_list: iterable of `(old_layer_name, new_layer_name)` substring replacements,
            applied in order to every key.

    Returns:
        `hf_model`, loaded with the converted weights and switched to eval mode.

    Raises:
        ValueError: if the converted state dict has keys `hf_model` does not expect
            (keys containing "num_updates" are ignored), or lacks keys it does expect.
    """
    state_dict = original_model.state_dict()

    # Iterate over a snapshot of the keys, since entries are popped and re-inserted along the way.
    for original_key in list(state_dict):
        renamed_key = original_key
        for old_layer_name, new_layer_name in convert_list:
            renamed_key = renamed_key.replace(old_layer_name, new_layer_name)

        # A ".layer_norm" directly preceded by a digit is renamed to "final_layer_norm".
        if ".layer_norm" in renamed_key and renamed_key.split(".layer_norm")[0][-1].isnumeric():
            renamed_key = renamed_key.replace("layer_norm", "final_layer_norm")

        # Drop the entry when its new name matches any fragment in `keys_to_remove`;
        # otherwise store it back under the new name.
        tensor = state_dict.pop(original_key)
        if not any(fragment in renamed_key for fragment in keys_to_remove):
            state_dict[renamed_key] = tensor

    expected_keys = set(hf_model.state_dict().keys())
    converted_keys = set(state_dict.keys())

    # Unexpected keys, ignoring those that contain "num_updates".
    extra_keys = {key for key in converted_keys - expected_keys if "num_updates" not in key}
    missing_keys = expected_keys - converted_keys

    if extra_keys:
        raise ValueError(f"extra keys found: {extra_keys}")
    if missing_keys:
        raise ValueError(f"missing keys: {missing_keys}")

    hf_model.load_state_dict(state_dict, strict=True)
    n_params = param_count(hf_model)

    logger.info(f"model loaded: {round(n_params/1e6,1)}M params")

    hf_model.eval()
    # Drop the local reference to the converted state dict.
    del state_dict

    return hf_model
# Gradients are disabled for the whole conversion.
@torch.no_grad()
def convert_wav2vec2_bert_checkpoint(
    checkpoint_path,
    pytorch_dump_folder_path,
    config_path=None,
    repo_id=None,
    audio_path=None,
):
    """
    Copy/paste/tweak model's weights to transformers design.

    Args:
        checkpoint_path: checkpoint identifier/path passed to `load_conformer_shaw_model`.
        pytorch_dump_folder_path: folder where the converted model and feature extractor are saved.
        config_path: optional path to a config to load with `Wav2Vec2BertConfig.from_pretrained`;
            when omitted, a default config with `apply_spec_augment=False` is used.
        repo_id: optional Hub repo id; when given, the model and feature extractor are pushed as PRs.
        audio_path: optional audio file; when given, the original and converted models are run on it
            and their outputs compared. This was previously read from the global `args`, which only
            exists when the file runs as a script, so calling the function from an import raised
            `NameError`.

    Raises:
        AssertionError: if `audio_path` is given and the two models' outputs differ beyond tolerance.
    """
    if config_path is not None:
        config = Wav2Vec2BertConfig.from_pretrained(config_path, hidden_act="swish")
    else:
        config = Wav2Vec2BertConfig(apply_spec_augment=False)

    hf_wav2vec = Wav2Vec2BertModel(config)

    # Load the original model in float32 and put it in eval mode.
    model = load_conformer_shaw_model(checkpoint_path, dtype=torch.float32)
    model.eval()

    # Transfer the weights using the module-level renaming table.
    hf_wav2vec = _convert_model(model, hf_wav2vec, wav2vec_convert_list)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)

    if repo_id:
        hf_wav2vec.push_to_hub(repo_id, create_pr=True)

    # Save a matching feature extractor (padding value 1) alongside the model.
    fe = SeamlessM4TFeatureExtractor(padding_value=1)
    fe._set_processor_class("Wav2Vec2BertProcessor")
    fe.save_pretrained(pytorch_dump_folder_path)

    if repo_id:
        fe.push_to_hub(repo_id, create_pr=True)

    # Optional sanity check: both models must produce close outputs on the given audio file.
    if audio_path:
        waveform, sample_rate = torchaudio.load(audio_path)
        # Resample to the feature extractor's sampling rate.
        waveform = torchaudio.functional.resample(waveform, sample_rate, fe.sampling_rate)

        # Original pipeline: waveform -> fbank features -> padded batch.
        fbank_converter = WaveformToFbankConverter(
            num_mel_bins=80,
            waveform_scale=2**15,
            channel_last=True,
            standardize=True,
            dtype=torch.float32,
        )
        collater = Collater(pad_value=1)

        decoded_audio = {"waveform": waveform.T, "sample_rate": fe.sampling_rate, "format": -1}
        src = collater(fbank_converter(decoded_audio))["fbank"]
        seqs, padding_mask = get_seqs_and_padding_mask(src)

        # Run the original model's frontend and encoder.
        with torch.inference_mode():
            seqs, padding_mask = model.encoder_frontend(seqs, padding_mask)
            original_output, padding_mask = model.encoder(seqs, padding_mask)

        hf_wav2vec.eval()

        # Run the converted model on features produced by the HF feature extractor.
        inputs = fe(waveform, return_tensors="pt", padding=True)
        with torch.no_grad():
            outputs = hf_wav2vec(**inputs)

        torch.testing.assert_close(original_output, outputs.last_hidden_state, atol=5e-3, rtol=5e-3)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model.",
    )
    parser.add_argument(
        "--checkpoint_path", default="conformer_shaw", type=str, help="Path to seamless communication checkpoint"
    )
    parser.add_argument(
        "--config_path",
        default=None,
        type=str,
        help="Path to hf config.json of model to convert",
    )
    parser.add_argument("--repo_id", default=None, type=str, help="Push to this repo id if precised.")
    parser.add_argument(
        "--audio_path",
        default=None,
        type=str,
        help="If specified, check that the original model and the converted model produce the same outputs.",
    )

    args = parser.parse_args()
    # Pass the audio path explicitly so the function does not depend on the global `args`.
    convert_wav2vec2_bert_checkpoint(
        args.checkpoint_path,
        args.pytorch_dump_folder_path,
        args.config_path,
        args.repo_id,
        audio_path=args.audio_path,
    )

`.\models\wav2vec2_bert\modeling_wav2vec2_bert.py`

# coding=utf-8
# Copyright 2024 The Seamless Authors and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Wav2Vec2-BERT model."""

import math  # 导入数学函数库
import warnings  # 导入警告处理模块
from typing import Optional, Tuple, Union  # 导入类型提示模块

import numpy as np  # 导入数值计算库numpy
import torch  # 导入深度学习框架PyTorch
import torch.utils.checkpoint  # 导入PyTorch的checkpoint工具
from torch import nn  # 导入PyTorch的神经网络模块
from torch.nn import CrossEntropyLoss  # 导入交叉熵损失函数

from ...activations import ACT2FN  # 导入激活函数映射
from ...integrations.deepspeed import is_deepspeed_zero3_enabled  # 导入DeepSpeed集成模块
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask  # 导入注意力掩码工具函数
from ...modeling_outputs import (  # 导入模型输出类
    BaseModelOutput,
    CausalLMOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
    Wav2Vec2BaseModelOutput,
    XVectorOutput,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型工具函数
from ...utils import (  # 导入工具函数
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_peft_available,
    logging,
)
from .configuration_wav2vec2_bert import Wav2Vec2BertConfig  # 导入Wav2Vec2-BERT配置

logger = logging.get_logger(__name__)  # module-level logger


_HIDDEN_STATES_START_POSITION = 2  # start position index of the hidden states

# General docstring
_CONFIG_FOR_DOC = "Wav2Vec2BertConfig"  # config class name used in docs

# Base docstring
_BASE_CHECKPOINT_FOR_DOC = "facebook/w2v-bert-2.0"  # base checkpoint name used in docs
_PRETRAINED_CHECKPOINT_FOR_DOC = "hf-audio/wav2vec2-bert-CV16-en"  # pretrained checkpoint name used in docs
_EXPECTED_OUTPUT_SHAPE = [1, 146, 1024]  # expected output shape

# CTC docstring
_CTC_EXPECTED_OUTPUT = "'mr quilter is the apostle of the middle classes and we are glad to welcome his gospel'"  # expected CTC output example
_CTC_EXPECTED_LOSS = 17.04  # expected CTC loss

WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [  # list of pretrained Wav2Vec2-BERT models
    "facebook/w2v-bert-2.0",
    # See all Wav2Vec2-BERT models at https://huggingface.co/models?filter=wav2vec2-bert
]


# Copied from transformers.models.seamless_m4t_v2.modeling_seamless_m4t_v2._compute_new_attention_mask
def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
    """
    Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that
    stops at the corresponding element in `seq_lens`.
    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
            The sequences to mask, where `*` is any number of sequence-specific dimensions including none.
        seq_lens (`torch.Tensor` of shape `(batch)`:
            Each element represents the length of the sequence at the same index in `hidden_states`
    Returns:
        `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)`
    """
    # 获取隐藏状态张量的形状信息，并分别赋给 batch_size 和 mask_seq_len
    batch_size, mask_seq_len = hidden_states.shape[:2]
    
    # 在当前设备上创建一个张量，包含从 0 到 mask_seq_len-1 的整数序列，并扩展为二维的 batch_size 行，mask_seq_len 列的张量
    indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)
    
    # 创建一个布尔掩码张量，其中元素为 True 表示相应位置的索引大于等于 seq_lens 中对应的值，否则为 False
    bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len)
    
    # 创建一个与 hidden_states 张量相同形状的新张量，所有元素初始化为 1
    mask = hidden_states.new_ones((batch_size, mask_seq_len))
    
    # 使用布尔掩码 bool_mask 将 mask 中对应位置的元素置为 0
    mask = mask.masked_fill(bool_mask, 0)
    
    # 返回生成的 mask 张量，用于在序列中标记不需要处理的位置
    return mask
# 从 transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices 复制而来的函数
def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    计算给定形状的随机掩码区间。用于实现 ASR 中的 SpecAugment 数据增强方法。
    注意：此方法未经优化以在 TPU 上运行，应作为训练过程中的预处理步骤在 CPU 上运行。

    Args:
        shape: 要计算掩码的形状。应为一个大小为 2 的元组，第一个元素是批量大小，第二个元素是要跨越的轴的长度。
        mask_prob: 将被掩盖的整个轴的百分比（介于 0 和 1 之间）。由 `mask_prob*shape[1]/mask_length` 计算生成长度为 `mask_length` 的独立掩码区间的数量。
                  由于重叠，`mask_prob` 是一个上限，实际百分比会较小。
        mask_length: 掩码的大小
        min_masks: 最小掩码数量
        attention_mask: （右填充的）注意力掩码，独立地缩短每个批处理维度的特征轴。

    Returns:
        np.ndarray: 一个布尔类型的数组，表示掩码位置的二维数组。
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` 必须大于 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` 必须小于 `sequence_length`，但得到 `mask_length`: {mask_length}"
            f" 和 `sequence_length`: {sequence_length}`"
        )

    # epsilon 用于概率舍入
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """给定输入长度，计算应掩盖多少个区间"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # 确保掩盖的区间数量 <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # 确保 num_masked span 也 <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # 计算批处理中的掩盖区间数量
    input_lengths = (
        attention_mask.sum(-1).detach().tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # SpecAugment 掩码初始化
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    # 计算在序列长度内的最大掩盖区间数
    max_num_masked_span = compute_num_masked_span(sequence_length)
    # 如果最大被遮蔽跨度为0，则直接返回原始的spec_aug_mask
    if max_num_masked_span == 0:
        return spec_aug_mask

    # 遍历每个输入的长度
    for input_length in input_lengths:
        # 计算当前输入的被遮蔽跨度数量
        num_masked_span = compute_num_masked_span(input_length)

        # 随机选择要遮蔽的索引位置
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # 如果没有选中任何索引，则说明所有的输入长度小于mask_length，此时使用最后一个位置作为虚拟遮蔽索引
        if len(spec_aug_mask_idx) == 0:
            # 这种情况只会发生在input_length严格小于sequence_length的情况下，
            # 最后一个token必须是填充token，可以用作虚拟的遮蔽ID
            dummy_mask_idx = sequence_length - 1
        else:
            # 否则使用选中的第一个索引作为虚拟遮蔽索引
            dummy_mask_idx = spec_aug_mask_idx[0]

        # 将虚拟遮蔽索引添加到spec_aug_mask_idx中，以确保所有批次的维度相同
        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        # 将当前批次的遮蔽索引添加到列表中
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    # 将遮蔽索引列表转换为NumPy数组
    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # 将遮蔽索引扩展为遮蔽跨度
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    # 将遮蔽索引重塑为(batch_size, max_num_masked_span * mask_length)
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # 为开始索引添加偏移量，以确保索引现在创建一个跨度
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # 确保遮蔽索引不超过sequence_length - 1
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # 将遮蔽索引应用到spec_aug_mask中
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    # 返回处理后的spec_aug_mask
    return spec_aug_mask
# Copied from transformers.models.wav2vec2.modeling_wav2vec2._sample_negative_indices
# 定义函数 `_sample_negative_indices`，用于从特征向量中采样负向量索引
def _sample_negative_indices(
    features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
):
    """
    Sample `num_negatives` vectors from feature vectors.
    从特征向量中随机采样 `num_negatives` 个向量索引。
    """
    batch_size, sequence_length = features_shape

    # generate indices of the positive vectors themselves, repeat them `num_negatives` times
    # 生成正向量本身的索引，并将其重复 `num_negatives` 次
    sequence_length_range = np.arange(sequence_length)

    # get `num_negatives` random vector indices from the same utterance
    # 从同一话语中获取 `num_negatives` 个随机向量索引
    sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)

    # Convert mask_time_indices to boolean if provided, otherwise create a boolean mask of all True
    # 如果提供了 mask_time_indices，则将其转换为布尔型，否则创建一个全为 True 的布尔掩码
    mask_time_indices = (
        mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
    )

    # Iterate over batches
    # 遍历每个批次
    for batch_idx in range(batch_size):
        # Determine the upper bound for valid indices based on mask_time_indices
        # 基于 mask_time_indices 确定有效索引的上界
        high = mask_time_indices[batch_idx].sum() - 1
        # Get mapped masked indices from sequence_length_range based on mask_time_indices
        # 根据 mask_time_indices 从 sequence_length_range 中获取映射后的掩码索引
        mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]

        # Create a matrix of feature indices broadcasting to shape (high + 1, num_negatives)
        # 创建一个广播到形状 (high + 1, num_negatives) 的特征索引矩阵
        feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
        # Sample `num_negatives` indices randomly within range (0, high)
        # 在范围 (0, high) 内随机采样 `num_negatives` 个索引
        sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
        # Avoid sampling the same positive vector, but maintain uniform distribution
        # 避免采样相同的正向量，但保持均匀分布
        sampled_indices[sampled_indices >= feature_indices] += 1

        # Remap to actual indices
        # 将采样后的索引重新映射到实际索引
        sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]

        # Correct for batch size
        # 校正批次大小
        sampled_negative_indices[batch_idx] += batch_idx * sequence_length

    # Return sampled negative indices
    # 返回采样的负向量索引
    return sampled_negative_indices


# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRotaryPositionalEmbedding with Wav2Vec2Conformer->Wav2Vec2Bert
class Wav2Vec2BertRotaryPositionalEmbedding(nn.Module):
    """Rotary positional embedding
    Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://arxiv.org/pdf/2104.09864.pdf
    """

    def __init__(self, config):
        """Precompute the inverse frequencies from `config` and start with an empty cache."""
        super().__init__()
        head_dim = config.hidden_size // config.num_attention_heads

        # One inverse frequency per pair of channels in an attention head.
        exponents = torch.arange(0, head_dim, 2, dtype=torch.int64).float() / head_dim
        inv_freq = 1.0 / (config.rotary_embedding_base**exponents)
        # Non-persistent buffer: follows the module across devices but is not saved in the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        self.cached_sequence_length = None
        self.cached_rotary_positional_embedding = None

    def forward(self, hidden_states):
        """
        Return the stacked cos/sin embeddings for the sequence length `hidden_states.shape[1]`.

        The result is cached and reused as long as the sequence length does not change.
        """
        sequence_length = hidden_states.shape[1]

        cache_is_valid = (
            sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None
        )
        if not cache_is_valid:
            self.cached_sequence_length = sequence_length

            # Outer product of time steps and inverse frequencies, computed in the dtype of `inv_freq`.
            time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
            angles = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
            # Duplicate the angles along the last dimension.
            angles = torch.cat((angles, angles), dim=-1)

            cos_sin = [angles.cos()[:, None, None, :], angles.sin()[:, None, None, :]]
            # Stack cos and sin and cast to the dtype of the input.
            self.cached_rotary_positional_embedding = torch.stack(cos_sin).type_as(hidden_states)

        return self.cached_rotary_positional_embedding
# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRelPositionalEmbedding with Wav2Vec2Conformer->Wav2Vec2Bert
class Wav2Vec2BertRelPositionalEmbedding(nn.Module):
    """Relative positional encoding module."""

    def __init__(self, config):
        """Read the sizes from `config` and precompute the table for `max_source_positions` steps."""
        super().__init__()
        self.max_len = config.max_source_positions
        self.d_model = config.hidden_size
        self.pe = None
        # Build the initial table from a dummy input whose second dimension is `max_len`.
        self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))

    def extend_pe(self, x):
        """
        Make sure `self.pe` covers an input with `x.size(1)` time steps.

        An existing table that is long enough is kept (only moved to the dtype/device of `x` if they
        differ); otherwise a new table of length `2 * x.size(1) - 1` is built.
        """
        length = x.size(1)

        # `self.pe` holds positive and negative positions, so it must span 2 * length - 1 entries.
        if self.pe is not None and self.pe.size(1) >= length * 2 - 1:
            if self.pe.dtype != x.dtype or self.pe.device != x.device:
                self.pe = self.pe.to(dtype=x.dtype, device=x.device)
            return

        # Let `i` be the query position and `j` the key position. Positive relative positions are
        # used when the key is to the left (i > j), negative ones otherwise.
        position = torch.arange(0, length, dtype=torch.int64).float().unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.d_model)
        )
        forward_angles = position * div_term
        backward_angles = -1 * position * div_term

        pe_positive = torch.zeros(length, self.d_model)
        pe_positive[:, 0::2] = torch.sin(forward_angles)
        pe_positive[:, 1::2] = torch.cos(forward_angles)

        pe_negative = torch.zeros(length, self.d_model)
        pe_negative[:, 0::2] = torch.sin(backward_angles)
        pe_negative[:, 1::2] = torch.cos(backward_angles)

        # Reverse the positive part and append the negative part without its position-0 row; this
        # layout supports the shifting trick of https://arxiv.org/abs/1901.02860
        table = torch.cat([torch.flip(pe_positive, [0]).unsqueeze(0), pe_negative[1:].unsqueeze(0)], dim=1)
        self.pe = table.to(device=x.device, dtype=x.dtype)

    def forward(self, hidden_states: torch.Tensor):
        """Return the slice of the table centred on position 0 that spans `2 * seq_len - 1` entries."""
        self.extend_pe(hidden_states)
        center = self.pe.size(1) // 2
        seq_len = hidden_states.size(1)
        return self.pe[:, center - seq_len + 1 : center + seq_len]


class Wav2Vec2BertFeatureProjection(nn.Module):
    """Layer-normalise the input features, project them to the hidden size, then apply dropout."""

    def __init__(self, config):
        """Build the layers.

        Args:
            config: Model configuration; reads ``feature_projection_input_dim``,
                ``layer_norm_eps``, ``hidden_size`` and ``feat_proj_dropout``.
        """
        super().__init__()
        input_dim = config.feature_projection_input_dim
        self.layer_norm = nn.LayerNorm(input_dim, eps=config.layer_norm_eps)
        self.projection = nn.Linear(input_dim, config.hidden_size)
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        """Project the features.

        Args:
            hidden_states: Tensor whose last dimension is ``feature_projection_input_dim``.

        Returns:
            Tuple ``(projected, normalised)``: the projected tensor after dropout, and the
            layer-normalised (not projected) input.
        """
        norm_hidden_states = self.layer_norm(hidden_states)
        projected = self.dropout(self.projection(norm_hidden_states))
        return projected, norm_hidden_states
class Wav2Vec2BertFeedForward(nn.Module):
    """Two-layer feed-forward network with dropout after the activation and after the output layer."""

    def __init__(self, config, act_fn=None, hidden_size=None):
        """Build the layers.

        Args:
            config: Model configuration; reads ``activation_dropout``, ``intermediate_size`` and
                ``hidden_dropout``, plus ``hidden_act`` / ``hidden_size`` as fallbacks.
            act_fn: Activation, either a key of ``ACT2FN`` or a callable. Defaults to
                ``config.hidden_act``.
            hidden_size: Input and output width. Defaults to ``config.hidden_size``.
        """
        super().__init__()
        if act_fn is None:
            act_fn = config.hidden_act
        if hidden_size is None:
            hidden_size = config.hidden_size

        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        self.intermediate_dense = nn.Linear(hidden_size, config.intermediate_size)
        # Strings are looked up in ACT2FN; anything else is used as the activation directly.
        if isinstance(act_fn, str):
            self.intermediate_act_fn = ACT2FN[act_fn]
        else:
            self.intermediate_act_fn = act_fn

        self.output_dense = nn.Linear(config.intermediate_size, hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    # The forward logic mirrors the Wav2Vec2FeedForward class of the wav2vec2 model in transformers.
    def forward(self, hidden_states):
        """Apply dense -> activation -> dropout -> dense -> dropout and return the result."""
        expanded = self.intermediate_act_fn(self.intermediate_dense(hidden_states))
        expanded = self.intermediate_dropout(expanded)

        return self.output_dropout(self.output_dense(expanded))


class Wav2Vec2BertConvolutionModule(nn.Module):
    """Convolution block used in the conformer block"""

    def __init__(self, config):
        """Build the layers of the convolution block.

        Args:
            config: Model configuration; reads ``conv_depthwise_kernel_size``, ``hidden_size``,
                ``layer_norm_eps``, ``hidden_act`` and ``conformer_conv_dropout``.

        Raises:
            ValueError: If ``config.conv_depthwise_kernel_size`` is even.
        """
        super().__init__()
        if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
            raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # First pointwise (kernel size 1) convolution; doubles the channels so GLU can halve them again.
        self.pointwise_conv1 = nn.Conv1d(
            config.hidden_size,
            2 * config.hidden_size,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )

        self.glu = nn.GLU(dim=1)
        # Depthwise convolution: groups == channels, so every channel is filtered independently.
        # padding=0 here because `forward` pads the sequence on the left itself.
        self.depthwise_conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            config.conv_depthwise_kernel_size,
            stride=1,
            padding=0,
            groups=config.hidden_size,
            bias=False,
        )

        self.depthwise_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.activation = ACT2FN[config.hidden_act]
        # Second pointwise (kernel size 1) convolution.
        self.pointwise_conv2 = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )

        self.dropout = nn.Dropout(config.conformer_conv_dropout)

    def forward(self, hidden_states, attention_mask: Optional[torch.Tensor] = None):
        """Run the convolution block.

        Args:
            hidden_states: Tensor of shape ``(batch, time, hidden_size)``.
            attention_mask: Optional ``(batch, time)`` mask. Positions where it is zero are set
                to 0 before the convolutions so padding does not leak into valid frames.

        Returns:
            Tensor of shape ``(batch, time, hidden_size)``.
        """
        hidden_states = self.layer_norm(hidden_states)

        # Ensure that we do not leak padded positions in the depthwise convolution if an
        # attention mask is passed: zero out every position the mask marks as padding.
        if attention_mask is not None:
            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)

        # Conv1d expects channels on dim 1: (batch, time, hidden) -> (batch, hidden, time)
        hidden_states = hidden_states.transpose(1, 2)

        # Pointwise conv doubles the channels: (batch, 2 * hidden, time)
        hidden_states = self.pointwise_conv1(hidden_states)

        # GLU over the channel dim halves them again: (batch, hidden, time)
        hidden_states = self.glu(hidden_states)

        # Pad the sequence entirely on the left (kernel_size - 1 frames) so the depthwise
        # convolution is causal and keeps the sequence length unchanged.
        hidden_states = torch.nn.functional.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0))

        hidden_states = self.depthwise_conv(hidden_states)

        # LayerNorm works on the last dim, so move channels last for the norm and back afterwards.
        hidden_states = self.depthwise_layer_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        hidden_states = self.activation(hidden_states)

        hidden_states = self.pointwise_conv2(hidden_states)

        hidden_states = self.dropout(hidden_states)

        # Back to (batch, time, hidden)
        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states
    """Construct an Wav2Vec2BertSelfAttention object.
    Can be enhanced with rotary or relative position embeddings.
    """
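    # Self-attention module (Wav2Vec2BertSelfAttention); supports optional rotary or relative position embeddings.
    # NOTE(review): the `class Wav2Vec2BertSelfAttention(...)` header that should precede the docstring above is missing from this excerpt.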

    def __init__(self, config, is_adapter_attention=False):
        """Build the projection layers and, depending on the position-embedding type, extra parameters.

        Args:
            config: Model configuration; reads ``hidden_size`` or ``output_hidden_size``,
                ``num_attention_heads``, ``attention_dropout``, ``position_embeddings_type`` and,
                for ``"relative_key"``, ``left_max_position_embeddings`` /
                ``right_max_position_embeddings``.
            is_adapter_attention: When true, the width is ``config.output_hidden_size`` and no
                position embeddings are used; otherwise ``config.hidden_size`` and
                ``config.position_embeddings_type`` apply.
        """
        super().__init__()

        if is_adapter_attention:
            hidden_size = config.output_hidden_size
            position_embeddings_type = None
        else:
            hidden_size = config.hidden_size
            position_embeddings_type = config.position_embeddings_type

        self.head_size = hidden_size // config.num_attention_heads
        self.num_heads = config.num_attention_heads
        self.position_embeddings_type = position_embeddings_type

        # Query / key / value / output projections, all hidden_size -> hidden_size.
        self.linear_q = nn.Linear(hidden_size, hidden_size)
        self.linear_k = nn.Linear(hidden_size, hidden_size)
        self.linear_v = nn.Linear(hidden_size, hidden_size)
        self.linear_out = nn.Linear(hidden_size, hidden_size)

        self.dropout = nn.Dropout(p=config.attention_dropout)

        if position_embeddings_type == "relative":
            # Linear transformation applied to the positional encodings.
            self.linear_pos = nn.Linear(hidden_size, hidden_size, bias=False)
            # Two learnable per-head biases, used for matrix c and matrix d respectively.
            self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
            self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
        elif position_embeddings_type == "relative_key":
            self.left_max_position_embeddings = config.left_max_position_embeddings
            self.right_max_position_embeddings = config.right_max_position_embeddings
            # One embedding row per distance: left range + right range + the zero distance.
            num_positions = self.left_max_position_embeddings + self.right_max_position_embeddings + 1
            self.distance_embedding = nn.Embedding(num_positions, self.head_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        relative_position_embeddings: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        # Forward pass: takes the input tensor, an optional attention mask, optional relative
        # position embeddings, and a flag requesting the attention weights.
        # NOTE(review): no statements follow this signature in this excerpt — the method body
        # appears to have been lost and must be restored before this file can be imported.

        # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention._apply_rotary_embedding
        # (the marker above belongs to the `_apply_rotary_embedding` method that follows)
    # 对输入的隐藏状态应用旋转嵌入
    def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
        """Rotate ``hidden_states`` using the cos/sin tables in ``relative_position_embeddings``.

        Args:
            hidden_states: Tensor of shape ``(batch_size, sequence_length, hidden_size)`` with
                ``hidden_size == num_heads * head_size``.
            relative_position_embeddings: Indexed as ``[0]`` for the cosine table and ``[1]`` for
                the sine table; only the first ``sequence_length`` entries of each are used.
                Presumably shaped to broadcast against ``(sequence_length, batch, heads,
                head_size)`` — confirm against the module that produces it.

        Returns:
            Tensor of shape ``(batch_size, sequence_length, num_heads * head_size)``.
        """
        batch_size, sequence_length, hidden_size = hidden_states.size()
        half = self.head_size // 2

        cos = relative_position_embeddings[0, :sequence_length, ...]
        sin = relative_position_embeddings[1, :sequence_length, ...]

        # Split into heads, then put the sequence dim first:
        # (batch, seq, heads, head_size) -> (seq, batch, heads, head_size)
        per_head = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
        per_head = per_head.transpose(0, 1)

        # Build the rotated companion: (-second half, first half) along the last dim.
        first_half = per_head[..., :half]
        second_half = per_head[..., half:]
        rotated = torch.cat((-second_half, first_half), dim=-1)

        mixed = (per_head * cos) + (rotated * sin)

        # Undo the transpose and merge the heads back into one feature dim.
        mixed = mixed.transpose(0, 1)
        return mixed.view(batch_size, sequence_length, self.num_heads * self.head_size)

    # 从transformers库中的wav2vec2_conformer模型复制的代码，用于应用相对位置嵌入
    def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
        """Compute scaled attention scores that include relative-position terms.

        Args:
            query: Query tensor laid out as ``(batch, heads, time, head_size)``.
            key: Key tensor; its last two dims are transposed for the content scores.
            relative_position_embeddings: Positional table that is projected with
                ``self.linear_pos`` and reshaped to ``(batch, positions, heads, head_size)``.

        Returns:
            ``(scores_ac + scores_bd) / sqrt(head_size)``, where ``scores_bd`` is shifted and
            cut to its first ``positions // 2 + 1`` columns.
        """
        # 1. project positional embeddings and lay them out as (batch, heads, head_size, positions)
        pos = self.linear_pos(relative_position_embeddings)
        pos = pos.view(relative_position_embeddings.size(0), -1, self.num_heads, self.head_size)
        pos = pos.permute(0, 2, 3, 1)

        # 2. add the per-head biases to the query; the biases are (heads, head_size), so the
        # query is moved to (batch, time, heads, head_size) for the addition and back afterwards
        query_time_first = query.transpose(1, 2)
        q_with_bias_u = (query_time_first + self.pos_bias_u).transpose(1, 2)
        q_with_bias_v = (query_time_first + self.pos_bias_v).transpose(1, 2)

        # 3. attention score: first compute matrix a and matrix c
        scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))

        # 4. then compute matrix b and matrix d
        scores_bd = torch.matmul(q_with_bias_v, pos)

        # 5. shift matrix b and matrix d: prepend a zero column, reinterpret the padded tensor
        # with its last two dims swapped in size, drop the first row and view it back
        batch, heads, q_len, pos_len = scores_bd.size()
        zero_pad = torch.zeros((batch, heads, q_len, 1), device=scores_bd.device, dtype=scores_bd.dtype)
        padded = torch.cat([zero_pad, scores_bd], dim=-1)
        padded = padded.view(batch, heads, pos_len + 1, q_len)
        scores_bd = padded[:, :, 1:].view_as(scores_bd)
        scores_bd = scores_bd[:, :, :, : pos_len // 2 + 1]

        # 6. sum matrices and scale
        return (scores_ac + scores_bd) / math.sqrt(self.head_size)
class Wav2Vec2BertEncoderLayer(nn.Module):
    """Conformer block based on https://arxiv.org/abs/2005.08100."""

    def __init__(self, config):
        """Build the four sub-modules of the block.

        Args:
            config: Model configuration; reads ``hidden_size``, ``layer_norm_eps`` and
                ``attention_dropout`` here and is forwarded to every sub-module.
        """
        super().__init__()
        hidden_size = config.hidden_size
        eps = config.layer_norm_eps

        # First feed-forward module
        self.ffn1_layer_norm = nn.LayerNorm(hidden_size, eps=eps)
        self.ffn1 = Wav2Vec2BertFeedForward(config)

        # Self-attention module
        self.self_attn_layer_norm = nn.LayerNorm(hidden_size, eps=eps)
        self.self_attn_dropout = nn.Dropout(config.attention_dropout)
        self.self_attn = Wav2Vec2BertSelfAttention(config)

        # Convolution module
        self.conv_module = Wav2Vec2BertConvolutionModule(config)

        # Second feed-forward module and the closing layer norm
        self.ffn2_layer_norm = nn.LayerNorm(hidden_size, eps=eps)
        self.ffn2 = Wav2Vec2BertFeedForward(config)
        self.final_layer_norm = nn.LayerNorm(hidden_size, eps=eps)

    def forward(
        self,
        hidden_states,
        attention_mask: Optional[torch.Tensor] = None,
        relative_position_embeddings: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        conv_attention_mask: Optional[torch.Tensor] = None,
    ):
        """Apply feed-forward, self-attention, convolution and feed-forward, each with a residual.

        Args:
            hidden_states: Input tensor of the block.
            attention_mask: Passed through to the self-attention module.
            relative_position_embeddings: Passed through to the self-attention module.
            output_attentions: Passed through to the self-attention module.
            conv_attention_mask: Passed to the convolution module as its ``attention_mask``.

        Returns:
            Tuple ``(hidden_states, attn_weights)``; ``attn_weights`` is whatever the
            self-attention module returned as its second value.
        """
        # 1. First feed-forward module; its output is halved before the residual is added.
        ffn1_out = self.ffn1(self.ffn1_layer_norm(hidden_states))
        hidden_states = ffn1_out * 0.5 + hidden_states

        # 2. Self-attention on the normalised states, dropout, then residual.
        attn_out, attn_weights = self.self_attn(
            hidden_states=self.self_attn_layer_norm(hidden_states),
            attention_mask=attention_mask,
            relative_position_embeddings=relative_position_embeddings,
            output_attentions=output_attentions,
        )
        hidden_states = self.self_attn_dropout(attn_out) + hidden_states

        # 3. Convolution module with residual (the module applies its own layer norm).
        hidden_states = hidden_states + self.conv_module(hidden_states, attention_mask=conv_attention_mask)

        # 4. Second feed-forward module, again halved, followed by the final layer norm.
        ffn2_out = self.ffn2(self.ffn2_layer_norm(hidden_states))
        hidden_states = self.final_layer_norm(ffn2_out * 0.5 + hidden_states)

        return hidden_states, attn_weights
    # 初始化函数，用于创建一个新的神经网络模型实例
    def __init__(self, config):
        """Build the optional positional-embedding module, the dropout and the stack of layers.

        Args:
            config: Model configuration; reads ``position_embeddings_type``, ``hidden_dropout``
                and ``num_hidden_layers`` and is forwarded to every sub-module.
        """
        super().__init__()
        self.config = config

        # "relative" and "rotary" each have a dedicated module; any other value means none.
        position_type = config.position_embeddings_type
        if position_type == "relative":
            embed_positions = Wav2Vec2BertRelPositionalEmbedding(config)
        elif position_type == "rotary":
            embed_positions = Wav2Vec2BertRotaryPositionalEmbedding(config)
        else:
            embed_positions = None
        self.embed_positions = embed_positions

        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layers = nn.ModuleList(Wav2Vec2BertEncoderLayer(config) for _ in range(config.num_hidden_layers))
        # Gradient checkpointing is off by default.
        self.gradient_checkpointing = False

    # 前向传播函数，接收输入状态并进行模型前向计算
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        ):
        # Forward pass of the encoder: takes the hidden states plus an optional attention mask
        # and the three output-control flags above.
        # NOTE(review): no statements follow this signature in this excerpt — the method body
        # appears to have been lost and must be restored before this file can be imported.
class Wav2Vec2BertAdapter(nn.Module):
    """Optional projection followed by a stack of sub-sampling adapter layers with LayerDrop."""

    def __init__(self, config):
        """Build the optional projection and the adapter layers.

        Args:
            config: Model configuration; reads ``output_hidden_size``, ``hidden_size``,
                ``layer_norm_eps``, ``num_adapter_layers``, ``layerdrop``,
                ``adapter_kernel_size`` and ``adapter_stride``.
        """
        super().__init__()
        # feature dim might need to be down-projected
        if config.output_hidden_size != config.hidden_size:
            self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
            self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size, eps=config.layer_norm_eps)
        else:
            self.proj = self.proj_layer_norm = None
        self.layers = nn.ModuleList(Wav2Vec2BertAdapterLayer(config) for _ in range(config.num_adapter_layers))
        # Probability threshold used to skip a layer during training.
        self.layerdrop = config.layerdrop

        self.kernel_size = config.adapter_kernel_size
        self.stride = config.adapter_stride

    def _compute_sub_sample_lengths_from_attention_mask(self, seq_lens):
        """Return the lengths after one strided convolution, or ``None`` if ``seq_lens`` is ``None``.

        Uses ``floor((len + 2 * pad - kernel_size) / stride + 1)`` with ``pad = kernel_size // 2``.
        """
        if seq_lens is None:
            return seq_lens
        # NOTE(review): the adapter layer's convolutions pad with `stride // 2`, not
        # `kernel_size // 2`; the two agree only for some kernel/stride pairs — confirm.
        pad = self.kernel_size // 2
        seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1
        return seq_lens.floor()

    def forward(self, hidden_states, attention_mask=None):
        """Project (if configured) and run the adapter layers.

        Args:
            hidden_states: Tensor with the sequence on dim 1.
            attention_mask: Optional ``(batch, time)`` mask; when given, the number of
                non-masked positions per row is tracked through the layers and handed to each
                layer as ``sub_sampled_lengths``.

        Returns:
            The hidden states after the last adapter layer that was executed.
        """
        # down project hidden_states if necessary
        if self.proj is not None and self.proj_layer_norm is not None:
            hidden_states = self.proj(hidden_states)
            hidden_states = self.proj_layer_norm(hidden_states)

        sub_sampled_lengths = None
        if attention_mask is not None:
            # Row length minus the number of masked positions = number of valid positions.
            sub_sampled_lengths = (attention_mask.size(1) - (1 - attention_mask.int()).sum(1)).to(hidden_states.device)

        for layer in self.layers:
            # One random draw per layer, in training and in evaluation alike.
            layerdrop_prob = torch.rand([])
            if not self.training or (layerdrop_prob > self.layerdrop):
                # Only shrink the tracked lengths when the layer really runs; a skipped layer
                # leaves the sequence untouched, so its lengths must stay untouched as well.
                sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(sub_sampled_lengths)
                hidden_states = layer(
                    hidden_states, attention_mask=attention_mask, sub_sampled_lengths=sub_sampled_lengths
                )

        return hidden_states


class Wav2Vec2BertAdapterLayer(nn.Module):
    """Adapter layer: strided-convolution sub-sampling, self-attention and a feed-forward module."""

    def __init__(self, config):
        """Build the layers.

        Args:
            config: Model configuration; reads ``output_hidden_size``, ``conformer_conv_dropout``,
                ``adapter_kernel_size``, ``adapter_stride``, ``layer_norm_eps`` and
                ``adapter_act`` and is forwarded to the attention and feed-forward modules.
        """
        super().__init__()
        embed_dim = config.output_hidden_size
        dropout = config.conformer_conv_dropout

        self.kernel_size = config.adapter_kernel_size
        self.stride = config.adapter_stride

        # 1. residual convolution: sub-samples the residual branch; it emits 2 * embed_dim
        # channels that the GLU below gates back down to embed_dim
        self.residual_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.residual_conv = nn.Conv1d(
            embed_dim,
            2 * embed_dim,
            self.kernel_size,
            stride=self.stride,
            padding=self.stride // 2,
        )

        # Gated linear unit over the channel dim, shared by both convolution branches.
        self.activation = nn.GLU(dim=1)

        # Self-Attention: the same kind of strided convolution sub-samples the attention input
        self.self_attn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.self_attn_conv = nn.Conv1d(
            embed_dim,
            2 * embed_dim,
            self.kernel_size,
            stride=self.stride,
            padding=self.stride // 2,
        )

        self.self_attn = Wav2Vec2BertSelfAttention(config, is_adapter_attention=True)

        self.self_attn_dropout = nn.Dropout(dropout)

        # Feed-forward
        self.ffn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.ffn = Wav2Vec2BertFeedForward(config, act_fn=config.adapter_act, hidden_size=embed_dim)

    def forward(
        self,
        hidden_states,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        sub_sampled_lengths: Optional[torch.Tensor] = None,
    ):
        """Sub-sample ``hidden_states`` and apply self-attention and the feed-forward module.

        NOTE(review): the ``def`` line of this method was missing from the excerpt; the
        parameters are reconstructed from the names used in the body and from the keyword call
        in ``Wav2Vec2BertAdapter.forward`` — confirm their order against the upstream file.

        Args:
            hidden_states: Tensor of shape ``(batch, seq_len, feature_dim)``.
            attention_mask: When not ``None``, a new mask is rebuilt from ``sub_sampled_lengths``
                for the sub-sampled sequence and expanded to 4-D for the attention module.
            output_attentions: Passed through to the self-attention module.
            sub_sampled_lengths: Per-row lengths after sub-sampling; only read when
                ``attention_mask`` is given.

        Returns:
            The transformed hidden states with the sub-sampled sequence length.
        """
        residual = self.residual_layer_norm(hidden_states)

        # Apply pooling to the residual to match the sequence length of the
        # multi-head attention output.
        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
        residual = residual.transpose(1, 2)
        residual = self.residual_conv(residual)
        residual = self.activation(residual)
        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
        residual = residual.transpose(1, 2)

        hidden_states = self.self_attn_layer_norm(hidden_states)
        # Apply pooling before feeding to the multihead-attention layer.
        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
        hidden_states = hidden_states.transpose(1, 2)
        hidden_states = self.self_attn_conv(hidden_states)
        hidden_states = self.activation(hidden_states)
        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
        hidden_states = hidden_states.transpose(1, 2)

        # The incoming mask refers to the un-pooled sequence, so rebuild it for the new length.
        if attention_mask is not None:
            attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths)
            attention_mask = _prepare_4d_attention_mask(
                attention_mask,
                hidden_states.dtype,
            )

        # The rest of the computation is identical to a vanilla Transformer encoder layer.
        hidden_states, attn_weights = self.self_attn(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = self.self_attn_dropout(hidden_states)
        hidden_states = hidden_states + residual

        residual = hidden_states

        hidden_states = self.ffn_layer_norm(hidden_states)
        hidden_states = self.ffn(hidden_states) + residual

        return hidden_states
# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerPreTrainedModel
# with Wav2Vec2Conformer->Wav2Vec2Bert, wav2vec2_conformer->wav2vec2_bert, input_values->input_features

class Wav2Vec2BertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Wav2Vec2BertConfig
    base_model_prefix = "wav2vec2_bert"
    main_input_name = "input_features"
    supports_gradient_checkpointing = True

    # Ignore copy
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, Wav2Vec2BertSelfAttention):
            # The positional biases only exist for some position-embedding types.
            if hasattr(module, "pos_bias_u"):
                nn.init.xavier_uniform_(module.pos_bias_u)
            if hasattr(module, "pos_bias_v"):
                nn.init.xavier_uniform_(module.pos_bias_v)
        elif isinstance(module, Wav2Vec2BertFeatureProjection):
            # Uniform in [-k, k] with k = sqrt(1 / in_features), for weight and bias alike.
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)

    # Ignore copy
    def _get_feat_extract_output_lengths(
        self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
    ):
        """
        Computes the output length of the convolutional layers

        Args:
            input_lengths: Length(s) before the adapter convolutions.
            add_adapter: Whether to account for the adapter layers; defaults to
                ``self.config.add_adapter`` when ``None``.

        Returns:
            ``input_lengths`` unchanged when no adapter is applied, otherwise the lengths after
            ``config.num_adapter_layers`` strided convolutions.
        """
        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter

        def _conv_out_length(input_length, kernel_size, stride, padding):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length + 2 * padding - kernel_size, stride, rounding_mode="floor") + 1

        if add_adapter:
            padding = self.config.adapter_kernel_size // 2
            for _ in range(self.config.num_adapter_layers):
                input_lengths = _conv_out_length(
                    input_lengths, self.config.adapter_kernel_size, self.config.adapter_stride, padding
                )

        return input_lengths

    def _get_feature_vector_attention_mask(
        self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
    ):
        """Build a boolean mask of shape ``(batch_size, feature_vector_length)`` for the output features.

        Args:
            feature_vector_length: Size of the second dimension of the returned mask.
            attention_mask: Input-level mask with one row per batch element.
            add_adapter: Forwarded to ``_get_feat_extract_output_lengths``.

        Returns:
            Boolean tensor that is ``True`` for every position before each row's output length.
        """
        # Effectively attention_mask.sum(-1), but not inplace to be able to run
        # in inference mode.
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]

        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
        output_lengths = output_lengths.to(torch.long)

        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )

        # Mark the last valid position of every row ...
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1

        # ... then a reversed cumulative sum spreads that mark over all positions before it.
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()

        return attention_mask
# Shared class-level docstring for the Wav2Vec2Bert models: general description, the referenced
# paper and the `config` parameter.
WAV2VEC2_BERT_START_DOCSTRING = r"""
    Wav2Vec2Bert was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
    Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
    Auli.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving etc.).

    This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) sub-class. Use it as a
    regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior.

    Parameters:
        config ([`Wav2Vec2BertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Shared docstring describing the arguments of the models' `forward` methods.
# NOTE(review): the text below documents `input_features` as a 2-D raw waveform, but the model's
# forward pass feeds it to a LayerNorm over `config.feature_projection_input_dim` and unpacks
# three dimensions afterwards — the documented shape looks stale; confirm and update.
WAV2VEC2_BERT_INPUTS_DOCSTRING = r"""
    Args:
        input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Bare model: feature projection, optional masking of the projected features, encoder,
# optional intermediate feed-forward module and optional adapter.
# NOTE(review): the `class` line below was missing from the excerpt. The name comes from
# `Wav2Vec2BertModel(config)` in `Wav2Vec2BertForCTC.__init__`; the base class is inferred from
# `super().__init__(config)` / `self.post_init()` — confirm against the upstream file.
@add_start_docstrings(
    "The bare Wav2Vec2Bert Model transformer outputting raw hidden-states without any specific head on top.",
    WAV2VEC2_BERT_START_DOCSTRING,
)
class Wav2Vec2BertModel(Wav2Vec2BertPreTrainedModel):
    def __init__(self, config: Wav2Vec2BertConfig):
        # Args:
        #     config: Model configuration; reads `mask_time_prob`, `mask_feature_prob`,
        #         `hidden_size`, `add_adapter` and `use_intermediate_ffn_before_adapter`, and is
        #         forwarded to every sub-module.
        super().__init__(config)
        self.config = config
        self.feature_projection = Wav2Vec2BertFeatureProjection(config)

        # The mask embedding is only created when one of the masking probabilities is positive.
        # model only needs masking vector if mask prob is > 0.0
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())

        self.encoder = Wav2Vec2BertEncoder(config)

        # `self.adapter` stays None unless the config asks for an adapter.
        self.adapter = Wav2Vec2BertAdapter(config) if config.add_adapter else None

        # Optional feed-forward module (ReLU activation) applied between encoder and adapter.
        self.intermediate_ffn = None
        if config.use_intermediate_ffn_before_adapter:
            self.intermediate_ffn = Wav2Vec2BertFeedForward(config, act_fn="relu")

        # Initialize weights and apply final processing
        self.post_init()
    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).

        NOTE(review): the ``def`` line of this method was missing from the excerpt; the
        parameters are reconstructed from the names used in the body and from the keyword call
        in ``forward`` — confirm against the upstream file.

        Args:
            hidden_states: Tensor of shape ``(batch_size, sequence_length, hidden_size)``; it is
                modified in place.
            mask_time_indices: Optional index mask; the selected positions are overwritten with
                ``self.masked_spec_embed``. When ``None``, a mask is sampled only in training
                mode and only if ``config.mask_time_prob > 0``.
            attention_mask: Forwarded to ``_compute_mask_indices`` when a time mask is sampled.

        Returns:
            The same ``hidden_states`` tensor after masking.
        """

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            # compute mask indices for time axis if not provided
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            # Repeat the per-feature mask over every time step; masked features are zeroed.
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states

    @add_start_docstrings_to_model_forward(WAV2VEC2_BERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_PRETRAINED_CHECKPOINT_FOR_DOC,
        output_type=Wav2Vec2BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        input_features: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
        # Flags left as None fall back to the values stored in the model config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Project the input features; the second value is the normalised, un-projected input.
        hidden_states, extract_features = self.feature_projection(input_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first encoder output is the final hidden state.
        hidden_states = encoder_outputs[0]

        # Optional feed-forward module, added to the encoder output at half weight.
        if self.intermediate_ffn:
            expanded_hidden_states = self.intermediate_ffn(hidden_states)
            hidden_states = hidden_states + 0.5 * expanded_hidden_states

        if self.adapter is not None:
            hidden_states = self.adapter(hidden_states, attention_mask=attention_mask)

        if return_dict:
            return Wav2Vec2BaseModelOutput(
                last_hidden_state=hidden_states,
                extract_features=extract_features,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: final hidden states, extracted features, then the remaining encoder outputs.
        return (hidden_states, extract_features) + encoder_outputs[1:]
# Wav2Vec2Bert encoder with a linear head on top, trained with Connectionist Temporal Classification (CTC).
@add_start_docstrings(
    """Wav2Vec2Bert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
    WAV2VEC2_BERT_START_DOCSTRING,
)
class Wav2Vec2BertForCTC(Wav2Vec2BertPreTrainedModel):
    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForCTC.__init__ with Wav2Vec2Conformer->Wav2Vec2Bert,WAV2VEC2_CONFORMER->WAV2VEC2_BERT,wav2vec2_conformer->wav2vec2_bert
    def __init__(self, config, target_lang: Optional[str] = None):
        super().__init__(config)

        # Base encoder, followed by dropout applied to its output in `forward`
        self.wav2vec2_bert = Wav2Vec2BertModel(config)
        self.dropout = nn.Dropout(config.final_dropout)

        # Only stored on the instance; nothing else in this class reads it
        self.target_lang = target_lang

        # The output projection cannot be sized without a vocabulary size
        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that "
                "does not define the vocabulary size of the language model head. Please "
                "instantiate the model as follows: `Wav2Vec2BertForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
                "or define `vocab_size` of your model's configuration."
            )

        # The head reads `output_hidden_size` features when the config enables the adapter, `hidden_size` otherwise
        output_hidden_size = (
            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
        )
        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(WAV2VEC2_BERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_PRETRAINED_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_CTC_EXPECTED_OUTPUT,
        expected_loss=_CTC_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_features: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, CausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        """

        # Fall back to the configuration default when the caller does not choose an output format
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.wav2vec2_bert(
            input_features,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 0 holds the last hidden state for both the tuple and the dict-style output
        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)

        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Valid label ids are `0 .. vocab_size - 1`, so `vocab_size` itself is already out of range
            if labels.max() >= self.config.vocab_size:
                raise ValueError(f"Label values must be < vocab_size: {self.config.vocab_size}")

            # Without a mask every frame of every sample counts as valid
            attention_mask = (
                attention_mask
                if attention_mask is not None
                else torch.ones(input_features.shape[:2], device=input_features.device, dtype=torch.long)
            )
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum([-1])).to(torch.long)

            # Negative labels (e.g. the `-100` padding value) are excluded from the targets
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            # Log-probabilities are computed in float32 and transposed to the (time, batch, vocab) layout
            # that `ctc_loss` expects
            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            # The loss is evaluated with cuDNN disabled.
            # NOTE(review): the motivation is not visible in this file — confirm before removing.
            with torch.backends.cudnn.flags(enabled=False):
                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        # Tuple output: (loss,) if available, then logits, then the encoder extras
        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
# Wav2Vec2Bert encoder with a sequence classification head on top (a linear layer over the pooled output),
# for tasks like SUPERB Keyword Spotting.
@add_start_docstrings(
    """
    Wav2Vec2Bert Model with a sequence classification head on top (a linear layer over the pooled output) for
    tasks like SUPERB Keyword Spotting.
    """,
    WAV2VEC2_BERT_START_DOCSTRING,
)
class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel):
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert
    def __init__(self, config):
        super().__init__(config)

        # This head is defined on the un-adapted encoder output only
        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Sequence classification does not support the use of Wav2Vec2Bert adapters (config.add_adapter=True)"
            )

        self.wav2vec2_bert = Wav2Vec2BertModel(config)

        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings

        # One learnable mixing weight per layer output, initialised uniformly
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)

        # Frame-wise projection applied before pooling, then the classifier on the pooled vector
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wav2vec2_bert.parameters():
            param.requires_grad = False

    @add_start_docstrings_to_model_forward(WAV2VEC2_BERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_BASE_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
    )
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Bert,wav2vec2->wav2vec2_bert,WAV_2_VEC_2->WAV2VEC2_BERT, input_values->input_features
    def forward(
        self,
        input_features: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. A classification loss is computed (Cross-Entropy).
        """
        # Fall back to the configuration default when the caller does not choose an output format
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted layer sum needs every layer output, so hidden states are forced on in that mode
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        outputs = self.wav2vec2_bert(
            input_features,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            # Stack the per-layer outputs on a new axis 1 and mix them with softmax-normalised weights
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            # Index 0 holds the last hidden state
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)

        if attention_mask is None:
            # No mask: plain mean over axis 1
            pooled_output = hidden_states.mean(dim=1)
        else:
            # `_get_feature_vector_attention_mask` is defined outside this block; presumably it returns a
            # boolean frame-level validity mask — TODO confirm
            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            # Zero the masked-out frames in place, then average over the kept frames only
            hidden_states[~padding_mask] = 0.0
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        # Tuple output: (loss,) if available, then logits, then the encoder extras
        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Wav2Vec2Bert encoder with a per-frame classification head on top.
@add_start_docstrings(
    """
    Wav2Vec2Bert Model with a frame classification head on top for tasks like Speaker Diarization.
    """,
    WAV2VEC2_BERT_START_DOCSTRING,
)
class Wav2Vec2BertForAudioFrameClassification(Wav2Vec2BertPreTrainedModel):
    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForAudioFrameClassification.__init__ with Wav2Vec2Conformer->Wav2Vec2Bert,WAV2VEC2_CONFORMER->WAV2VEC2_BERT,wav2vec2_conformer->wav2vec2_bert
    def __init__(self, config):
        super().__init__(config)

        # This head is defined on the un-adapted encoder output only
        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Audio frame classification does not support the use of Wav2Vec2Bert adapters (config.add_adapter=True)"
            )

        self.wav2vec2_bert = Wav2Vec2BertModel(config)

        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        # One learnable mixing weight per layer output, initialised uniformly
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)

        # Applied to every frame independently: hidden_size -> num_labels
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels

        self.init_weights()

    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForAudioFrameClassification.freeze_base_model with wav2vec2_conformer->wav2vec2_bert
    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wav2vec2_bert.parameters():
            param.requires_grad = False

    @add_start_docstrings_to_model_forward(WAV2VEC2_BERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_BASE_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
    )
    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForAudioFrameClassification.forward with wav2vec2_conformer->wav2vec2_bert, input_values->input_features
    def forward(
        self,
        input_features: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.Tensor`, *optional*):
            Per-frame targets for the classification loss. They are reshaped to `(-1, config.num_labels)` and reduced
            with `argmax` over the last axis before the Cross-Entropy loss is computed, so every frame is expected to
            carry one value per class (e.g. a one-hot vector).
        """

        # Fall back to the configuration default when the caller does not choose an output format
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted layer sum needs every layer output, so hidden states are forced on in that mode
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        outputs = self.wav2vec2_bert(
            input_features,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            # Stack the per-layer outputs on a new axis 1 and mix them with softmax-normalised weights
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            # Index 0 holds the last hidden state
            hidden_states = outputs[0]

        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Class indices are recovered from the per-class label values with argmax
            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            # Return the loss first when it was computed, like the other task heads in this file;
            # previously it was computed and then silently dropped on this path
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Additive-margin softmax (AM-Softmax) loss with its own learnable class-weight matrix.
class AMSoftmaxLoss(nn.Module):
    def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
        """
        Args:
            input_dim: Number of features of each input vector (rows of the weight matrix).
            num_labels: Number of classes (columns of the weight matrix).
            scale: Factor the margin-adjusted cosines are multiplied by before the cross-entropy.
            margin: Value subtracted from the cosine of the target class.
        """
        super().__init__()
        self.scale = scale
        self.margin = margin
        self.num_labels = num_labels
        # Randomly initialised, trainable class weights of shape (input_dim, num_labels)
        self.weight = nn.Parameter(torch.randn(input_dim, num_labels))
        self.loss = nn.CrossEntropyLoss()

    def forward(self, hidden_states, labels):
        """
        Return the cross-entropy over scaled cosine logits, with `margin` subtracted at each target class.

        `hidden_states` must be 2D (it is multiplied with the weight via `torch.mm`); `labels` is flattened
        and must hold class indices usable by `one_hot(..., num_labels)`.
        """
        targets = labels.flatten()

        # Unit-normalise every class column and every input row so their product is a cosine similarity
        unit_weight = nn.functional.normalize(self.weight, dim=0)
        unit_states = nn.functional.normalize(hidden_states, dim=1)
        cosine = torch.mm(unit_states, unit_weight)

        # Only the entry of the target class is penalised by the margin
        is_target = nn.functional.one_hot(targets, self.num_labels).bool()
        logits = self.scale * torch.where(is_target, cosine - self.margin, cosine)

        return self.loss(logits, targets)


# One time-delay neural network (TDNN) layer: a linear kernel evaluated as a dilated 1D convolution, then ReLU.
class TDNNLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        """
        Args:
            config: Object exposing the sequences `tdnn_dim`, `tdnn_kernel` and `tdnn_dilation`.
            layer_id: Index of this layer inside those sequences.
        """
        super().__init__()
        # The first layer uses its own width as input width; later layers consume the previous layer's width
        if layer_id > 0:
            self.in_conv_dim = config.tdnn_dim[layer_id - 1]
        else:
            self.in_conv_dim = config.tdnn_dim[layer_id]
        self.out_conv_dim = config.tdnn_dim[layer_id]
        self.kernel_size = config.tdnn_kernel[layer_id]
        self.dilation = config.tdnn_dilation[layer_id]

        # The parameters live in a linear layer; `forward` reshapes its weight into a convolution kernel
        self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
        self.activation = nn.ReLU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the dilated convolution over axis 1 of `hidden_states` and return the ReLU-activated result."""
        # The convolution below reads `self.kernel.weight` directly, so a LoRA wrapper around the kernel is bypassed
        if is_peft_available():
            from peft.tuners.lora import LoraLayer

            if isinstance(self.kernel, LoraLayer):
                warnings.warn(
                    "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
                    "You should exclude TDNNLayer from LoRA's target modules.",
                )

        # Linear weight (out, kernel * in) -> convolution weight (out, in, kernel)
        conv_weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)

        # conv1d convolves over the last axis, so axes 1 and 2 are swapped around the call
        conv_input = hidden_states.transpose(1, 2)
        conv_output = nn.functional.conv1d(conv_input, conv_weight, self.kernel.bias, dilation=self.dilation)

        return self.activation(conv_output.transpose(1, 2))


@add_start_docstrings(
    """
    用于 Speaker Verification 等任务的 Wav2Vec2Bert 模型，顶部带有 XVector 特征提取头。
    """,
    WAV2VEC2_BERT_START_DOCSTRING,
)
# Wav2Vec2Bert encoder followed by a TDNN stack, mean/std statistic pooling and an AM-Softmax objective.
class Wav2Vec2BertForXVector(Wav2Vec2BertPreTrainedModel):
    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForXVector.__init__ with Wav2Vec2Conformer->Wav2Vec2Bert,WAV2VEC2_CONFORMER->WAV2VEC2_BERT,wav2vec2_conformer->wav2vec2_bert
    def __init__(self, config):
        super().__init__(config)

        self.wav2vec2_bert = Wav2Vec2BertModel(config)

        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        # One learnable mixing weight per layer output, initialised uniformly
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)

        # Brings the encoder width to the width of the first TDNN layer
        self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])

        # One TDNN layer per entry of `config.tdnn_dim`
        self.tdnn = nn.ModuleList([TDNNLayer(config, layer_idx) for layer_idx, _ in enumerate(config.tdnn_dim)])

        # Pooling concatenates mean and std, hence twice the last TDNN width as input
        self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
        # Keeps the embedding width: xvector_output_dim -> xvector_output_dim
        self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)

        # Training objective; it owns the projection from embedding width to `config.num_labels`
        self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)

        self.init_weights()

    def freeze_base_model(self):
        """
        Turn off gradient computation for every parameter of the base model (`self.wav2vec2_bert`), so that only
        the layers added on top of it keep being updated during training.
        """
        for base_param in self.wav2vec2_bert.parameters():
            base_param.requires_grad = False

    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Return the sequence length(s) left after the TDNN stack, given the length(s) entering it.
        """
        # Per layer: 1D convolution output length from
        # https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html with stride 1 and no padding.
        # NOTE(review): `config.tdnn_dilation` is not part of this formula although `TDNNLayer.forward`
        # convolves with dilation, so the result looks too large when a dilation is > 1 — confirm whether
        # that is intended before changing it (it affects the masked pooling in `forward`).
        stride = 1
        for kernel_size in self.config.tdnn_kernel:
            input_lengths = (input_lengths - kernel_size) // stride + 1

        return input_lengths

    @add_start_docstrings_to_model_forward(WAV2VEC2_BERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_BASE_CHECKPOINT_FOR_DOC,
        output_type=XVectorOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
    )
    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForXVector.forward with wav2vec2_conformer->wav2vec2_bert, input_values->input_features
    def forward(
        self,
        input_features: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, XVectorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Class indices in `[0, ..., config.num_labels - 1]`. When given, they are passed together with the logits
            to the AM-Softmax objective to compute the loss.
        """

        # Fall back to the configuration default when the caller does not choose an output format
        if return_dict is None:
            return_dict = self.config.use_return_dict
        # The weighted layer sum needs every layer output, so hidden states are forced on in that mode
        if self.config.use_weighted_layer_sum:
            output_hidden_states = True

        outputs = self.wav2vec2_bert(
            input_features,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            # Stack the per-layer outputs on a new axis 1 and mix them with softmax-normalised weights
            stacked_layers = torch.stack(outputs[_HIDDEN_STATES_START_POSITION], dim=1)
            layer_probs = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (stacked_layers * layer_probs.view(-1, 1, 1)).sum(dim=1)
        else:
            # Index 0 holds the last hidden state
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)

        for tdnn_layer in self.tdnn:
            hidden_states = tdnn_layer(hidden_states)

        # Statistic pooling: mean and standard deviation over axis 1, per sample
        if attention_mask is None:
            mean_features = hidden_states.mean(dim=1)
            std_features = hidden_states.std(dim=1)
        else:
            # With a mask, only the leading frames counted by the length helpers enter the statistics
            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
            kept_frames = [hidden_states[i, :length] for i, length in enumerate(tdnn_output_lengths)]
            mean_features = torch.stack([frames.mean(dim=0) for frames in kept_frames])
            std_features = torch.stack([frames.std(dim=0) for frames in kept_frames])
        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)

        output_embeddings = self.feature_extractor(statistic_pooling)
        logits = self.classifier(output_embeddings)

        loss = None if labels is None else self.objective(logits, labels)

        # Tuple output: (loss,) if available, then logits, embeddings and the encoder extras
        if not return_dict:
            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
            if loss is None:
                return output
            return (loss,) + output

        return XVectorOutput(
            loss=loss,
            logits=logits,
            embeddings=output_embeddings,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`.\models\wav2vec2_bert\processing_wav2vec2_bert.py`

# 设置编码格式为 UTF-8
# 版权声明：2024 年 HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证 2.0 版本授权，除非符合许可证的规定，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“现状”提供软件，
# 没有任何明示或暗示的担保或条件。请参阅许可证以了解详细信息。
"""
Wav2Vec2-BERT 的语音处理器类
"""
# 引入警告模块
import warnings

# 导入处理工具函数
from ...processing_utils import ProcessorMixin
# 导入特征提取模块
from ..seamless_m4t.feature_extraction_seamless_m4t import SeamlessM4TFeatureExtractor
# 导入 Wav2Vec2 CTC 分词器
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer


class Wav2Vec2BertProcessor(ProcessorMixin):
    r"""
    Constructs a Wav2Vec2-BERT processor which wraps a Wav2Vec2-BERT feature extractor and a Wav2Vec2 CTC tokenizer
    into a single processor.

    [`Wav2Vec2BertProcessor`] offers all the functionalities of [`SeamlessM4TFeatureExtractor`] and
    [`PreTrainedTokenizer`]. See the docstring of [`~Wav2Vec2BertProcessor.__call__`] and
    [`~Wav2Vec2BertProcessor.decode`] for more information.

    Args:
        feature_extractor (`SeamlessM4TFeatureExtractor`):
            An instance of [`SeamlessM4TFeatureExtractor`]. The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
    """

    feature_extractor_class = "SeamlessM4TFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Load a processor through the parent implementation; if that raises `OSError`, emit a `FutureWarning` and
        build the processor from a separately loaded [`SeamlessM4TFeatureExtractor`] and [`Wav2Vec2CTCTokenizer`].
        """
        try:
            processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        except OSError:
            warnings.warn(
                f"Loading a tokenizer inside {cls.__name__} from a config that does not"
                " include a `tokenizer_class` attribute is deprecated and will be "
                "removed in v5. Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'`"
                " attribute to either your `config.json` or `tokenizer_config.json` "
                "file to suppress this warning: ",
                FutureWarning,
            )

            # Fallback: load both components explicitly with the concrete classes
            feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
            processor = cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

        return processor

    def __call__(self, audio=None, text=None, **kwargs):
        """
        Prepare audio and/or text for the model. `audio` (with `sampling_rate` and the remaining `kwargs`) is sent
        to the feature extractor's [`~SeamlessM4TFeatureExtractor.__call__`]; `text` (with the remaining `kwargs`)
        is sent to the tokenizer's [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of these two
        methods for more information.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as a list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The audio or batch of audios to be prepared. Each audio can be a NumPy array or PyTorch tensor. In case
                of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
                and T the sample length of the audio.
            kwargs (*optional*):
                Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the
                tokenizer. `sampling_rate` is removed from it and handed to the feature extractor only.
        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
            - **input_features** -- Audio input features to be fed to a model. Returned when `audio` is not `None`.
            - **attention_mask** -- List of indices specifying which timestamps should be attended to by the model
              when `audio` is not `None`. When only `text` is specified, returns the token attention mask.
            - **labels** -- List of token ids to be fed to a model. Returned when both `text` and `audio` are not
              `None`.
            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None` and `audio`
              is `None`.
        """
        # `sampling_rate` only concerns the feature extractor, so it is taken out of the shared kwargs
        sampling_rate = kwargs.pop("sampling_rate", None)

        has_audio = audio is not None
        has_text = text is not None
        if not has_audio and not has_text:
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        # Audio only: return the extracted features as they are
        if not has_text:
            return self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs)

        # Text only: return the tokenizer output as it is
        if not has_audio:
            return self.tokenizer(text, **kwargs)

        # Both: the token ids become the `labels` entry of the audio features
        inputs = self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs)
        encodings = self.tokenizer(text, **kwargs)
        inputs["labels"] = encodings["input_ids"]
        return inputs

    def pad(self, input_features=None, labels=None, **kwargs):
        """
        If `input_features` is not `None`, this method forwards `input_features` and `kwargs` to the feature
        extractor's `pad` method. If `labels` is not `None`, it forwards `labels` and `kwargs` to the tokenizer's
        `pad` method. When both are given, the padded `input_ids` of the labels are stored under the `"labels"` key
        of the padded features, which are then returned. Please refer to the docstring of the two `pad` methods for
        more information.
        """
        has_features = input_features is not None
        has_labels = labels is not None
        if not has_features and not has_labels:
            raise ValueError("You need to specify either an `input_features` or `labels` input to pad.")

        # Features only
        if not has_labels:
            return self.feature_extractor.pad(input_features, **kwargs)

        # Labels only
        if not has_features:
            return self.tokenizer.pad(labels, **kwargs)

        # Both: attach the padded label ids to the padded features
        padded_features = self.feature_extractor.pad(input_features, **kwargs)
        padded_labels = self.tokenizer.pad(labels, **kwargs)
        padded_features["labels"] = padded_labels["input_ids"]
        return padded_features

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

`.\models\wav2vec2_bert\__init__.py`

# 导入类型检查模块
from typing import TYPE_CHECKING

# 导入自定义异常和延迟加载模块
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# Import structure consumed by `_LazyModule`: submodule name -> public names it exports
_import_structure = {
    "configuration_wav2vec2_bert": [
        "WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Wav2Vec2BertConfig",
    ],
    "processing_wav2vec2_bert": ["Wav2Vec2BertProcessor"],
}

# The modeling submodule is only registered when torch is available
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: register the model classes
    _import_structure["modeling_wav2vec2_bert"] = [
        "WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Wav2Vec2BertForAudioFrameClassification",
        "Wav2Vec2BertForCTC",
        "Wav2Vec2BertForSequenceClassification",
        "Wav2Vec2BertForXVector",
        "Wav2Vec2BertModel",
        "Wav2Vec2BertPreTrainedModel",
    ]

# Under static type checking, import the real names eagerly so type checkers can resolve them
if TYPE_CHECKING:
    from .configuration_wav2vec2_bert import (
        WAV2VEC2_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        Wav2Vec2BertConfig,
    )
    from .processing_wav2vec2_bert import Wav2Vec2BertProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_wav2vec2_bert import (
            WAV2VEC2_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            Wav2Vec2BertForAudioFrameClassification,
            Wav2Vec2BertForCTC,
            Wav2Vec2BertForSequenceClassification,
            Wav2Vec2BertForXVector,
            Wav2Vec2BertModel,
            Wav2Vec2BertPreTrainedModel,
        )

# At runtime, replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\wav2vec2_conformer\configuration_wav2vec2_conformer.py`

# 设置文件编码为 UTF-8

# 版权声明，声明代码版权归 Fairseq 作者和 HuggingFace 团队所有，保留所有权利

# 根据 Apache 许可证 2.0 版本，除非符合许可证的规定，否则不得使用此文件
# 您可以在以下网址获取许可证副本：http://www.apache.org/licenses/LICENSE-2.0

# 除非适用法律要求或书面同意，否则本软件按“原样”分发，不提供任何形式的担保或条件
# 请参阅许可证获取更多信息

""" Wav2Vec2Conformer 模型配置"""

# 导入 functools 和 operator 模块
import functools
import operator

# 从配置工具中导入 PretrainedConfig 类
from ...configuration_utils import PretrainedConfig
# 从工具模块中导入日志记录
from ...utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# Maps each pretrained checkpoint name to the URL of its `config.json`
WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/wav2vec2-conformer-rel-pos-large": (
        "https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large/resolve/main/config.json"
    ),
}


class Wav2Vec2ConformerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Wav2Vec2ConformerModel`]. It is used to
    instantiate a Wav2Vec2Conformer model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a configuration similar to that of the
    Wav2Vec2Conformer
    [facebook/wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every constructor argument is stored on the instance under the same name. The tuple-valued arguments
    (`conv_dim`, `conv_stride`, `conv_kernel`, `tdnn_dim`, `tdnn_kernel`, `tdnn_dilation`) are stored as lists, and
    `output_hidden_size` falls back to `hidden_size` when it is not given.

    Raises:
        ValueError: if `conv_dim`, `conv_stride` and `conv_kernel` do not all have the same length.

    Example:

    ```
    >>> from transformers import Wav2Vec2ConformerConfig, Wav2Vec2ConformerModel

    >>> # Initializing a Wav2Vec2Conformer facebook/wav2vec2-conformer-rel-pos-large style configuration
    >>> configuration = Wav2Vec2ConformerConfig()

    >>> # Initializing a model (with random weights) from the facebook/wav2vec2-conformer-rel-pos-large style configuration
    >>> model = Wav2Vec2ConformerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "wav2vec2-conformer"

    def __init__(
        self,
        vocab_size=None,  # vocabulary size
        hidden_size=768,  # hidden size of the encoder
        num_hidden_layers=12,  # number of hidden layers in the encoder
        num_attention_heads=12,  # number of attention heads
        intermediate_size=3072,  # size of the feed-forward intermediate layer
        hidden_act="gelu",  # hidden activation function
        hidden_dropout=0.1,  # dropout rate of the hidden layers
        activation_dropout=0.1,  # dropout rate applied after the activation
        attention_dropout=0.1,  # dropout rate of the attention layers
        feat_proj_dropout=0.0,  # dropout rate of the feature projection
        feat_quantizer_dropout=0.0,  # dropout rate of the feature quantizer
        final_dropout=0.1,  # dropout rate of the final output layer
        layerdrop=0.1,  # LayerDrop rate
        initializer_range=0.02,  # range used for parameter initialization
        layer_norm_eps=1e-5,  # epsilon of the layer normalization
        feat_extract_norm="group",  # normalization used in the feature extractor
        feat_extract_activation="gelu",  # activation used in the feature extractor
        conv_dim=(512, 512, 512, 512, 512, 512, 512),  # channels of each feature-extractor conv layer
        conv_stride=(5, 2, 2, 2, 2, 2, 2),  # stride of each feature-extractor conv layer
        conv_kernel=(10, 3, 3, 3, 3, 2, 2),  # kernel size of each feature-extractor conv layer
        conv_bias=False,  # whether the conv layers use a bias
        num_conv_pos_embeddings=128,  # number of convolutional positional embeddings
        num_conv_pos_embedding_groups=16,  # number of groups of the convolutional positional embeddings
        apply_spec_augment=True,  # whether to apply SpecAugment
        mask_time_prob=0.05,  # masking probability along the time axis
        mask_time_length=10,  # length of a time mask span
        mask_time_min_masks=2,  # minimum number of time mask spans
        mask_feature_prob=0.0,  # masking probability along the feature axis
        mask_feature_length=10,  # length of a feature mask span
        mask_feature_min_masks=0,  # minimum number of feature mask spans
        num_codevectors_per_group=320,  # number of codevectors per group
        num_codevector_groups=2,  # number of codevector groups
        contrastive_logits_temperature=0.1,  # temperature of the contrastive logits
        num_negatives=100,  # number of negative samples
        codevector_dim=256,  # dimension of the codevectors
        proj_codevector_dim=256,  # dimension of the projected codevectors
        diversity_loss_weight=0.1,  # weight of the diversity loss
        ctc_loss_reduction="sum",  # reduction applied to the CTC loss
        ctc_zero_infinity=False,  # CTC `zero_infinity` flag
        use_weighted_layer_sum=False,  # whether to use a weighted sum of layer outputs
        classifier_proj_size=256,  # size of the classifier projection
        tdnn_dim=(512, 512, 512, 512, 1500),  # channels of each TDNN layer
        tdnn_kernel=(5, 3, 3, 1, 1),  # kernel size of each TDNN layer
        tdnn_dilation=(1, 2, 3, 1, 1),  # dilation of each TDNN layer
        xvector_output_dim=512,  # dimension of the x-vector output
        pad_token_id=0,  # id of the padding token
        bos_token_id=1,  # id of the beginning-of-sequence token
        eos_token_id=2,  # id of the end-of-sequence token
        add_adapter=False,  # whether to add an adapter on top of the encoder
        adapter_kernel_size=3,  # kernel size of the adapter conv layers
        adapter_stride=2,  # stride of the adapter conv layers
        num_adapter_layers=3,  # number of adapter layers
        output_hidden_size=None,  # output hidden size; defaults to `hidden_size`
        position_embeddings_type="relative",  # type of positional embeddings
        rotary_embedding_base=10000,  # base of the rotary embeddings
        max_source_positions=5000,  # maximum number of source positions
        conv_depthwise_kernel_size=31,  # kernel size of the Conformer depthwise convolution
        conformer_conv_dropout=0.1,  # dropout rate of the Conformer convolution module
        **kwargs,  # remaining keyword arguments, forwarded to `PretrainedConfig`
    ):
        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)

        # Encoder / feature-extractor architecture
        self.hidden_size = hidden_size
        self.feat_extract_norm = feat_extract_norm
        self.feat_extract_activation = feat_extract_activation
        self.conv_dim = list(conv_dim)
        self.conv_stride = list(conv_stride)
        self.conv_kernel = list(conv_kernel)
        self.conv_bias = conv_bias
        self.num_conv_pos_embeddings = num_conv_pos_embeddings
        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
        self.num_feat_extract_layers = len(self.conv_dim)
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.num_attention_heads = num_attention_heads
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.feat_proj_dropout = feat_proj_dropout
        self.final_dropout = final_dropout
        self.layerdrop = layerdrop
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.vocab_size = vocab_size
        self.use_weighted_layer_sum = use_weighted_layer_sum
        self.max_source_positions = max_source_positions
        self.position_embeddings_type = position_embeddings_type
        self.rotary_embedding_base = rotary_embedding_base

        # The three per-layer conv settings must describe the same number of layers.
        if (
            (len(self.conv_stride) != self.num_feat_extract_layers)
            or (len(self.conv_kernel) != self.num_feat_extract_layers)
            or (len(self.conv_dim) != self.num_feat_extract_layers)
        ):
            raise ValueError(
                "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` =="
                " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) ="
                f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`,"
                f" `len(config.conv_kernel) = {len(self.conv_kernel)}`."
            )

        # Conformer-block related
        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
        self.conformer_conv_dropout = conformer_conv_dropout

        # SpecAugment parameters
        self.apply_spec_augment = apply_spec_augment
        self.mask_time_prob = mask_time_prob
        self.mask_time_length = mask_time_length
        self.mask_time_min_masks = mask_time_min_masks
        self.mask_feature_prob = mask_feature_prob
        self.mask_feature_length = mask_feature_length
        self.mask_feature_min_masks = mask_feature_min_masks

        # Pretraining with quantized codevector representations
        self.num_codevectors_per_group = num_codevectors_per_group
        self.num_codevector_groups = num_codevector_groups
        self.contrastive_logits_temperature = contrastive_logits_temperature
        self.feat_quantizer_dropout = feat_quantizer_dropout
        self.num_negatives = num_negatives
        self.codevector_dim = codevector_dim
        self.proj_codevector_dim = proj_codevector_dim
        self.diversity_loss_weight = diversity_loss_weight

        # CTC loss
        self.ctc_loss_reduction = ctc_loss_reduction
        self.ctc_zero_infinity = ctc_zero_infinity

        # Adapter
        self.add_adapter = add_adapter
        self.adapter_kernel_size = adapter_kernel_size
        self.adapter_stride = adapter_stride
        self.num_adapter_layers = num_adapter_layers
        self.output_hidden_size = output_hidden_size or hidden_size

        # Sequence-classification head
        self.classifier_proj_size = classifier_proj_size

        # XVector head
        self.tdnn_dim = list(tdnn_dim)
        self.tdnn_kernel = list(tdnn_kernel)
        self.tdnn_dilation = list(tdnn_dilation)
        self.xvector_output_dim = xvector_output_dim

    @property
    def inputs_to_logits_ratio(self):
        """Ratio between input length and logits length: the product of all `conv_stride` values."""
        return functools.reduce(operator.mul, self.conv_stride, 1)

`.\models\wav2vec2_conformer\convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py`

# 设置文件编码为 UTF-8

# 版权声明，这里声明代码版权属于 The HuggingFace Inc. 团队

# 导入 argparse 模块，用于命令行参数解析
import argparse

# 导入 json 模块，用于处理 JSON 数据
import json

# 导入 os 模块，提供与操作系统交互的功能
import os

# 导入 fairseq 库，用于序列到序列模型训练
import fairseq

# 导入 torch 库，PyTorch 深度学习框架
import torch

# 从 fairseq.data 模块中导入 Dictionary 类，用于管理词汇表
from fairseq.data import Dictionary

# 从 transformers 库中导入 Wav2Vec2 系列相关的类和函数
from transformers import (
    Wav2Vec2ConformerConfig,
    Wav2Vec2ConformerForCTC,
    Wav2Vec2ConformerForPreTraining,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    logging,
)

# Set the logging verbosity to INFO
logging.set_verbosity_info()

# Module-level logger
logger = logging.get_logger(__name__)

# Maps substrings of the original (fairseq) parameter names to the corresponding Hugging Face parameter paths.
# A "*" in a value is a placeholder that the loader replaces with the layer index taken from the fairseq name.
MAPPING = {
    "post_extract_proj": "feature_projection.projection",
    "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
    "self_attn.linear_k": "encoder.layers.*.self_attn.linear_k",
    "self_attn.linear_v": "encoder.layers.*.self_attn.linear_v",
    "self_attn.linear_q": "encoder.layers.*.self_attn.linear_q",
    "self_attn.pos_bias_u": "encoder.layers.*.self_attn.pos_bias_u",
    "self_attn.pos_bias_v": "encoder.layers.*.self_attn.pos_bias_v",
    "self_attn.linear_out": "encoder.layers.*.self_attn.linear_out",
    "self_attn.linear_pos": "encoder.layers.*.self_attn.linear_pos",
    "self_attn.rotary_emb": "encoder.embed_positions",
    "self_attn_layer_norm": "encoder.layers.*.self_attn_layer_norm",
    "conv_module.pointwise_conv1": "encoder.layers.*.conv_module.pointwise_conv1",
    "conv_module.pointwise_conv2": "encoder.layers.*.conv_module.pointwise_conv2",
    "conv_module.depthwise_conv": "encoder.layers.*.conv_module.depthwise_conv",
    "conv_module.batch_norm": "encoder.layers.*.conv_module.batch_norm",
    "conv_module.layer_norm": "encoder.layers.*.conv_module.layer_norm",
    "ffn1.w_1": "encoder.layers.*.ffn1.intermediate_dense",
    "ffn1.w_2": "encoder.layers.*.ffn1.output_dense",
    "ffn1.layer_norm": "encoder.layers.*.ffn1_layer_norm",
    "ffn2.w_1": "encoder.layers.*.ffn2.intermediate_dense",
    "ffn2.w_2": "encoder.layers.*.ffn2.output_dense",
    "ffn2.layer_norm": "encoder.layers.*.ffn2_layer_norm",
    "final_layer_norm": "encoder.layers.*.final_layer_norm",
    "encoder.layer_norm": "encoder.layer_norm",
    "w2v_model.layer_norm": "feature_projection.layer_norm",
    "quantizer.weight_proj": "quantizer.weight_proj",
    "quantizer.vars": "quantizer.codevectors",
    "project_q": "project_q",
    "final_proj": "project_hid",
    "w2v_encoder.proj": "lm_head",
    "mask_emb": "masked_spec_embed",
}

# 定义顶层键列表，用于保存需要转换的顶层模型参数
# This script converts Wav2Vec2Conformer checkpoints by mapping the original parameter names onto the new model's
# parameter names. The keys below are the mapped names that live directly on the top-level model, i.e. that must
# NOT be prefixed with "wav2vec2_conformer." when the weights are loaded.
TOP_LEVEL_KEYS = [
    "lm_head",
    "quantizer.weight_proj",
    "quantizer.codevectors",
    "project_q",
    "project_hid",
]


# 递归设置模型参数的函数，根据指定的键（key）和值（value）设置深层次对象的属性值
def set_recursively(hf_pointer, key, value, full_name, weight_type):
    """
    Assign `value` to the parameter or buffer found by walking `key` from `hf_pointer`.

    Args:
        hf_pointer: Root object the dotted `key` is resolved against (attribute by attribute).
        key: Dotted attribute path, e.g. `"wav2vec2_conformer.encoder.layer_norm"`.
        value: Object with a `.shape` whose data is stored into the target's `.data`.
        full_name: Name of the source weight; only used in messages.
        weight_type: Name of the sub-attribute of the resolved object to write to (e.g. `"weight"`, `"bias"`,
            `"running_mean"`), or `None` to write to the resolved object itself.

    Raises:
        ValueError: if the target's shape differs from `value.shape`.
    """
    for attribute in key.split("."):
        hf_pointer = getattr(hf_pointer, attribute)

    # The tensor that is both shape-checked and written to.
    target = hf_pointer if weight_type is None else getattr(hf_pointer, weight_type)
    # Fully qualified target name for messages. The previous conditional expression bound too loosely and produced
    # an empty name whenever `weight_type` was None.
    hf_name = key if weight_type is None else f"{key}.{weight_type}"

    if target.shape != value.shape:
        raise ValueError(f"Shape of hf {hf_name} is {target.shape}, but should be {value.shape} for {full_name}")

    target.data = value

    logger.info(f"{hf_name} was initialized from {full_name}.")


# 递归加载Fairseq模型权重到Hugging Face模型中的函数
def recursively_load_weights(fairseq_model, hf_model, is_headless):
    """
    Copy every entry of `fairseq_model.state_dict()` into `hf_model`.

    Names containing `"conv_layers"` are handed to `load_conv_layer`. Every other name is matched against all
    entries of `MAPPING` and written with `set_recursively` for each entry that matches. Names that match nothing
    are collected and reported in a single warning at the end.

    Args:
        fairseq_model: Source model; only its `state_dict()` is read.
        hf_model: Target model; must expose `wav2vec2_conformer.feature_extractor` and `config.feat_extract_norm`.
        is_headless: Accepted for interface compatibility; not used by this function.
    """
    # Ordered (substring, weight_type) rules: the first substring found in the weight name decides the type.
    # The order matters, e.g. "weight_g" must be tested before "weight".
    weight_type_rules = (
        ("pos_bias_u", None),
        ("pos_bias_v", None),
        ("weight_g", "weight_g"),
        ("weight_v", "weight_v"),
        ("bias", "bias"),
        ("weight", "weight"),
        ("running_mean", "running_mean"),
        ("inv_freq", "inv_freq"),
        ("running_var", "running_var"),
        ("num_batches_tracked", "num_batches_tracked"),
    )

    leftover = []
    source_state = fairseq_model.state_dict()
    conv_feature_extractor = hf_model.wav2vec2_conformer.feature_extractor

    for name, value in source_state.items():
        # Feature-extractor convolutions have their own loader, which records skipped weights itself.
        if "conv_layers" in name:
            load_conv_layer(
                name,
                value,
                conv_feature_extractor,
                leftover,
                hf_model.config.feat_extract_norm == "group",
            )
            continue

        matched = False
        first_segment = name.split(".")[0]
        for key, target in MAPPING.items():
            if not (key in name or key.split("w2v_model.")[-1] == first_segment):
                continue
            matched = True

            # Everything except the top-level keys lives under the base model.
            if target not in TOP_LEVEL_KEYS:
                target = "wav2vec2_conformer." + target
            # Fill the layer placeholder with the index that precedes `key` in the source name.
            if "*" in target:
                target = target.replace("*", name.split(key)[0].split(".")[-2])

            weight_type = next((kind for marker, kind in weight_type_rules if marker in name), None)
            set_recursively(hf_model, target, value, name, weight_type)

        if not matched:
            leftover.append(name)

    logger.warning(f"Unused weights: {leftover}")
# 定义函数 load_conv_layer，加载卷积层的权重或偏置
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
    """
    Load one feature-extractor weight, identified by its original name, into `feature_extractor`.

    The part of `full_name` after `"conv_layers."` is expected to start with `<layer_id>.<type_id>.`:
    `type_id == 0` addresses the layer's `conv` module, `type_id == 2` its `layer_norm` module.

    Args:
        full_name: Original weight name.
        value: Object with a `.shape`; stored into the target parameter's `.data`.
        feature_extractor: Object exposing `conv_layers[layer_id].conv` / `.layer_norm`.
        unused_weights: List that `full_name` is appended to when the weight is not loaded.
        use_group_norm: When true, norm weights are only loaded for layer 0.

    Raises:
        ValueError: if `value.shape` differs from the shape of the target parameter.
    """
    name = full_name.split("conv_layers.")[-1]
    items = name.split(".")
    layer_id = int(items[0])
    type_id = int(items[1])

    if type_id == 0:
        module_attr = "conv"
    elif type_id == 2 and (not use_group_norm or layer_id == 0):
        module_attr = "layer_norm"
    else:
        unused_weights.append(full_name)
        return

    if "bias" in name:
        param_name = "bias"
    elif "weight" in name:
        param_name = "weight"
    else:
        # Neither a bias nor a weight: nothing to load, and (as before) it is not reported as unused.
        return

    param = getattr(getattr(feature_extractor.conv_layers[layer_id], module_attr), param_name)
    if value.shape != param.data.shape:
        raise ValueError(f"{full_name} has size {value.shape}, but {param.data.shape} was found.")
    param.data = value

    if module_attr == "conv":
        logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
    else:
        # Name the parameter that was actually loaded; the bias used to be logged as "weight".
        logger.info(f"Feat extract layer norm {param_name} of layer {layer_id} was initialized from {full_name}.")


# 使用 torch.no_grad 装饰器定义函数 convert_wav2vec2_conformer_checkpoint，不计算梯度
@torch.no_grad()
def convert_wav2vec2_conformer_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.

    Args:
        checkpoint_path: Path to the fairseq checkpoint. If it contains `"rope"`, rotary position embeddings are
            selected in the configuration.
        pytorch_dump_folder_path: Output folder for the converted model (and processor, when one is built).
        config_path: Optional path to an existing configuration; the default configuration is used otherwise.
        dict_path: Path to the fairseq dictionary of a fine-tuned model. Required when `is_finetuned` is true.
        is_finetuned: Whether the checkpoint is a fine-tuned (CTC) model rather than a pretraining checkpoint.

    Raises:
        ValueError: if `is_finetuned` is true and `dict_path` is `None`.
    """
    # The fine-tuned branch derives the fairseq `data` override from `dict_path`; fail early with a clear message
    # instead of an AttributeError on `None` after the model has already been built.
    if is_finetuned and dict_path is None:
        raise ValueError("`dict_path` has to be provided to convert a fine-tuned checkpoint.")

    if config_path is not None:
        config = Wav2Vec2ConformerConfig.from_pretrained(config_path, hidden_act="swish")
    else:
        config = Wav2Vec2ConformerConfig()

    if "rope" in checkpoint_path:
        config.position_embeddings_type = "rotary"

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # important change bos & pad token id since CTC symbol is <pad> and
            # not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")

            # Only an existing non-directory is an error; a missing folder is created just below.
            if os.path.exists(pytorch_dump_folder_path) and not os.path.isdir(pytorch_dump_folder_path):
                logger.error(f"--pytorch_dump_folder_path ({pytorch_dump_folder_path}) should be a directory")
                return

            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            vocab_dict = target_dict.indices

            # fairseq has the <pad> and <s> switched
            vocab_dict["<pad>"] = 0
            vocab_dict["<s>"] = 1

            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(vocab_dict, vocab_handle)

            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )

            # The attention mask is only returned for "layer"-normalized feature extractors.
            return_attention_mask = config.feat_extract_norm == "layer"

            feature_extractor = Wav2Vec2FeatureExtractor(
                feature_size=1,
                sampling_rate=16000,
                padding_value=0,
                do_normalize=True,
                return_attention_mask=return_attention_mask,
            )

            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_wav2vec = Wav2Vec2ConformerForCTC(config)
    else:
        hf_wav2vec = Wav2Vec2ConformerForPreTraining(config)

    if is_finetuned:
        # Point fairseq's `data` argument at the folder that contains the dictionary.
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
        )
    else:
        task_arg = argparse.Namespace(task="audio_pretraining")
        task = fairseq.tasks.setup_task(task_arg)

        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path], task=task)

    model = model[0].eval()

    recursively_load_weights(model, hf_wav2vec, not is_finetuned)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the conversion

    parser = argparse.ArgumentParser()
    # Argument parser for the conversion script

    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    # Output folder of the converted model

    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
    # Path to the fairseq checkpoint to convert

    parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
    # Path to the dictionary of a fine-tuned model

    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    # Path to the hf config.json of the model to convert

    parser.add_argument(
        "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
    )
    # Flag: pass it when the checkpoint is NOT fine-tuned (it is negated into `is_finetuned` below)

    args = parser.parse_args()
    # Parsed command-line arguments

    convert_wav2vec2_conformer_checkpoint(
        args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
    )
    # Run the conversion with the parsed arguments

`.\models\wav2vec2_conformer\modeling_wav2vec2_conformer.py`

# 设置编码格式为 UTF-8
# 版权声明，包括 Fairseq 作者和 HuggingFace Inc. 团队
#
# 根据 Apache 许可证版本 2.0 使用本文件，详见许可证
#
# 如果不是依照许可证的规定使用本文件，不得使用
#
# 详细许可证信息请访问 http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，本软件是基于“按原样”提供，无任何明示或暗示的保证或条件
# 包括但不限于适销性或特定用途适用性的保证或条件。详见许可证。
""" PyTorch Wav2Vec2-Conformer model."""

import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

# 导入自定义的激活函数映射 ACT2FN
from ...activations import ACT2FN
# 导入 DeepSpeed 集成检测功能
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
# 导入模型输出类
from ...modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
    Wav2Vec2BaseModelOutput,
    XVectorOutput,
)
# 导入预训练模型基类
from ...modeling_utils import PreTrainedModel
# 导入工具函数
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_peft_available,
    logging,
    replace_return_docstrings,
)
# 导入 Wav2Vec2Conformer 的配置类
from .configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig

# Module-level logger
logger = logging.get_logger(__name__)

# Module-level constant; its use is not visible in this part of the file
_HIDDEN_STATES_START_POSITION = 2

# Configuration name used in the docs
_CONFIG_FOR_DOC = "Wav2Vec2ConformerConfig"

# Checkpoint name used in the docs
_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-conformer-rope-large-960h-ft"

# Expected output shape used in the docs
_EXPECTED_OUTPUT_SHAPE = [1, 292, 1024]

# Expected CTC (Connectionist Temporal Classification) output and loss used in the docs
_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
_CTC_EXPECTED_LOSS = 64.21

# List of pretrained model identifiers
WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/wav2vec2-conformer-rel-pos-large",
    # See all Wav2Vec2Conformer models at https://huggingface.co/models?filter=wav2vec2-conformer
]

# Modeled on transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput
@dataclass
class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput):
    """
    Output type of [`Wav2Vec2ConformerForPreTraining`], with potential hidden states and attentions.

    Args:
        loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
            paper](https://arxiv.org/pdf/2006.11477.pdf).
        projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
            projected quantized states.
        projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
            target vectors for contrastive loss.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
            The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
        diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
            The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
    """

    # Total loss: contrastive loss plus diversity loss
    loss: Optional[torch.FloatTensor] = None
    # Hidden states projected to config.proj_codevector_dim
    projected_states: torch.FloatTensor = None
    # Quantized feature vectors projected to config.proj_codevector_dim (positive targets of the contrastive loss)
    projected_quantized_states: torch.FloatTensor = None
    # Per-layer hidden states plus the initial embedding output
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Per-layer attention weights
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    # Contrastive loss (L_m)
    contrastive_loss: Optional[torch.FloatTensor] = None
    # Diversity loss (L_d)
    diversity_loss: Optional[torch.FloatTensor] = None


# Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.

    Returns:
        A boolean `np.ndarray` of shape `(batch_size, sequence_length)` that is `True` at masked positions.

    Raises:
        ValueError: if `mask_length` is smaller than 1 or larger than the sequence length.
    """
    batch_size, sequence_length = shape  # batch size and length of the axis to mask

    if mask_length < 1:  # a span must cover at least one position
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:  # a span cannot be longer than the sequence
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        # number of spans to mask, rounded probabilistically and bounded below by `min_masks`
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # make sure num_masked span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # per-example input lengths
    input_lengths = (
        attention_mask.sum(-1).detach().tolist()  # lengths taken from the attention mask when one is given
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]  # otherwise every example has the full length
    )

    # all-False boolean mask that the spans are written into
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []  # per-example span start indices

    max_num_masked_span = compute_num_masked_span(sequence_length)  # number of spans for a full-length example
    # nothing to mask: return the all-False mask
    if max_num_masked_span == 0:
        return spec_aug_mask

    # iterate over the per-example input lengths
    for input_length in input_lengths:
        # number of spans for this example
        num_masked_span = compute_num_masked_span(input_length)

        # randomly pick the span start indices (without replacement)
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # pick first sampled index that will serve as a dummy index to pad vector
        # to ensure same dimension for all batches due to probabilistic rounding
        if len(spec_aug_mask_idx) == 0:
            # this case can only happen if `input_length` is strictly smaller than
            # `sequence_length` in which case the last token has to be a padding
            # token which we can use as a dummy mask id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        # pad the sampled indices with the dummy index up to `max_num_masked_span` entries
        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    # convert the list to a NumPy array
    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # add offset to the starting indexes so that indexes now create a span
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # ensure that we cannot have indices larger than sequence_length
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # scatter indices to mask
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    # return the finished mask
    return spec_aug_mask
# Copied from transformers.models.wav2vec2.modeling_wav2vec2._sample_negative_indices
def _sample_negative_indices(
    features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
):
    """
    Sample `num_negatives` vectors from feature vectors.
    """
    # 获取批量大小和序列长度
    batch_size, sequence_length = features_shape

    # 生成正向向量本身的索引，并将它们重复 `num_negatives` 次
    sequence_length_range = np.arange(sequence_length)

    # 从同一句话中获取 `num_negatives` 个随机向量索引
    sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)

    # 如果给定了 mask_time_indices，则将其转换为布尔型数组；否则创建一个全部为 True 的数组
    mask_time_indices = (
        mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
    )

    # 遍历每个批次中的索引
    for batch_idx in range(batch_size):
        # 计算非零元素的数量
        high = mask_time_indices[batch_idx].sum() - 1
        # 获取映射后的屏蔽索引
        mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]

        # 广播以重复索引
        feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
        # 从 0 到 high 之间随机选择向量索引
        sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
        # 避免采样相同的正向向量，同时保持分布均匀
        sampled_indices[sampled_indices >= feature_indices] += 1

        # 将采样的负向索引重新映射到实际索引
        sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]

        # 对批次大小进行修正
        sampled_negative_indices[batch_idx] += batch_idx * sequence_length

    return sampled_negative_indices


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
class Wav2Vec2ConformerNoLayerNormConvLayer(nn.Module):
    """Feature-extractor block: a 1D convolution followed by an activation, with no normalization."""

    def __init__(self, config, layer_id=0):
        super().__init__()
        # Layers after the first read the previous layer's output channels; otherwise one input channel.
        if layer_id > 0:
            self.in_conv_dim = config.conv_dim[layer_id - 1]
        else:
            self.in_conv_dim = 1
        self.out_conv_dim = config.conv_dim[layer_id]

        # Kernel size and stride are configured per layer.
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        """Apply the convolution and then the activation to `hidden_states`."""
        return self.activation(self.conv(hidden_states))


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
class Wav2Vec2ConformerLayerNormConvLayer(nn.Module):
    """Feature-extractor block: 1D convolution, LayerNorm over the channel axis, then an activation."""

    def __init__(self, config, layer_id=0):
        super().__init__()

        # Layers after the first read the previous layer's output channels; otherwise one input channel.
        if layer_id > 0:
            self.in_conv_dim = config.conv_dim[layer_id - 1]
        else:
            self.in_conv_dim = 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        # Normalizes vectors of size `out_conv_dim`, with learnable scale and shift.
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        """Run conv -> LayerNorm (over channels) -> activation and return the result."""
        conv_out = self.conv(hidden_states)

        # LayerNorm acts on the last axis, so move channels last for the norm and back afterwards.
        channels_last = conv_out.transpose(-2, -1)
        normalized = self.layer_norm(channels_last).transpose(-2, -1)

        return self.activation(normalized)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
class Wav2Vec2ConformerGroupNormConvLayer(nn.Module):
    """Feature-extractor block: 1D convolution, GroupNorm with one group per channel, then an activation."""

    def __init__(self, config, layer_id=0):
        super().__init__()
        # Layers after the first read the previous layer's output channels; otherwise one input channel.
        if layer_id > 0:
            self.in_conv_dim = config.conv_dim[layer_id - 1]
        else:
            self.in_conv_dim = 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

        # num_groups == num_channels, so every channel is normalized on its own; affine params are learned.
        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)

    def forward(self, hidden_states):
        """Run conv -> GroupNorm -> activation and return the result."""
        normalized = self.layer_norm(self.conv(hidden_states))
        return self.activation(normalized)


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->Wav2Vec2Conformer
class Wav2Vec2ConformerPositionalConvEmbedding(nn.Module):
    """Convolutional positional embedding: a weight-normalized grouped 1D convolution over the time axis."""

    def __init__(self, config):
        super().__init__()
        # Grouped convolution that keeps the channel count; padding is half the kernel size.
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
        )

        # Prefer the parametrization-based weight norm when this torch version provides it.
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        if is_deepspeed_zero3_enabled():
            import deepspeed

            # Under ZeRO-3 the weight is gathered while weight norm is applied to it.
            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            # The two weight-norm APIs expose the magnitude/direction tensors under different names:
            # the parametrization API has no `weight_g` / `weight_v` attributes, so reading them
            # unconditionally raises AttributeError there.
            if hasattr(self.conv, "parametrizations"):
                weight_g = self.conv.parametrizations.weight.original0
                weight_v = self.conv.parametrizations.weight.original1
            else:
                weight_g = self.conv.weight_g
                weight_v = self.conv.weight_v
            deepspeed.zero.register_external_parameter(self, weight_v)
            deepspeed.zero.register_external_parameter(self, weight_g)
        else:
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        # Trims the extra trailing frame that an even kernel size produces with the padding above.
        self.padding = Wav2Vec2ConformerSamePadLayer(config.num_conv_pos_embeddings)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        """Return the positional embedding computed from `hidden_states`, in the input's axis order.

        Axes 1 and 2 are swapped before the convolution and swapped back afterwards
        (presumably the input is `(batch, time, hidden)` -- confirm against the caller).
        """
        hidden_states = hidden_states.transpose(1, 2)

        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states


class Wav2Vec2ConformerRotaryPositionalEmbedding(nn.Module):
    """Rotary positional embedding
    Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://arxiv.org/pdf/2104.09864.pdf
    """

    def __init__(self, config):
        super().__init__()
        head_dim = config.hidden_size // config.num_attention_heads

        # One inverse frequency per pair of head dimensions: base ** (-2i / head_dim).
        exponents = torch.arange(0, head_dim, 2, dtype=torch.int64).float() / head_dim
        inv_freq = 1.0 / (config.rotary_embedding_base ** exponents)
        self.register_buffer("inv_freq", inv_freq)

        # Cache of the last computed table, keyed only by sequence length.
        self.cached_sequence_length = None
        self.cached_rotary_positional_embedding = None

    def forward(self, hidden_states):
        """Return the stacked cos/sin table for the sequence length `hidden_states.shape[1]`.

        The result has shape `(2, sequence_length, 1, 1, head_dim)`: index 0 holds the cosines and
        index 1 the sines. It is cast to the dtype/device of `hidden_states` and cached; a later call
        with the same sequence length returns the cached tensor as is.
        """
        sequence_length = hidden_states.shape[1]

        cache_is_valid = (
            sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None
        )
        if cache_is_valid:
            return self.cached_rotary_positional_embedding

        self.cached_sequence_length = sequence_length

        # Angles are computed in the dtype of `inv_freq` and only cast to the input dtype at the end.
        positions = torch.arange(sequence_length).type_as(self.inv_freq)
        angles = positions[:, None] * self.inv_freq[None, :]
        # Duplicate the angles so the table spans the full head dimension.
        angles = torch.cat((angles, angles), dim=-1)

        table = torch.stack([angles.cos(), angles.sin()])[:, :, None, None, :]
        self.cached_rotary_positional_embedding = table.type_as(hidden_states)
        return self.cached_rotary_positional_embedding
class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module):
    """Relative positional encoding module."""

    def __init__(self, config):
        super().__init__()
        self.max_len = config.max_source_positions
        self.d_model = config.hidden_size
        self.pe = None
        # Build the table once for `max_len` positions, using a dummy (1, max_len) tensor.
        self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))

    def extend_pe(self, x):
        """Make sure `self.pe` covers `x.size(1)` positions and matches the dtype/device of `x`.

        The table has shape `(1, 2 * length - 1, d_model)`: relative positions run from
        `length - 1` down to `-(length - 1)`, with position 0 in the middle.
        """
        seq_len = x.size(1)

        # An existing table that is long enough is reused; at most its dtype/device is adjusted.
        if self.pe is not None and self.pe.size(1) >= seq_len * 2 - 1:
            if self.pe.dtype != x.dtype or self.pe.device != x.device:
                self.pe = self.pe.to(dtype=x.dtype, device=x.device)
            return

        # Sinusoidal angles: position * 10000 ** (-2i / d_model).
        position = torch.arange(0, seq_len, dtype=torch.int64).float().unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.d_model)
        )
        angles = position * div_term

        # Even columns hold sines, odd columns cosines; the negative table uses the negated angles.
        pe_positive = torch.zeros(seq_len, self.d_model)
        pe_positive[:, 0::2] = torch.sin(angles)
        pe_positive[:, 1::2] = torch.cos(angles)
        pe_negative = torch.zeros(seq_len, self.d_model)
        pe_negative[:, 0::2] = torch.sin(-angles)
        pe_negative[:, 1::2] = torch.cos(-angles)

        # Reverse the positive part and append the negative part without its duplicate position 0;
        # this layout supports the shifting trick of https://arxiv.org/abs/1901.02860
        table = torch.cat([torch.flip(pe_positive, [0]).unsqueeze(0), pe_negative[1:].unsqueeze(0)], dim=1)
        self.pe = table.to(device=x.device, dtype=x.dtype)

    def forward(self, hidden_states: torch.Tensor):
        """Return the slice of the table centred on position 0 with `2 * hidden_states.size(1) - 1` entries."""
        self.extend_pe(hidden_states)
        center = self.pe.size(1) // 2
        seq_len = hidden_states.size(1)
        return self.pe[:, center - seq_len + 1 : center + seq_len]


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->Wav2Vec2Conformer
class Wav2Vec2ConformerSamePadLayer(nn.Module):
    """Drop the last frame of a 3D input when the positional-conv kernel size is even."""

    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        # One trailing element is removed for an even kernel size, none for an odd one.
        self.num_pad_remove = int(num_conv_pos_embeddings % 2 == 0)

    def forward(self, hidden_states):
        """Return `hidden_states` with `num_pad_remove` trailing entries cut from axis 2 (unchanged if 0)."""
        if self.num_pad_remove <= 0:
            return hidden_states
        return hidden_states[:, :, : -self.num_pad_remove]
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Wav2Vec2Conformer
class Wav2Vec2ConformerFeatureEncoder(nn.Module):
    """Construct the features from raw audio waveform"""

    def __init__(self, config):
        super().__init__()

        norm_type = config.feat_extract_norm
        if norm_type == "group":
            # Group norm only on the first layer; all later layers have no normalization.
            conv_layers = [Wav2Vec2ConformerGroupNormConvLayer(config, layer_id=0)]
            conv_layers.extend(
                Wav2Vec2ConformerNoLayerNormConvLayer(config, layer_id=layer_id)
                for layer_id in range(1, config.num_feat_extract_layers)
            )
        elif norm_type == "layer":
            # Layer norm on every layer.
            conv_layers = [
                Wav2Vec2ConformerLayerNormConvLayer(config, layer_id=layer_id)
                for layer_id in range(config.num_feat_extract_layers)
            ]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` 是 {config.feat_extract_norm}，但必须是 ['group', 'layer'] 中的一种"
            )

        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        # Cleared by `_freeze_parameters`; gates the grad handling in `forward`.
        self._requires_grad = True

    def _freeze_parameters(self):
        """Disable gradients for every parameter of the encoder and record that it is frozen."""
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        """Insert a channel axis at position 1 and run the input through all conv layers in order."""
        hidden_states = input_values[:, None]

        # Only a trainable encoder in training mode forces gradients on its input.
        trainable = self._requires_grad and self.training
        if trainable:
            hidden_states.requires_grad = True

        use_checkpointing = trainable and self.gradient_checkpointing
        for conv_layer in self.conv_layers:
            if use_checkpointing:
                # NOTE(review): `_gradient_checkpointing_func` is not defined in this class;
                # presumably it is attached from outside when checkpointing is enabled.
                hidden_states = self._gradient_checkpointing_func(
                    conv_layer.__call__,
                    hidden_states,
                )
            else:
                hidden_states = conv_layer(hidden_states)

        return hidden_states


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->Wav2Vec2Conformer
class Wav2Vec2ConformerFeatureProjection(nn.Module):
    """Layer-normalize features of size `config.conv_dim[-1]` and project them to `config.hidden_size`."""

    def __init__(self, config):
        super().__init__()
        feature_dim = config.conv_dim[-1]
        self.layer_norm = nn.LayerNorm(feature_dim, eps=config.layer_norm_eps)
        self.projection = nn.Linear(feature_dim, config.hidden_size)
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        """Return `(projected, normalized)`.

        `normalized` is the layer-normed input; `projected` is its linear projection after dropout.
        """
        norm_hidden_states = self.layer_norm(hidden_states)
        projected = self.dropout(self.projection(norm_hidden_states))
        return projected, norm_hidden_states


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Wav2Vec2Conformer
class Wav2Vec2ConformerFeedForward(nn.Module):
    """Two-layer feed-forward network: hidden -> intermediate -> hidden, with dropout after each layer."""

    def __init__(self, config):
        super().__init__()
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)
        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)

        # `hidden_act` is either the name of an activation in ACT2FN or the callable itself.
        hidden_act = config.hidden_act
        self.intermediate_act_fn = ACT2FN[hidden_act] if isinstance(hidden_act, str) else hidden_act

        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        """Apply dense -> activation -> dropout -> dense -> dropout and return the result."""
        expanded = self.intermediate_act_fn(self.intermediate_dense(hidden_states))
        expanded = self.intermediate_dropout(expanded)
        return self.output_dropout(self.output_dense(expanded))
# Convolution block used inside each Conformer encoder layer
class Wav2Vec2ConformerConvolutionModule(nn.Module):
    """Convolution block used in the conformer block"""

    def __init__(self, config):
        super().__init__()
        kernel_size = config.conv_depthwise_kernel_size
        # With padding (kernel - 1) // 2 only an odd kernel preserves the sequence length.
        if kernel_size % 2 == 0:
            raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
        hidden_size = config.hidden_size

        self.layer_norm = nn.LayerNorm(hidden_size)

        # Pointwise expansion to twice the channels; the GLU halves them again.
        self.pointwise_conv1 = nn.Conv1d(
            hidden_size,
            2 * hidden_size,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        self.glu = nn.GLU(dim=1)

        # Depthwise convolution: one group per channel.
        self.depthwise_conv = nn.Conv1d(
            hidden_size,
            hidden_size,
            kernel_size,
            stride=1,
            padding=(kernel_size - 1) // 2,
            groups=hidden_size,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm1d(hidden_size)
        self.activation = ACT2FN[config.hidden_act]

        # Pointwise convolution that keeps the channel count.
        self.pointwise_conv2 = nn.Conv1d(
            hidden_size,
            hidden_size,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        self.dropout = nn.Dropout(config.conformer_conv_dropout)

    def forward(self, hidden_states):
        """Apply the convolution block and return a tensor in the same axis order as the input.

        Axes 1 and 2 are swapped around the convolutions because `nn.Conv1d` expects channels on
        axis 1 (presumably the input is `(batch, time, hidden)` -- confirm against the caller).
        """
        normalized = self.layer_norm(hidden_states)

        # Channels to axis 1 for the Conv1d / GLU(dim=1) / BatchNorm1d stack.
        features = normalized.transpose(1, 2)

        # Gated pointwise expansion.
        features = self.glu(self.pointwise_conv1(features))

        # Depthwise conv -> batch norm -> activation.
        features = self.activation(self.batch_norm(self.depthwise_conv(features)))

        # Pointwise conv and dropout.
        features = self.dropout(self.pointwise_conv2(features))

        # Back to the input's axis order.
        return features.transpose(1, 2)


class Wav2Vec2ConformerSelfAttention(nn.Module):
    """Multi-head self-attention for the Conformer encoder.

    Depending on `config.position_embeddings_type` the attention is computed
    - with rotary position embeddings applied to the query/key input (`"rotary"`),
    - with Transformer-XL style relative position scores (`"relative"`), or
    - as plain scaled dot-product attention (any other value).
    """

    def __init__(self, config):
        super().__init__()

        self.head_size = config.hidden_size // config.num_attention_heads
        self.num_heads = config.num_attention_heads
        self.position_embeddings_type = config.position_embeddings_type

        self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
        self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
        self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
        self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)

        self.dropout = nn.Dropout(p=config.attention_dropout)

        if self.position_embeddings_type == "relative":
            # Projection of the relative position embeddings.
            self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
            # Learnable biases used for matrices c and d,
            # as described in https://arxiv.org/abs/1901.02860 Section 3.3
            self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
            self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        relative_position_embeddings: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute self-attention over `hidden_states`.

        Args:
            hidden_states: 3D tensor `(batch, time, hidden_size)`.
            attention_mask: Optional tensor that is *added* to the attention scores before the softmax.
            relative_position_embeddings: Required for the `"rotary"` and `"relative"` modes, ignored otherwise.
            output_attentions: Accepted for interface compatibility; the probabilities are always returned.

        Returns:
            `(attended, probs)`: the output of shape `(batch, time, hidden_size)` and the attention
            probabilities (after dropout) of shape `(batch, heads, time, time)`.

        Raises:
            ValueError: If the position mode needs `relative_position_embeddings` and it is `None`.
        """
        # Unpacking also enforces that the input is 3D.
        batch_size, _, _ = hidden_states.size()

        # Queries/keys and values start from the same input; only queries/keys may get rotary embeddings.
        query_key_states = hidden_states
        value_states = hidden_states

        if self.position_embeddings_type == "rotary":
            if relative_position_embeddings is None:
                raise ValueError(
                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
                )
            query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)

        # Project and split into heads: (batch, time, heads, head_size).
        query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
        key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
        value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)

        # => (batch, heads, time, head_size)
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        if self.position_embeddings_type == "relative":
            if relative_position_embeddings is None:
                raise ValueError(
                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type =="
                    " 'relative'"
                )
            # Relative-position scores as in Transformer-XL: https://arxiv.org/abs/1901.02860
            scores = self._apply_relative_embeddings(
                query=query, key=key, relative_position_embeddings=relative_position_embeddings
            )
        else:
            scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)

        # Additive mask.
        if attention_mask is not None:
            scores = scores + attention_mask

        # => (batch, heads, time1, time2)
        probs = torch.softmax(scores, dim=-1)
        probs = self.dropout(probs)

        # => (batch, heads, time1, head_size)
        hidden_states = torch.matmul(probs, value)

        # Merge the heads back: (batch, time1, hidden_size), then the output projection.
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
        hidden_states = self.linear_out(hidden_states)

        return hidden_states, probs

    def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
        """Rotate `hidden_states` (batch, time, hidden_size) with the given cos/sin table.

        `relative_position_embeddings[0]` is read as the cosine table and `[1]` as the sine table; both
        are cut to the sequence length and broadcast against a `(time, batch, heads, head_size)` view.
        """
        batch_size, sequence_length, _ = hidden_states.size()
        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)

        cos = relative_position_embeddings[0, :sequence_length, ...]
        sin = relative_position_embeddings[1, :sequence_length, ...]

        # Time first, so the leading axis lines up with the tables.
        hidden_states = hidden_states.transpose(0, 1)
        # rotate_half: (first_half, second_half) -> (-second_half, first_half)
        rotated_states_begin = hidden_states[..., : self.head_size // 2]
        rotated_states_end = hidden_states[..., self.head_size // 2 :]
        rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
        hidden_states = (hidden_states * cos) + (rotated_states * sin)
        hidden_states = hidden_states.transpose(0, 1)

        # Back to (batch, time, heads * head_size).
        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)

        return hidden_states

    def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
        """Return attention scores `(batch, heads, time1, time2)` with relative position terms.

        `query` and `key` are `(batch, heads, time, head_size)`. `relative_position_embeddings` is projected
        with `linear_pos` and split into heads; its axis 1 is expected to hold `2 * time - 1` entries.
        """
        # 1. Project the positional embeddings
        # => (batch, heads, head_size, 2*time1-1)
        proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
        proj_relative_position_embeddings = proj_relative_position_embeddings.view(
            relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
        )
        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)

        # 2. Add the per-head biases to the query (done in (batch, time, heads, head_size) layout so the
        #    (heads, head_size) biases broadcast)
        # => (batch, heads, time1, head_size)
        query = query.transpose(1, 2)
        q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
        q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)

        # 3. Content scores: matrices a and c of https://arxiv.org/abs/1901.02860 Section 3.3
        # => (batch, heads, time1, time2)
        scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))

        # 4. Position scores: matrices b and d
        # => (batch, heads, time1, 2*time1-1)
        scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)

        # 5. Shift matrices b and d: pad one zero column, reinterpret the buffer with the last two sizes
        #    swapped, drop the first row and view back, which moves row i by i steps. Then keep the
        #    first `size // 2 + 1` columns.
        zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
        scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
        scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
        scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
        scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
        scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]

        # 6. Sum both parts and scale
        # => (batch, heads, time1, time2)
        scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)

        return scores
class Wav2Vec2ConformerEncoderLayer(nn.Module):
    """Conformer block based on https://arxiv.org/abs/2005.08100."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size

        # First feed-forward sub-layer
        self.ffn1_layer_norm = nn.LayerNorm(hidden_size)
        self.ffn1 = Wav2Vec2ConformerFeedForward(config)

        # Self-attention sub-layer; its output dropout uses the attention dropout rate
        self.self_attn_layer_norm = nn.LayerNorm(hidden_size)
        self.self_attn_dropout = nn.Dropout(config.attention_dropout)
        self.self_attn = Wav2Vec2ConformerSelfAttention(config)

        # Convolution sub-layer
        self.conv_module = Wav2Vec2ConformerConvolutionModule(config)

        # Second feed-forward sub-layer and the closing layer norm
        self.ffn2_layer_norm = nn.LayerNorm(hidden_size)
        self.ffn2 = Wav2Vec2ConformerFeedForward(config)
        self.final_layer_norm = nn.LayerNorm(hidden_size)

    def forward(
        self,
        hidden_states,
        attention_mask: Optional[torch.Tensor] = None,
        relative_position_embeddings: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """Run one Conformer block and return `(hidden_states, attn_weights)`.

        The sub-layers are applied in the order feed-forward, self-attention, convolution,
        feed-forward, each with a residual connection; both feed-forward outputs are scaled by 0.5
        before being added. A final layer norm is applied at the end.
        """
        # 1. Feed-forward 1, half-weighted residual
        ffn1_out = self.ffn1(self.ffn1_layer_norm(hidden_states))
        hidden_states = ffn1_out * 0.5 + hidden_states

        # 2. Self-attention with dropout on its output
        attn_out, attn_weights = self.self_attn(
            hidden_states=self.self_attn_layer_norm(hidden_states),
            attention_mask=attention_mask,
            relative_position_embeddings=relative_position_embeddings,
            output_attentions=output_attentions,
        )
        hidden_states = self.self_attn_dropout(attn_out) + hidden_states

        # 3. Convolution module (it applies its own layer norm internally)
        hidden_states = hidden_states + self.conv_module(hidden_states)

        # 4. Feed-forward 2, half-weighted residual, then the final layer norm
        ffn2_out = self.ffn2(self.ffn2_layer_norm(hidden_states))
        hidden_states = self.final_layer_norm(ffn2_out * 0.5 + hidden_states)

        return hidden_states, attn_weights


注释：

# 定义一个名为Wav2Vec2ConformerEncoderLayer的类，用于实现Conformer结构，参考自https://arxiv.org/abs/2005.08100
class Wav2Vec2ConformerEncoderLayer(nn.Module):
    """Conformer block based on https://arxiv.org/abs/2005.08100."""

    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size  # 从配置中获取隐藏大小
        dropout = config.attention_dropout  # 从配置中获取注意力丢弃率

        # Feed-forward 1
        self.ffn1_layer_norm = nn.LayerNorm(embed_dim)  # Layer normalization层
        self.ffn1 = Wav2Vec2ConformerFeedForward(config)  # 第一个前向传播网络

        # Self-Attention
        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)  # Layer normalization层
        self.self_attn_dropout = nn.Dropout(dropout)  # Dropout层
        self.self_attn = Wav2Vec2ConformerSelfAttention(config)  # 自注意力层

        # Conformer Convolution
        self.conv_module = Wav2Vec2ConformerConvolutionModule(config)  # Conformer卷积模块

        # Feed-forward 2
        self.ffn2_layer_norm = nn.LayerNorm(embed_dim)  # Layer normalization层
        self.ffn2 = Wav2Vec2ConformerFeedForward(config)  # 第二个前向传播网络
        self.final_layer_norm = nn.LayerNorm(embed_dim)  # 最终的Layer normalization层

    def forward(
        self,
        hidden_states,
        attention_mask: Optional[torch.Tensor] = None,
        relative_position_embeddings: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        hidden_states = hidden_states  # 输入隐藏状态

        # 1. Feed-Forward 1 layer
        residual = hidden_states  # 残差连接
        hidden_states = self.ffn1_layer_norm(hidden_states)  # Layer normalization
        hidden_states = self.ffn1(hidden_states)  # 第一个前向传播网络
        hidden_states = hidden_states * 0.5 + residual  # 残差连接加权和
        residual = hidden_states

        # 2. Self-Attention layer
        hidden_states = self.self_attn_layer_norm(hidden_states)  # Layer normalization
        hidden_states, attn_weights = self.self_attn(  # 自注意力计算
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            relative_position_embeddings=relative_position_embeddings,
            output_attentions=output_attentions,
        )
        hidden_states = self.self_attn_dropout(hidden_states)  # Dropout层
        hidden_states = hidden_states + residual  # 残差连接加和

        # 3. Convolutional Layer
        residual = hidden_states  # 残差连接
        hidden_states = self.conv_module(hidden_states)  # Conformer卷积模块应用
        hidden_states = residual + hidden_states  # 残差连接加和

        # 4. Feed-Forward 2 Layer
        residual = hidden_states  # 残差连接
        hidden_states = self.ffn2_layer_norm(hidden_states)  # Layer normalization
        hidden_states = self.ffn2(hidden_states)  # 第二个前向传播网络
        hidden_states = hidden_states * 0.5 + residual  # 残差连接加权和
        hidden_states = self.final_layer_norm(hidden_states)  # 最终的Layer normalization

        return hidden_states, attn_weights  # 返回隐藏状态和注意力权重
    # 初始化方法，接收配置参数并调用父类初始化方法
    def __init__(self, config):
        """Build the positional embeddings, normalization, dropout and layer stack.

        Args:
            config: Model configuration. This constructor reads `position_embeddings_type`,
                `hidden_size`, `layer_norm_eps`, `hidden_dropout` and `num_hidden_layers`,
                and hands the whole object to every project sub-module it creates.
        """
        super().__init__()
        self.config = config

        # Each supported embedding type maps to its module class; any other value
        # (for example None) means no positional-embedding module is created.
        position_embedding_classes = {
            "relative": Wav2Vec2ConformerRelPositionalEmbedding,
            "rotary": Wav2Vec2ConformerRotaryPositionalEmbedding,
        }
        embedding_cls = position_embedding_classes.get(config.position_embeddings_type)
        self.embed_positions = embedding_cls(config) if embedding_cls is not None else None

        self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)

        # One encoder layer per configured hidden layer, all built from the same config.
        encoder_layers = [Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(encoder_layers)

        # Gradient checkpointing starts disabled.
        self.gradient_checkpointing = False
    ):
        # 初始化隐藏状态和自注意力列表，根据是否需要输出隐藏状态和注意力矩阵做判断
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # 如果有注意力掩码，则将未注意到的位置的隐藏状态置为0
        if attention_mask is not None:
            hidden_states[~attention_mask] = 0.0

            # 扩展注意力掩码维度，确保其与隐藏状态的数据类型匹配
            attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
            attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
            attention_mask = attention_mask.expand(
                attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
            )

        # 对隐藏状态进行dropout处理
        hidden_states = self.dropout(hidden_states)

        # 如果存在位置嵌入，则计算相对位置嵌入
        if self.embed_positions is not None:
            relative_position_embeddings = self.embed_positions(hidden_states)
        else:
            relative_position_embeddings = None

        # 检查是否启用了DeepSpeed Zero3
        deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()

        # 遍历每个层进行处理
        for i, layer in enumerate(self.layers):
            # 如果需要输出隐藏状态，则将当前隐藏状态加入到列表中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 添加LayerDrop机制，决定是否跳过当前层
            dropout_probability = torch.rand([])
            skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False

            # 如果不跳过当前层或者启用了DeepSpeed Zero3
            if not skip_the_layer or deepspeed_zero3_is_enabled:
                # 如果启用了梯度检查点功能且在训练模式下，则使用梯度检查点函数
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        layer.__call__,
                        hidden_states,
                        attention_mask,
                        relative_position_embeddings,
                        output_attentions,
                    )
                else:
                    # 否则直接调用当前层的forward方法
                    layer_outputs = layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        relative_position_embeddings=relative_position_embeddings,
                        output_attentions=output_attentions,
                    )
                hidden_states = layer_outputs[0]

            # 如果跳过当前层，则将输出设置为None
            if skip_the_layer:
                layer_outputs = (None, None)

            # 如果需要输出注意力矩阵，则将当前层的自注意力矩阵加入到列表中
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # 对最终的隐藏状态进行LayerNorm处理
        hidden_states = self.layer_norm(hidden_states)

        # 如果需要输出隐藏状态，则将最终的隐藏状态加入到列表中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 根据是否需要以字典形式返回结果，决定返回哪些值
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GumbelVectorQuantizer复制到Wav2Vec2->Wav2Vec2Conformer
class Wav2Vec2ConformerGumbelVectorQuantizer(nn.Module):
    """
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
    """

    def __init__(self, config):
        """Allocate the codebook and the projection that scores codebook entries.

        Args:
            config: Reads `num_codevector_groups`, `num_codevectors_per_group`,
                `codevector_dim` and the last entry of `conv_dim`.

        Raises:
            ValueError: If `codevector_dim` is not divisible by `num_codevector_groups`.
        """
        super().__init__()
        self.num_groups = config.num_codevector_groups
        self.num_vars = config.num_codevectors_per_group

        dim_per_group, remainder = divmod(config.codevector_dim, self.num_groups)
        if remainder != 0:
            raise ValueError(
                f"`config.codevector_dim {config.codevector_dim} must be divisible "
                f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
            )

        # Storage for the codebook variables (codewords); the values are not set here.
        total_codevectors = self.num_groups * self.num_vars
        self.codevectors = nn.Parameter(torch.FloatTensor(1, total_codevectors, dim_per_group))
        # Projects input features to one score per codebook entry.
        self.weight_proj = nn.Linear(config.conv_dim[-1], total_codevectors)

        # Gumbel-softmax temperature; can be decayed during training.
        self.temperature = 2

    @staticmethod
    def _compute_perplexity(probs, mask=None):
        """Compute the perplexity of the per-group code distribution.

        Args:
            probs: Probabilities whose first dimension is averaged over and whose last
                dimension indexes the entries of a group.
            mask: Optional boolean tensor; it is flattened to select which rows of
                `probs` contribute to the average.

        Returns:
            A scalar tensor: the sum over groups of `exp(entropy)` of the marginal distribution.
        """
        if mask is None:
            marginal_probs = probs.mean(dim=0)
        else:
            keep = mask.flatten()[:, None, None].expand(probs.shape)
            kept_probs = torch.where(keep, probs, torch.zeros_like(probs))
            marginal_probs = kept_probs.sum(dim=0) / mask.sum()

        # The small constant keeps log() finite for zero-probability entries.
        entropy = -torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)
        return torch.exp(entropy).sum()

    def forward(self, hidden_states, mask_time_indices=None):
        """Quantize `hidden_states` and report the codebook perplexity.

        Args:
            hidden_states: 3-D input; its last dimension is consumed by `weight_proj`.
            mask_time_indices: Optional mask forwarded to `_compute_perplexity`.

        Returns:
            A `(codevectors, perplexity)` tuple; `codevectors` keeps the first two
            dimensions of the input.
        """
        batch_size, sequence_length, _ = hidden_states.shape
        num_frames = batch_size * sequence_length

        # Project to codevector scores, one row per (frame, group) pair.
        logits = self.weight_proj(hidden_states).view(num_frames * self.num_groups, -1)

        if self.training:
            # Differentiable sampling of a hard one-hot choice via gumbel softmax.
            sampled = nn.functional.gumbel_softmax(logits.float(), tau=self.temperature, hard=True)
            codevector_probs = sampled.type_as(logits)

            # Perplexity is measured on the plain softmax distribution.
            soft_dist = torch.softmax(logits.view(num_frames, self.num_groups, -1).float(), dim=-1)
            perplexity = self._compute_perplexity(soft_dist, mask_time_indices)
        else:
            # Non-differentiable argmax turned into a one-hot distribution.
            best_idx = logits.argmax(dim=-1)
            one_hot = logits.new_zeros(logits.shape).scatter_(-1, best_idx.view(-1, 1), 1.0)
            codevector_probs = one_hot.view(num_frames, self.num_groups, -1)

            perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)

        # Use the probabilities to pick codevectors, then sum over the entries of each group.
        codevector_probs = codevector_probs.view(num_frames, -1)
        weighted = codevector_probs.unsqueeze(-1) * self.codevectors
        grouped = weighted.view(num_frames, self.num_groups, self.num_vars, -1)
        codevectors = grouped.sum(-2).view(batch_size, sequence_length, -1)

        return codevectors, perplexity
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Adapter复制并修改为Wav2Vec2Conformer
class Wav2Vec2ConformerAdapter(nn.Module):
    """Optional projection followed by a stack of adapter layers applied with LayerDrop."""

    def __init__(self, config):
        """Create the optional projection and the adapter layers.

        Args:
            config: Reads `hidden_size`, `output_hidden_size`, `num_adapter_layers`
                and `layerdrop`.
        """
        super().__init__()

        # The feature dimension is projected only when the two configured sizes differ.
        needs_projection = config.output_hidden_size != config.hidden_size
        self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) if needs_projection else None
        self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) if needs_projection else None

        adapter_layers = [Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)]
        self.layers = nn.ModuleList(adapter_layers)
        # Threshold used by the LayerDrop check in `forward`.
        self.layerdrop = config.layerdrop

    def forward(self, hidden_states):
        """Apply the projection (if any) and every adapter layer that is not dropped.

        Args:
            hidden_states: Tensor whose dimensions 1 and 2 are swapped before the
                adapter layers run and swapped back afterwards.

        Returns:
            The transformed hidden states.
        """
        has_projection = self.proj is not None and self.proj_layer_norm is not None
        if has_projection:
            hidden_states = self.proj_layer_norm(self.proj(hidden_states))

        hidden_states = hidden_states.transpose(1, 2)

        for layer in self.layers:
            # One NumPy draw is made per layer, whether or not the module is training.
            sampled = np.random.random()
            # A layer is skipped only in training mode, and only when the draw does
            # not exceed `layerdrop`; otherwise the layer is applied.
            if self.training and not (sampled > self.layerdrop):
                continue
            hidden_states = layer(hidden_states)

        return hidden_states.transpose(1, 2)


# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AdapterLayer复制并修改为Wav2Vec2ConformerAdapterLayer
class Wav2Vec2ConformerAdapterLayer(nn.Module):
    """A single adapter layer: a 1-D convolution followed by a gated linear unit."""

    def __init__(self, config):
        """Create the convolution.

        Args:
            config: Reads `output_hidden_size`, `adapter_kernel_size` and `adapter_stride`.
        """
        super().__init__()
        channels = config.output_hidden_size
        # The convolution doubles the channel count; the GLU in `forward` halves it again.
        self.conv = nn.Conv1d(
            in_channels=channels,
            out_channels=2 * channels,
            kernel_size=config.adapter_kernel_size,
            stride=config.adapter_stride,
            padding=1,
        )

    def forward(self, hidden_states):
        """Convolve `hidden_states`, then gate along the channel dimension (dim 1) with GLU."""
        return nn.functional.glu(self.conv(hidden_states), dim=1)


class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family
    config_class = Wav2Vec2ConformerConfig
    # Prefix used for the base model
    base_model_prefix = "wav2vec2_conformer"
    # Name of the main input
    main_input_name = "input_values"
    # Gradient checkpointing is supported
    supports_gradient_checkpointing = True
    def _init_weights(self, module):
        """Initialize the weights of `module` according to its type.

        The branches are checked in order and only the first matching one runs, so
        the project-specific module types are handled before the generic
        `nn.Linear` / norm / `nn.Conv1d` cases.
        """
        # Pre-training head: reset both projection layers to their default initialization
        if isinstance(module, Wav2Vec2ConformerForPreTraining):
            module.project_hid.reset_parameters()
            module.project_q.reset_parameters()
            # Flag both projections as initialized (presumably so a later generic
            # pass skips them — confirm against PreTrainedModel)
            module.project_hid._is_hf_initialized = True
            module.project_q._is_hf_initialized = True
        # Gumbel vector quantizer: unit-variance normal weights, zero bias, uniform codebook
        elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer):
            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
            module.weight_proj.bias.data.zero_()
            nn.init.uniform_(module.codevectors)
        # Self-attention: Xavier-uniform init for the positional biases, when present
        elif isinstance(module, Wav2Vec2ConformerSelfAttention):
            if hasattr(module, "pos_bias_u"):
                nn.init.xavier_uniform_(module.pos_bias_u)
            if hasattr(module, "pos_bias_v"):
                nn.init.xavier_uniform_(module.pos_bias_v)
        # Positional conv embedding: normal weights scaled by kernel size and input channels
        elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,  # convolution weight
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)  # zero bias
        # Feature projection: uniform init in [-k, k] with k = sqrt(1 / in_features)
        elif isinstance(module, Wav2Vec2ConformerFeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        # Generic linear layer: normal weights with std from the config, zero bias
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        # Layer/group norm: zero bias, unit weight
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        # Generic 1-D convolution: Kaiming-normal weights, uniform bias in [-k, k]
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)
    ):
        """
        计算卷积层的输出长度
        """

        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter

        def _conv_out_length(input_length, kernel_size, stride):
            # 从 https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html 获取的一维卷积层输出长度公式
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            # 计算每个卷积核对应的输出长度
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        if add_adapter:
            # 如果需要添加适配器层，则对每层适配器使用特定的卷积核大小和步长计算输出长度
            for _ in range(self.config.num_adapter_layers):
                input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)

        return input_lengths

    def _get_feature_vector_attention_mask(
        self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
    ):
        """Build a boolean attention mask at feature-vector resolution.

        Args:
            feature_vector_length: Size of the last dimension of the returned mask.
            attention_mask: Input-level mask; the sum of each row is used as that
                row's input length.
            add_adapter: Forwarded to `_get_feat_extract_output_lengths`.

        Returns:
            A boolean tensor with one row per row of `attention_mask` and
            `feature_vector_length` columns; in each row every position up to and
            including index `output_length - 1` is True.
        """
        # The last cumulative-sum entry of each row is that row's total.
        input_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        feature_lengths = self._get_feat_extract_output_lengths(input_lengths, add_adapter=add_adapter)
        feature_lengths = feature_lengths.to(torch.long)

        num_rows = attention_mask.shape[0]
        feature_mask = torch.zeros(
            (num_rows, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )

        # Mark the last valid position of every row ...
        row_index = torch.arange(num_rows, device=attention_mask.device)
        feature_mask[(row_index, feature_lengths - 1)] = 1
        # ... then a reversed cumulative sum spreads that mark to all earlier positions.
        return feature_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
# WAV2VEC2_CONFORMER_START_DOCSTRING is the shared introductory docstring text that is passed to
# `add_start_docstrings` for the Wav2Vec2Conformer model classes.
WAV2VEC2_CONFORMER_START_DOCSTRING = r"""
    Wav2Vec2Conformer was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
    Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
    Auli.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving etc.).

    This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) sub-class. Use it as a
    regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior.

    Parameters:
        config ([`Wav2Vec2ConformerConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# WAV2VEC2_CONFORMER_INPUTS_DOCSTRING is the docstring text describing the model inputs; it is passed to
# `add_start_docstrings_to_model_forward` on the `forward` methods.
# NOTE(review): the literal below is generic placeholder text (it documents `input_ids`, while the
# `forward` methods take `input_values`) — confirm against the upstream source before relying on it.
WAV2VEC2_CONFORMER_INPUTS_DOCSTRING = r"""
    This docstring should detail the expected inputs of the Wav2Vec2Conformer model.
    It typically includes information on the type and shape of input tensors required
    for the model's forward pass, along with any additional context or constraints
    on the input data.

    Example:
        Inputs:
            - input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of input tokens in the vocabulary.

    Note:
        This docstring should be completed to provide comprehensive guidance on how to
        format and prepare inputs for the model.
"""
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            # 输入的原始语音波形的浮点值。可以通过加载 `.flac` 或 `.wav` 音频文件并将其转换成 `List[float]` 或 `numpy.ndarray` 类型的数组获得。使用 `soundfile` 库 (`pip install soundfile`)。
            # 使用 [`AutoProcessor`] 进行填充和转换，生成 `torch.FloatTensor` 类型的张量 `input_values`。详见 [`Wav2Vec2Processor.__call__`]。
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 遮罩，用于在填充标记索引上避免进行卷积和注意力操作。
            # 遮罩值选择在 `[0, 1]`：

            # - 1 表示**未遮罩**的标记，
            # - 0 表示**已遮罩**的标记。

            # [什么是注意力遮罩？](../glossary#attention-mask)

            <Tip warning={true}>
            # 只有在相应的处理器具有 `config.return_attention_mask == True` 时才应传递 `attention_mask`。对于所有处理器具有 `config.return_attention_mask == False` 的模型，如 [wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large)，应避免传递 `attention_mask` 以避免在进行批量推理时性能下降。对于这些模型，`input_values` 应简单地填充为 0 并传递，而不使用 `attention_mask`。请注意，这些模型在 `input_values` 是否填充会稍有不同的结果。
            </Tip>

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。详见返回张量中的 `attentions` 以获取更多细节。
        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。详见返回张量中的 `hidden_states` 以获取更多细节。
        return_dict (`bool`, *optional*):
            # 是否返回 [`~utils.ModelOutput`] 而不是普通的元组。
"""
@add_start_docstrings(
    "The bare Wav2Vec2Conformer Model transformer outputting raw hidden-states without any specific head on top.",
    WAV2VEC2_CONFORMER_START_DOCSTRING,
)
class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel):
    def __init__(self, config: Wav2Vec2ConformerConfig):
        super().__init__(config)
        self.config = config

        # Feature encoder followed by the feature projection
        self.feature_extractor = Wav2Vec2ConformerFeatureEncoder(config)
        self.feature_projection = Wav2Vec2ConformerFeatureProjection(config)

        # The masking vector is only created when one of the mask probabilities is > 0.0
        uses_masking = config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0
        if uses_masking:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())

        self.encoder = Wav2Vec2ConformerEncoder(config)

        # The adapter is optional and controlled by `config.add_adapter`
        if config.add_adapter:
            self.adapter = Wav2Vec2ConformerAdapter(config)
        else:
            self.adapter = None

        # Initialize weights and apply final processing
        self.post_init()

    # Based on transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.freeze_feature_encoder
    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.feature_extractor._freeze_parameters()

    # Based on transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """
        # `config.apply_spec_augment` can be set to False to turn masking off entirely
        spec_augment_enabled = getattr(self.config, "apply_spec_augment", True)
        if not spec_augment_enabled:
            return hidden_states

        batch_size, sequence_length, hidden_size = hidden_states.size()

        # Time axis: caller-provided indices take precedence; otherwise new indices are
        # sampled, but only while training with a positive `mask_time_prob`.
        if mask_time_indices is None and self.config.mask_time_prob > 0 and self.training:
            sampled_time_mask = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(sampled_time_mask, device=hidden_states.device, dtype=torch.bool)
        if mask_time_indices is not None:
            # Masked time steps are overwritten in place with the learned mask embedding
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        # Feature axis: sampled only while training with a positive `mask_feature_prob`
        if self.config.mask_feature_prob > 0 and self.training:
            feature_mask = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            feature_mask = torch.tensor(feature_mask, device=hidden_states.device, dtype=torch.bool)
            # Repeat the per-example feature mask along the time axis, then zero those entries
            feature_mask = feature_mask[:, None].expand(-1, sequence_length, -1)
            hidden_states[feature_mask] = 0

        return hidden_states

    @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Wav2Vec2BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    # Based on transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.forward, with wav2vec2 replaced by wav2vec2_conformer
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
        # Flags left as None fall back to the values stored in the model config
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Extract features and swap dimensions 1 and 2 of the result
        extract_features = self.feature_extractor(input_values).transpose(1, 2)

        if attention_mask is not None:
            # Reduce the input-level mask to the length of the extracted feature vectors
            attention_mask = self._get_feature_vector_attention_mask(
                extract_features.shape[1], attention_mask, add_adapter=False
            )

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first encoder output is the last hidden state; the adapter, if any, runs on it
        hidden_states = encoder_outputs[0]
        if self.adapter is not None:
            hidden_states = self.adapter(hidden_states)

        if return_dict:
            return Wav2Vec2BaseModelOutput(
                last_hidden_state=hidden_states,
                extract_features=extract_features,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: hidden states, extracted features, then the remaining encoder outputs
        return (hidden_states, extract_features) + encoder_outputs[1:]
@add_start_docstrings(
    """Wav2Vec2Conformer Model with a quantizer and `VQ` head on top.""", WAV2VEC2_CONFORMER_START_DOCSTRING
)
class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel):
    # 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.__init__ 复制而来，将类名和部分参数改为适应 Wav2Vec2Conformer 模型
    def __init__(self, config: Wav2Vec2ConformerConfig):
        """Build the base model, the quantizer and the two projection heads.

        Args:
            config: Model configuration. Read directly here: `feat_quantizer_dropout`,
                `hidden_size`, `codevector_dim` and `proj_codevector_dim`.
        """
        super().__init__(config)

        # Base model and the dropout applied to features before quantization
        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)

        self.quantizer = Wav2Vec2ConformerGumbelVectorQuantizer(config)

        # Both heads project into the same `proj_codevector_dim`-sized space:
        # one from the hidden states, one from the quantized codevectors.
        projection_dim = config.proj_codevector_dim
        self.project_hid = nn.Linear(config.hidden_size, projection_dim)
        self.project_q = nn.Linear(config.codevector_dim, projection_dim)

        # Initialize weights and apply final processing
        self.post_init()

    # 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.set_gumbel_temperature 复制而来
    def set_gumbel_temperature(self, temperature: float) -> None:
        """
        Set the Gumbel softmax temperature to a given value. Only necessary for training.
        """
        # The value is stored on the quantizer as its `temperature` attribute.
        self.quantizer.temperature = temperature

    # 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.freeze_feature_encoder 复制而来，将函数名和部分参数改为适应 Wav2Vec2Conformer 模型
    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        # Delegates to the feature extractor of the wrapped base model.
        self.wav2vec2_conformer.feature_extractor._freeze_parameters()

    @staticmethod
    # 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.compute_contrastive_logits 复制而来
    def compute_contrastive_logits(
        target_features: torch.FloatTensor,
        negative_features: torch.FloatTensor,
        predicted_features: torch.FloatTensor,
        temperature: int = 0.1,
    ):
        """
        基于余弦相似度作为距离度量计算对比损失的 logits，计算方式为 `[positive_feature, negative_features]` 和 `[predicted_features]` 的相似度。
        可以应用温度参数调整。
        """
        # 将目标特征和负样本特征拼接在一起
        target_features = torch.cat([target_features, negative_features], dim=0)

        # 计算余弦相似度
        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(
            target_features
        )

        # 应用温度参数
        logits = logits / temperature
        return logits

    @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Wav2Vec2ConformerForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    # 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.forward方法复制而来，做了如下替换：
    # - Wav2Vec2 替换为 Wav2Vec2Conformer
    # - wav2vec2 替换为 wav2vec2_conformer
    # - wav2vec2_conformer-base 替换为 wav2vec2-conformer-rel-pos-large
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.BoolTensor] = None,
        sampled_negative_indices: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# Wav2Vec2Conformer model with a `language modeling` head on top, used for Connectionist Temporal Classification (CTC).
@add_start_docstrings(
    """Wav2Vec2Conformer Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
    WAV2VEC2_CONFORMER_START_DOCSTRING,
)
class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel):
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
    def __init__(self, config, target_lang: Optional[str] = None):
        """
        Build the base model, the final dropout and the linear CTC head.

        Args:
            config: Model configuration. It must define `vocab_size`, `final_dropout` and `hidden_size` (and
                `output_hidden_size` when `add_adapter` is set).
            target_lang: Stored on the instance as `self.target_lang`; it is not used anywhere else in this class.

        Raises:
            ValueError: If `config.vocab_size` is `None`.
        """
        super().__init__(config)

        # Base acoustic model.
        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
        # Dropout applied to the base model output before the CTC head.
        self.dropout = nn.Dropout(config.final_dropout)

        self.target_lang = target_lang

        # The head cannot be sized without a vocabulary size.
        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that "
                "does not define the vocabulary size of the language model head. Please "
                "instantiate the model as follows: `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
                "or define `vocab_size` of your model's configuration."
            )

        # With an adapter enabled the head reads `output_hidden_size`, otherwise `hidden_size`.
        output_hidden_size = (
            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
        )
        # Linear layer acting as the language modeling (CTC) head.
        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.wav2vec2_conformer.feature_extractor._freeze_parameters()

    @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_CTC_EXPECTED_OUTPUT,
        expected_loss=_CTC_EXPECTED_LOSS,
    )
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, CausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        """
        # Fall back to the configuration default when the caller did not choose an output format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the base model.
        outputs = self.wav2vec2_conformer(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The first output holds the final hidden states; regularize them before the head.
        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)

        # Project onto the vocabulary.
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Valid label ids are strictly below `vocab_size`.
            if labels.max() >= self.config.vocab_size:
                raise ValueError(f"Label values must be < vocab_size: {self.config.vocab_size}")

            # Without a mask every input position counts as valid; the mask sum is then mapped to the number of
            # frames produced by the feature encoder.
            attention_mask = (
                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
            )
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

            # Negative label ids (the padding value is -100) are dropped from the targets.
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            # Log-probabilities are computed in float32 and the first two dimensions are swapped before the
            # `ctc_loss` call.
            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            # cuDNN is switched off for the duration of the loss computation.
            with torch.backends.cudnn.flags(enabled=False):
                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        # Tuple output: the logits followed by the optional extra outputs, with the loss in front when available.
        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
# The decorator below attaches the model description and the shared start docstring to the class.
@add_start_docstrings(
    """
    Wav2Vec2Conformer 模型，在顶部添加了一个序列分类头（一个线性层，用于池化输出），用于诸如 SUPERB 关键词检测之类的任务。
    """,
    WAV2VEC2_CONFORMER_START_DOCSTRING,
)
class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedModel):
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
    def __init__(self, config):
        """
        Build the base model, the optional per-layer mixing weights, the projector and the classifier.

        Raises:
            ValueError: If `config.add_adapter` is set, because adapters are not supported by this head.
        """
        super().__init__(config)

        # Adapters are rejected before any sub-module is created.
        if getattr(config, "add_adapter", False):
            raise ValueError(
                "Sequence classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
            )

        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)

        # One mixing weight per transformer layer plus one for the input embeddings, initialized uniformly.
        layer_count = config.num_hidden_layers + 1
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(layer_count) / layer_count)

        # Projection into the classifier space followed by the classification layer.
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
    def freeze_feature_encoder(self):
        """
        Disable gradient computation for the feature encoder so that its parameters are not updated during
        training.
        """
        self.wav2vec2_conformer.feature_extractor._freeze_parameters()

    def freeze_base_model(self):
        """
        Disable gradient computation for the base model so that its parameters are not updated during training.
        Only the classification head will be updated.
        """
        for base_param in self.wav2vec2_conformer.parameters():
            base_param.requires_grad = False

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
    @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        # Use the configuration default when no output format was requested.
        return_dict = self.config.use_return_dict if return_dict is None else return_dict
        # The weighted layer sum needs every hidden state, so they are always requested in that mode.
        if self.config.use_weighted_layer_sum:
            output_hidden_states = True

        outputs = self.wav2vec2_conformer(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            # Mix all hidden states with softmax-normalized layer weights.
            stacked_states = torch.stack(outputs[_HIDDEN_STATES_START_POSITION], dim=1)
            mixing = nn.functional.softmax(self.layer_weights, dim=-1)
            features = (stacked_states * mixing.view(-1, 1, 1)).sum(dim=1)
        else:
            # Otherwise only the first output of the base model is used.
            features = outputs[0]

        features = self.projector(features)

        if attention_mask is not None:
            # Zero the masked-out frames and average over the remaining ones only.
            frame_mask = self._get_feature_vector_attention_mask(features.shape[1], attention_mask)
            features[~frame_mask] = 0.0
            pooled_output = features.sum(dim=1) / frame_mask.sum(dim=1).view(-1, 1)
        else:
            # No mask: plain mean over dim 1.
            pooled_output = features.mean(dim=1)

        logits = self.classifier(pooled_output)

        # Only a cross-entropy loss is computed in this method.
        loss = None
        if labels is not None:
            loss = CrossEntropyLoss()(logits.view(-1, self.config.num_labels), labels.view(-1))

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple output: the logits followed by the optional extra outputs, with the loss in front when available.
        output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
        return output if loss is None else (loss,) + output
@add_start_docstrings(
    """
    Wav2Vec2Conformer Model with a frame classification head on top for tasks like Speaker Diarization.
    """,
    WAV2VEC2_CONFORMER_START_DOCSTRING,
)
class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedModel):
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
    def __init__(self, config):
        """
        Build the base model, the optional per-layer mixing weights and the frame classifier.

        Raises:
            ValueError: If `config.add_adapter` is set, because adapters are not supported by this head.
        """
        super().__init__(config)

        # Adapters are rejected before any sub-module is created.
        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Audio frame classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
            )

        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)

        # Transformer layers plus the input embeddings.
        num_layers = config.num_hidden_layers + 1

        # Uniformly initialized mixing weights, only created when the weighted layer sum is enabled.
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)

        # Linear classifier applied to the hidden states.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels

        # Initialize the model weights.
        self.init_weights()

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.wav2vec2_conformer.feature_extractor._freeze_parameters()

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.freeze_base_model with wav2vec2->wav2vec2_conformer
    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wav2vec2_conformer.parameters():
            param.requires_grad = False

    @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
    )
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.forward with wav2vec2->wav2vec2_conformer
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Fall back to the configuration default when the caller did not choose an output format.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted layer sum needs every hidden state, so they are always requested in that mode.
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        # Run the base model.
        outputs = self.wav2vec2_conformer(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            # Stack all hidden states and mix them with softmax-normalized layer weights.
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            # Otherwise only the first output of the base model is used.
            hidden_states = outputs[0]

        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # `labels` is reshaped to `(-1, num_labels)` and reduced with argmax, i.e. it is consumed as one score
            # vector per position rather than as class indices.
            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))

        # NOTE(review): unlike the other heads in this file, the tuple output does not include `loss`, even when
        # it was computed above. This is kept as is so the returned tuple layout does not change.
        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss
class AMSoftmaxLoss(nn.Module):
    """
    Cross-entropy over scaled cosine similarities, where a margin is subtracted from the similarity of the target
    class before scaling.
    """

    def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
        super().__init__()
        # Multiplier applied to the similarities and the margin subtracted for the target class.
        self.scale = scale
        self.margin = margin
        self.num_labels = num_labels
        # Randomly initialized class weights, one column per label.
        self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, hidden_states, labels):
        """
        Return the loss for `hidden_states` (a 2-D tensor, as required by `torch.mm`) against `labels`, which are
        flattened before use.
        """
        flat_labels = labels.flatten()

        # L2-normalize each weight column and each row of the input, so the matrix product yields cosines.
        unit_weight = nn.functional.normalize(self.weight, dim=0)
        unit_hidden = nn.functional.normalize(hidden_states, dim=1)
        cosine = torch.mm(unit_hidden, unit_weight)

        # Subtract the margin only at the position of the target class.
        is_target = nn.functional.one_hot(flat_labels, self.num_labels).bool()
        scaled_logits = self.scale * torch.where(is_target, cosine - self.margin, cosine)

        return self.loss(scaled_logits, flat_labels)


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer
class TDNNLayer(nn.Module):
    """
    One TDNN layer: the parameters are stored in an `nn.Linear`, but the forward pass applies them as a dilated
    1-D convolution followed by a ReLU.
    """

    def __init__(self, config, layer_id=0):
        super().__init__()
        # The input width is the previous layer's width; layer 0 has no predecessor and reuses its own.
        source_id = layer_id - 1 if layer_id > 0 else layer_id
        self.in_conv_dim = config.tdnn_dim[source_id]
        self.out_conv_dim = config.tdnn_dim[layer_id]
        self.kernel_size = config.tdnn_kernel[layer_id]
        self.dilation = config.tdnn_dilation[layer_id]

        # The linear layer holds the kernel flattened to `in_conv_dim * kernel_size` input features.
        self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
        self.activation = nn.ReLU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the layer to `hidden_states` and return the activated result."""
        # Warn when PEFT is installed and the kernel has been wrapped by LoRA: the weights are read directly
        # below, so the adapter would not take part in the computation.
        if is_peft_available():
            from peft.tuners.lora import LoraLayer

            if isinstance(self.kernel, LoraLayer):
                warnings.warn(
                    "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
                    "You should exclude TDNNLayer from LoRA's target modules.",
                )

        # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
        conv_weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
        # Dims 1 and 2 are swapped before the convolution and swapped back afterwards.
        conv_out = nn.functional.conv1d(
            hidden_states.transpose(1, 2), conv_weight, self.kernel.bias, dilation=self.dilation
        )
        return self.activation(conv_out.transpose(1, 2))


@add_start_docstrings(
    """
    Wav2Vec2Conformer Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    """,
    WAV2VEC2_CONFORMER_START_DOCSTRING,
)
class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
    def __init__(self, config):
        """Build the base model, the TDNN stack, the embedding layers and the AM-Softmax objective."""
        super().__init__(config)

        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)

        # One mixing weight per transformer layer plus one for the input embeddings, initialized uniformly.
        layer_count = config.num_hidden_layers + 1
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(layer_count) / layer_count)

        # Projection from the base model width to the width of the first TDNN layer.
        self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])

        # One TDNN layer per entry of `config.tdnn_dim`.
        self.tdnn = nn.ModuleList([TDNNLayer(config, layer_id) for layer_id, _ in enumerate(config.tdnn_dim)])

        # The pooled statistics (mean and std concatenated, hence `* 2`) are mapped to the embedding, which is
        # then passed through the classifier.
        self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
        self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)

        self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)

        self.init_weights()

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
    def freeze_feature_encoder(self):
        """
        Disable gradient computation for the feature encoder so that its parameters are not updated during
        training.
        """
        self.wav2vec2_conformer.feature_extractor._freeze_parameters()

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.freeze_base_model with wav2vec2->wav2vec2_conformer
    def freeze_base_model(self):
        """
        Disable gradient computation for the base model so that its parameters are not updated during training.
        Only the classification head will be updated.
        """
        for base_param in self.wav2vec2_conformer.parameters():
            base_param.requires_grad = False

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector._get_tdnn_output_lengths with wav2vec2->wav2vec2_conformer
    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Compute the sequence length after the TDNN layers from `config.tdnn_kernel`, assuming a stride of 1.
        """
        # NOTE(review): only the kernel sizes enter this formula, while `TDNNLayer.forward` also passes
        # `dilation` to the convolution. Confirm whether `config.tdnn_dilation` should be accounted for here.
        stride = 1
        for kernel_size in self.config.tdnn_kernel:
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            input_lengths = (input_lengths - kernel_size) // stride + 1

        return input_lengths

    @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=XVectorOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
    )
    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, XVectorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Use the configuration default when no output format was requested.
        return_dict = self.config.use_return_dict if return_dict is None else return_dict
        # The weighted layer sum needs every hidden state, so they are always requested in that mode.
        if self.config.use_weighted_layer_sum:
            output_hidden_states = True

        outputs = self.wav2vec2_conformer(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            # Mix all hidden states with softmax-normalized layer weights.
            stacked_states = torch.stack(outputs[_HIDDEN_STATES_START_POSITION], dim=1)
            mixing = nn.functional.softmax(self.layer_weights, dim=-1)
            features = (stacked_states * mixing.view(-1, 1, 1)).sum(dim=1)
        else:
            # Otherwise only the first output of the base model is used.
            features = outputs[0]

        features = self.projector(features)

        # Run the TDNN stack in order.
        for layer in self.tdnn:
            features = layer(features)

        # Statistic pooling
        if attention_mask is not None:
            # Per sample, pool only over the leading frames that the length computation reports.
            encoder_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
            pooled_lengths = self._get_tdnn_output_lengths(encoder_lengths)
            kept_frames = [features[row, :length] for row, length in enumerate(pooled_lengths)]
            mean_features = torch.stack([frames.mean(dim=0) for frames in kept_frames])
            std_features = torch.stack([frames.std(dim=0) for frames in kept_frames])
        else:
            # No mask: mean and standard deviation over dim 1.
            mean_features = features.mean(dim=1)
            std_features = features.std(dim=1)
        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)

        # Embedding first, then the logits derived from it.
        output_embeddings = self.feature_extractor(statistic_pooling)
        logits = self.classifier(output_embeddings)

        loss = None
        if labels is not None:
            loss = self.objective(logits, labels)

        if return_dict:
            return XVectorOutput(
                loss=loss,
                logits=logits,
                embeddings=output_embeddings,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple output: logits and embeddings followed by the optional extra outputs, with the loss in front
        # when available.
        output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
        return output if loss is None else (loss,) + output

Transformers-源码解析-一百二十二-

Transformers 源码解析（一百二十二）

.\models\wav2vec2\__init__.py

.\models\wav2vec2_bert\configuration_wav2vec2_bert.py

.\models\wav2vec2_bert\convert_wav2vec2_seamless_checkpoint.py

.\models\wav2vec2_bert\modeling_wav2vec2_bert.py

.\models\wav2vec2_bert\processing_wav2vec2_bert.py

.\models\wav2vec2_bert\__init__.py

.\models\wav2vec2_conformer\configuration_wav2vec2_conformer.py

.\models\wav2vec2_conformer\convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py

.\models\wav2vec2_conformer\modeling_wav2vec2_conformer.py

`.\models\wav2vec2\__init__.py`

`.\models\wav2vec2_bert\configuration_wav2vec2_bert.py`

`.\models\wav2vec2_bert\convert_wav2vec2_seamless_checkpoint.py`

`.\models\wav2vec2_bert\modeling_wav2vec2_bert.py`

`.\models\wav2vec2_bert\processing_wav2vec2_bert.py`

`.\models\wav2vec2_bert\__init__.py`

`.\models\wav2vec2_conformer\configuration_wav2vec2_conformer.py`

`.\models\wav2vec2_conformer\convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py`

`.\models\wav2vec2_conformer\modeling_wav2vec2_conformer.py`