Transformers Source Code Analysis (Part 105)
.\models\speech_encoder_decoder\convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py
"""Convert Wav2Vec2 checkpoint."""
import argparse
import json
import os
import fairseq
import torch
from torch import nn
from transformers import (
Speech2Text2Config,
Speech2Text2ForCausalLM,
Speech2Text2Tokenizer,
SpeechEncoderDecoderConfig,
SpeechEncoderDecoderModel,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
Wav2Vec2Model,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"quantizer.weight_proj": "quantizer.weight_proj",
"quantizer.vars": "quantizer.codevectors",
"project_q": "project_q",
"final_proj": "project_hid",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
TOP_LEVEL_KEYS = [
"lm_head",
"quantizer.weight_proj",
"quantizer.codevectors",
"project_q",
"project_hid",
]
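The `*` in the mapped keys is a placeholder for the encoder layer index; `recursively_load_weights_wav2vec2` below extracts the index from the fairseq parameter name and substitutes it in. A minimal standalone sketch of that substitution (the fairseq key shown is illustrative, not taken from a real checkpoint):

```python
# Illustrative fairseq parameter name and the mapping entry it matches.
name = "w2v_encoder.w2v_model.encoder.layers.11.self_attn.k_proj.weight"
key = "self_attn.k_proj"
mapped_key = "encoder.layers.*.attention.k_proj"

# Same extraction used in recursively_load_weights_wav2vec2: take the text before
# the matched key and pick the second-to-last dotted component, i.e. the layer index.
layer_index = name.split(key)[0].split(".")[-2]
print(layer_index)                           # -> "11"
print(mapped_key.replace("*", layer_index))  # -> "encoder.layers.11.attention.k_proj"
```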
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights_wav2vec2(fairseq_model, hf_model):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.feature_extractor
proj_weight = None
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
elif name.split(".")[0] == "proj":
proj_weight = fairseq_model.proj
is_used = True
else:
for key, mapped_key in MAPPING.items():
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
weight_type = "weight"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
return proj_weight
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def create_vocab_dict(dict_path):
with open(dict_path, "r", encoding="utf-8") as f:
lines = f.readlines()
words = [line.split(" ")[0] for line in lines]
num_words = len(words)
vocab_dict = {
"<s>": 0,
"<pad>": 1,
"</s>": 2,
"<unk>": 3,
}
vocab_dict.update(dict(zip(words, range(4, num_words + 4))))
return vocab_dict
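`create_vocab_dict` reserves ids 0-3 for the special tokens and numbers the remaining entries of the fairseq dictionary (one `"<token> <count>"` pair per line) from 4 upward. A small demonstration on a made-up dictionary file:

```python
# Demonstration of create_vocab_dict on a made-up fairseq dict file.
import json
import tempfile

fake_dict = "▁the 100\n▁a 80\nhello 5\n"  # "<token> <count>" per line, as fairseq writes it
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write(fake_dict)
    dict_path = f.name

vocab = create_vocab_dict(dict_path)
print(json.dumps(vocab, ensure_ascii=False))
# {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "▁the": 4, "▁a": 5, "hello": 6}
```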
@torch.no_grad()
def convert_wav2vec2_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
dict_path,
encoder_config_path,
decoder_config_path,
vocab_size,
num_decoder_layers,
):
"""
Copy/paste/tweak the original model's weights to the Transformers design.
"""
encoder_config = Wav2Vec2Config.from_pretrained(encoder_config_path)
decoder_config = Speech2Text2Config.from_pretrained(
decoder_config_path, vocab_size=vocab_size, decoder_layers=num_decoder_layers, do_stable_layer_norm=True
)
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=True,
return_attention_mask=True,
)
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
)
model = model[0].eval()
hf_encoder = Wav2Vec2Model(encoder_config)
projection_layer = recursively_load_weights_wav2vec2(model.encoder, hf_encoder)
hf_decoder = Speech2Text2ForCausalLM(decoder_config)
missing_keys, unexpected_keys = hf_decoder.model.decoder.load_state_dict(model.decoder.state_dict(), strict=False)
unexpected_keys.remove("embed_out")
hf_decoder.lm_head.weight = nn.Parameter(model.decoder.embed_out.detach())
logger.warning(f"加载解码器权重时缺少以下键: {missing_keys}")
logger.warning(f"加载解码器权重时出现以下意外的键: {unexpected_keys}")
hf_wav2vec = SpeechEncoderDecoderModel(encoder=hf_encoder, decoder=hf_decoder)
hf_wav2vec.config.tie_word_embeddings = False
hf_wav2vec.enc_to_dec_proj.weight = nn.Parameter(projection_layer.weight)
hf_wav2vec.enc_to_dec_proj.bias = nn.Parameter(projection_layer.bias)
vocab_dict = create_vocab_dict(dict_path)
with open(os.path.join(pytorch_dump_folder_path, "vocab.json"), "w") as fp:
json.dump(vocab_dict, fp)
tokenizer = Speech2Text2Tokenizer(os.path.join(pytorch_dump_folder_path, "vocab.json"))
tokenizer.save_pretrained(pytorch_dump_folder_path)
config = hf_wav2vec.config.to_dict()
config["pad_token_id"] = tokenizer.pad_token_id
config["bos_token_id"] = tokenizer.bos_token_id
config["eos_token_id"] = tokenizer.eos_token_id
config["tokenizer_class"] = "speech_to_text_2"
config["feature_extractor_type"] = "wav2vec2"
hf_wav2vec.config = SpeechEncoderDecoderConfig.from_dict(config)
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument(
"--encoder_config_path",
default="facebook/wav2vec2-large-lv60",
type=str,
help="Path to hf encoder wav2vec2 checkpoint config",
)
parser.add_argument(
"--decoder_config_path",
default="facebook/s2t-small-mustc-en-fr-st",
type=str,
help="Path to hf decoder s2t checkpoint config",
)
parser.add_argument("--vocab_size", default=10224, type=int, help="Vocab size of decoder")
parser.add_argument("--num_decoder_layers", default=7, type=int, help="Number of decoder layers")
args = parser.parse_args()
convert_wav2vec2_checkpoint(
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.dict_path,
encoder_config_path=args.encoder_config_path,
decoder_config_path=args.decoder_config_path,
vocab_size=args.vocab_size,
num_decoder_layers=args.num_decoder_layers,
)
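Once the script has written the dump folder, the converted artifacts can be reloaded through the standard `from_pretrained` API. A minimal sketch, with `"path/to/dump"` standing in for the actual `--pytorch_dump_folder_path`:

```python
# Hedged sketch: reload the artifacts written by the conversion script above.
# "path/to/dump" is a placeholder for --pytorch_dump_folder_path.
from transformers import (
    SpeechEncoderDecoderModel,
    Speech2Text2Tokenizer,
    Wav2Vec2FeatureExtractor,
)

model = SpeechEncoderDecoderModel.from_pretrained("path/to/dump")
tokenizer = Speech2Text2Tokenizer.from_pretrained("path/to/dump")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("path/to/dump")
```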
.\models\speech_encoder_decoder\modeling_flax_speech_encoder_decoder.py
""" Classes to support Flax Speech-Encoder-Decoder architectures"""
import os
from typing import Optional, Tuple, Union
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from jax.random import PRNGKey
from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput
from ...modeling_flax_utils import FlaxPreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from ..auto.configuration_auto import AutoConfig
from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM
from .configuration_speech_encoder_decoder import SpeechEncoderDecoderConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SpeechEncoderDecoderConfig"
SPEECH_ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a speech-sequence-to-text-sequence model with any pretrained speech
autoencoding model as the encoder and any pretrained text autoregressive model as the decoder. The encoder is
loaded via [`~AutoModel.from_pretrained`] function and the decoder is loaded via
[`~AutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically added to the decoder
and should be fine-tuned on a downstream generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn, Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
Additionally, in [Large-Scale Self- and Semi-Supervised Learning for Speech
Translation](https://arxiv.org/abs/2104.06678) it is shown how leveraging large pretrained speech models for speech
translation yields a significant performance improvement.
After such a Speech-Encoder-Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other
model (see the examples for more information).
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving the model, resizing the input embeddings,
pruning heads etc.)
This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html)
subclass. Use it as a regular Flax Module and refer to the Flax documentation for all matters related to general
usage and behavior.
Parameters:
config ([`SpeechEncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
inference on GPUs or TPUs. If specified, all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of the model
parameters.** If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`]
and [`~FlaxPreTrainedModel.to_bf16`].
"""
SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
[`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
`torch.FloatTensor`.
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
and prepending them with the `decoder_start_token_id`.
decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
range `[0, config.decoder.max_position_embeddings - 1]`.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple.
"""
SPEECH_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
Args:
inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
Float values of input raw speech waveform or speech features. Values can be obtained by loading a *.flac*
or *.wav* audio file into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile
library (*pip install soundfile*). To prepare the array into *inputs*, either the [`Wav2Vec2Processor`] or
[`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
*torch.FloatTensor*.
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
If set to `True`, the model will return a [`~utils.FlaxBaseModelOutput`] instead of a plain tuple.
"""
class FlaxSpeechEncoderDecoderModule(nn.Module):
config: SpeechEncoderDecoderConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
encoder_config = self.config.encoder
decoder_config = self.config.decoder
from ...models.auto.modeling_flax_auto import FLAX_MODEL_FOR_CAUSAL_LM_MAPPING, FLAX_MODEL_MAPPING
encoder_module = FLAX_MODEL_MAPPING[encoder_config.__class__].module_class
decoder_module = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING[decoder_config.__class__].module_class
self.encoder = encoder_module(encoder_config, dtype=self.dtype)
self.decoder = decoder_module(decoder_config, dtype=self.dtype)
if (
self.encoder.config.hidden_size != self.decoder.config.hidden_size
and self.decoder.config.cross_attention_hidden_size is None
):
self.enc_to_dec_proj = nn.Dense(
self.decoder.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range),
dtype=self.dtype,
)
else:
self.enc_to_dec_proj = None
def _get_feat_extract_output_lengths(
self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None
):
"""
Computes the output length of the convolutional layers.
"""
add_adapter = self.config.encoder.add_adapter if add_adapter is None else add_adapter
def _conv_out_length(input_length, kernel_size, stride):
return (input_length - kernel_size) // stride + 1
for kernel_size, stride in zip(self.config.encoder.conv_kernel, self.config.encoder.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
if add_adapter:
for _ in range(self.config.encoder.num_adapter_layers):
input_lengths = _conv_out_length(input_lengths, 1, self.config.encoder.adapter_stride)
return input_lengths
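`_get_feat_extract_output_lengths` simply applies the 1-D convolution length formula `(L - kernel) // stride + 1` once per feature-encoder layer. A standalone worked example, using the default `Wav2Vec2Config` kernel sizes and strides:

```python
# Standalone re-run of the length arithmetic above, with the default wav2vec2
# feature-encoder geometry (conv_kernel / conv_stride are the Wav2Vec2Config defaults).
def conv_out_length(input_length, kernel_size, stride):
    return (input_length - kernel_size) // stride + 1

conv_kernel = (10, 3, 3, 3, 3, 2, 2)
conv_stride = (5, 2, 2, 2, 2, 2, 2)

length = 16000  # one second of 16 kHz audio
for k, s in zip(conv_kernel, conv_stride):
    length = conv_out_length(length, k, s)
print(length)  # -> 49 frames, i.e. roughly one frame every 20 ms
```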
def _get_encoder_module(self):
return self.encoder
def _get_projection_module(self):
return self.enc_to_dec_proj
def _get_decoder_module(self):
return self.decoder
def __call__(
self,
inputs,
attention_mask,
decoder_input_ids,
decoder_attention_mask,
decoder_position_ids,
encoder_outputs=None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
freeze_feature_encoder: bool = False,
):
encoder_outputs = self.encoder(
inputs,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
freeze_feature_encoder=freeze_feature_encoder,
)
encoder_hidden_states = encoder_outputs[0]
if self.enc_to_dec_proj is not None:
encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
if attention_mask is not None:
encoder_attention_mask = self.encoder._get_feature_vector_attention_mask(
encoder_hidden_states.shape[1], attention_mask
)
else:
encoder_attention_mask = None
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
position_ids=decoder_position_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
if not return_dict:
return decoder_outputs + encoder_outputs
return FlaxSeq2SeqLMOutput(
logits=decoder_outputs.logits,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_hidden_states,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
@add_start_docstrings(SPEECH_ENCODER_DECODER_START_DOCSTRING)
class FlaxSpeechEncoderDecoderModel(FlaxPreTrainedModel):
r"""
[`FlaxSpeechEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture
with the module (flax.nn.Module) of one of the base model classes of the library as encoder module and another one
as decoder module when created with the :meth*~transformers.FlaxAutoModel.from_pretrained* class method for the
encoder and :meth*~transformers.FlaxAutoModelForCausalLM.from_pretrained* class method for the decoder.
"""
config_class = SpeechEncoderDecoderConfig
base_model_prefix: str = "speech_encoder_decoder"
module_class = FlaxSpeechEncoderDecoderModule
def __init__(
self,
config: SpeechEncoderDecoderConfig,
input_shape: Optional[Tuple] = None,
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
if not _do_init:
raise ValueError(
"`FlaxSpeechEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`."
)
if config.decoder.cross_attention_hidden_size is not None:
if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
raise ValueError(
"If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
" `config.encoder.hidden_size`."
)
config.tie_word_embeddings = False
module = self.module_class(config=config, dtype=dtype, **kwargs)
if input_shape is None:
encoder_input_length = 1024
decoder_input_length = module._get_feat_extract_output_lengths(encoder_input_length)
input_shape = ((1, encoder_input_length), (1, decoder_input_length))
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
encoder_input_shape, decoder_input_shape = input_shape
inputs = jnp.zeros(encoder_input_shape, dtype="f4")
attention_mask = jnp.ones_like(inputs, dtype="i4")
decoder_input_ids = jnp.zeros(decoder_input_shape, dtype="i4")
decoder_attention_mask = jnp.ones_like(decoder_input_ids)
batch_size, sequence_length = inputs.shape
decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape
if not decoder_batch_size == batch_size:
raise ValueError(
f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder"
f" and {decoder_batch_size} for decoder."
)
decoder_position_ids = jnp.broadcast_to(
jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length)
)
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
random_params = self.module.init(
rngs,
inputs,
attention_mask,
decoder_input_ids,
decoder_attention_mask,
decoder_position_ids,
)["params"]
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
return random_params
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
batch_size (`int`):
用于快速自回归解码的批大小。定义了初始化缓存时的批大小。
max_length (`int`):
自回归解码的最大可能长度。定义了初始化缓存时的序列长度。
encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
`encoder_outputs` 包括 (`last_hidden_state`, *可选*: `hidden_states`, *可选*: `attentions`)。
`last_hidden_state` 的形状为 `(batch_size, sequence_length, hidden_size)`,*可选* 的隐藏状态序列,
是编码器最后一层的输出的隐藏状态序列。在解码器的交叉注意力中使用。
"""
decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
decoder_attention_mask = jnp.ones_like(decoder_input_ids)
decoder_position_ids = jnp.broadcast_to(
jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
)
def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
decoder_module = module._get_decoder_module()
return decoder_module(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
position_ids=decoder_position_ids,
**kwargs,
)
init_variables = self.module.init(
jax.random.PRNGKey(0),
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
decoder_position_ids=decoder_position_ids,
encoder_hidden_states=encoder_outputs[0],
init_cache=True,
method=_decoder_forward,
)
return unfreeze(init_variables["cache"])
def _get_feat_extract_output_lengths(
self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None
):
return self.module._get_feat_extract_output_lengths(input_lengths, add_adapter=add_adapter)
@add_start_docstrings(SPEECH_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC)
def encode(
self,
inputs: jnp.ndarray,
attention_mask: Optional[jnp.ndarray] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
freeze_feature_encoder: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
r"""
Returns:
Example:
```
>>> from transformers import FlaxSpeechEncoderDecoderModel
>>> # initialize a wav2vec2-2-bart from pretrained wav2vec2 and bart models. Note that the cross-attention layers will be randomly initialized
>>> model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
... "facebook/wav2vec2-large-lv60", "facebook/bart-large"
... )
>>> inputs = jnp.ones((2, 5000), dtype=jnp.float32)
>>> encoder_outputs = model.encode(inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.return_dict
if attention_mask is None:
attention_mask = jnp.ones_like(inputs, dtype="i4")
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
def _encoder_forward(module, inputs, attention_mask, **kwargs):
encode_module = module._get_encoder_module()
return encode_module(inputs, attention_mask, **kwargs)
outputs = self.module.apply(
{"params": params or self.params},
inputs=jnp.array(inputs, dtype="f4"),
attention_mask=jnp.array(attention_mask, dtype="i4"),
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=not train,
freeze_feature_encoder=freeze_feature_encoder,
rngs=rngs,
method=_encoder_forward,
)
if return_dict:
outputs = FlaxBaseModelOutput(
last_hidden_state=outputs.last_hidden_state,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
return outputs
@add_start_docstrings(SPEECH_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def decode(
self,
decoder_input_ids,
encoder_outputs,
encoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_position_ids: Optional[jnp.ndarray] = None,
past_key_values: dict = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
r"""
@add_start_docstrings_to_model_forward(SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING)
```
# 使用装饰器替换返回文档字符串,指定输出类型为FlaxSeq2SeqLMOutput,配置类为_CONFIG_FOR_DOC
@replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
# 定义实例方法__call__,用于执行模型推理或训练
def __call__(
self,
# input features, a jnp array
inputs: jnp.ndarray,
# optional attention mask indicating which input positions are padding
attention_mask: Optional[jnp.ndarray] = None,
# optional decoder input token ids used to generate the sequence
decoder_input_ids: Optional[jnp.ndarray] = None,
# optional decoder attention mask indicating which decoder positions are padding
decoder_attention_mask: Optional[jnp.ndarray] = None,
# optional decoder position ids giving the position of each decoder token in the sequence
decoder_position_ids: Optional[jnp.ndarray] = None,
# optional flag: whether to return the attention weights
output_attentions: Optional[bool] = None,
# optional flag: whether to return the hidden states
output_hidden_states: Optional[bool] = None,
# optional flag: whether to return the outputs as a dict
return_dict: Optional[bool] = None,
# whether the model is in training mode
train: bool = False,
# whether to freeze the feature encoder
freeze_feature_encoder: bool = False,
# dict of model parameters
params: dict = None,
# PRNG key for dropout
dropout_rng: PRNGKey = None,
):
r"""
Returns:
Examples:
```
>>> from transformers import FlaxSpeechEncoderDecoderModel, AutoTokenizer
>>>
>>> model = FlaxSpeechEncoderDecoderModel.from_pretrained("patrickvonplaten/wav2vec2-2-bart-large")
>>>
>>> tokenizer_output = AutoTokenizer.from_pretrained("facebook/bart-large")
>>> inputs = jnp.ones((2, 5000), dtype=jnp.float32)
>>>
>>> model.config.decoder_start_token_id = model.decoder.config.bos_token_id
>>> model.config.pad_token_id = model.decoder.config.pad_token_id
>>> model.config.eos_token_id = model.decoder.config.eos_token_id
>>> outputs = model.generate(inputs)
```
"""
# Decide whether to use provided output attentions setting or default from model config
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Decide whether to use provided output hidden states setting or default from model config
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Decide whether to use provided return dict setting or default from model config
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Prepare encoder inputs: if attention_mask is not provided, create one with all ones
if attention_mask is None:
attention_mask = jnp.ones_like(inputs, dtype="i4")
# Prepare decoder inputs: decoder_input_ids cannot be None, raise error if so
if decoder_input_ids is None:
raise ValueError(
"`decoder_input_ids` cannot be `None`. For sequence to sequence training, `decoder_position_ids` must"
" be specified as an input argument."
)
# Prepare decoder attention mask: if not provided, create one with all ones
if decoder_attention_mask is None:
decoder_attention_mask = jnp.ones_like(decoder_input_ids)
# Prepare decoder position ids: if not provided, broadcast from a range of sequence lengths
if decoder_position_ids is None:
batch_size, sequence_length = decoder_input_ids.shape
decoder_position_ids = jnp.broadcast_to(
jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
)
# Handle any dropout random number generator if provided
rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
# Apply the Flax module to the inputs and other provided arguments
return self.module.apply(
{"params": params or self.params},
inputs=jnp.array(inputs, dtype="f4"),
attention_mask=jnp.array(attention_mask, dtype="i4"),
decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=not train,
freeze_feature_encoder=freeze_feature_encoder,
rngs=rngs,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
max_length,
attention_mask: Optional[jax.Array] = None,
decoder_attention_mask: Optional[jax.Array] = None,
encoder_outputs=None,
**kwargs,
):
# initializing the cache
# get the batch size and the decoder input sequence length
batch_size, seq_length = decoder_input_ids.shape
# initialize the cache and obtain the past key/value states
past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
# since the decoder uses a causal mask, a static all-ones attention mask of length max_length can be created
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
# if a decoder attention mask is provided, use it to update the static mask
if decoder_attention_mask is not None:
# position ids are the cumulative sum of the attention mask minus one
decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
else:
# otherwise broadcast a simple range of position ids
decoder_position_ids = jnp.broadcast_to(
jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)
)
# return the generation inputs: cache, encoder outputs, encoder attention mask, extended decoder attention mask and decoder position ids
return {
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"encoder_attention_mask": attention_mask,
"decoder_attention_mask": extended_attention_mask,
"decoder_position_ids": decoder_position_ids,
}
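The position ids derived above advance only on non-padded decoder steps because they come from `cumsum(-1) - 1` over the attention mask. A tiny check of that arithmetic, in plain numpy for illustration (the model itself does the same with `jax.numpy`):

```python
# Tiny check of the position-id arithmetic used in prepare_inputs_for_generation.
import numpy as np

decoder_attention_mask = np.array([[1, 1, 1, 0, 0]])
decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
print(decoder_position_ids)  # [[0 1 2 2 2]] -- padded steps keep the last valid position
```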
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# update the inputs for the next generation step
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
return model_kwargs
@classmethod
def from_encoder_decoder_pretrained(
cls,
encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
*model_args,
**kwargs,
):
# Load the model from pretrained encoder and decoder checkpoints
pass  # implementation omitted here; the body carries no code and only illustrates this classmethod's loading role
.\models\speech_encoder_decoder\modeling_speech_encoder_decoder.py
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...configuration_utils import PretrainedConfig
from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from ..auto.configuration_auto import AutoConfig
from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM
from .configuration_speech_encoder_decoder import SpeechEncoderDecoderConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SpeechEncoderDecoderConfig"
SPEECH_ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a speech-sequence-to-text-sequence model with any pretrained speech
autoencoding model as the encoder and any pretrained text autoregressive model as the decoder. The encoder is
loaded via [`~AutoModel.from_pretrained`] function and the decoder is loaded via
[`~AutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically added to the decoder
and should be fine-tuned on a downstream generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn, Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
Additionally, in [Large-Scale Self- and Semi-Supervised Learning for Speech
Translation](https://arxiv.org/abs/2104.06678) it is shown how leveraging large pretrained speech models for speech
translation yields a significant performance improvement.
After such a Speech-Encoder-Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other
model (see the examples for more information).
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SpeechEncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
"""
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
if decoder_start_token_id is None:
raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
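The helper prepends `decoder_start_token_id`, drops the last token, and replaces any `-100` label-ignore markers with `pad_token_id`. A small worked example with made-up token ids:

```python
# Worked example of the shift_tokens_right helper defined above
# (pad_token_id=1, decoder_start_token_id=2, -100 marks ignored label positions).
import torch

labels = torch.tensor([[5, 6, 7, -100, -100]])
decoder_input_ids = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
print(decoder_input_ids)  # tensor([[2, 5, 6, 7, 1]])
```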
@add_start_docstrings(SPEECH_ENCODER_DECODER_START_DOCSTRING)
class SpeechEncoderDecoderModel(PreTrainedModel):
r"""
[`SpeechEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with
one of the base model classes of the library as encoder and another one as decoder when created with the
:meth*~transformers.AutoModel.from_pretrained* class method for the encoder and
:meth*~transformers.AutoModelForCausalLM.from_pretrained* class method for the decoder.
"""
config_class = SpeechEncoderDecoderConfig
base_model_prefix = "speech_encoder_decoder"
main_input_name = "inputs"
supports_gradient_checkpointing = True
def __init__(
self,
config: Optional[PretrainedConfig] = None,
encoder: Optional[PreTrainedModel] = None,
decoder: Optional[PreTrainedModel] = None,
):
super().__init__(config)
self.encoder = encoder
self.decoder = decoder
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def get_output_embeddings(self):
return self.decoder.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
return self.decoder.set_output_embeddings(new_embeddings)
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder of the speech encoder so
that its parameters will not be updated during training.
"""
self.encoder.freeze_feature_encoder()
@classmethod
def from_pretrained(cls, *args, **kwargs):
if kwargs.get("_fast_init", False):
logger.warning(
"Fast initialization is currently not supported for SpeechEncoderDecoderModel. "
"Falling back to slow initialization..."
)
kwargs["_fast_init"] = False
return super().from_pretrained(*args, **kwargs)
@add_start_docstrings_to_model_forward(SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
inputs: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
input_values: Optional[torch.FloatTensor] = None,
input_features: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
):
decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None
input_dict = {
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"decoder_input_ids": decoder_inputs["input_ids"],
"encoder_outputs": encoder_outputs,
"past_key_values": decoder_inputs["past_key_values"],
"use_cache": use_cache,
}
return input_dict
def resize_token_embeddings(self, *args, **kwargs):
raise NotImplementedError(
"Resizing the embedding layers via the SpeechEncoderDecoderModel directly is not supported. Please use the"
" respective methods of the wrapped decoder object (model.decoder.resize_token_embeddings(...))"
)
def _reorder_cache(self, past_key_values, beam_idx):
return self.decoder._reorder_cache(past_key_values, beam_idx)
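The walkthrough elides `from_encoder_decoder_pretrained`, so here is a short, hedged usage sketch of how the class is typically composed. The checkpoint names are only illustrative; any speech encoder / causal-LM decoder pair supported by `AutoModel` / `AutoModelForCausalLM` works:

```python
# Hedged usage sketch: compose a SpeechEncoderDecoderModel from two pretrained checkpoints.
from transformers import SpeechEncoderDecoderModel

model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
    "facebook/wav2vec2-base-960h", "bert-base-uncased"
)
# The cross-attention layers in the decoder are freshly initialized, so the model
# should be fine-tuned before use; the convolutional feature encoder can be frozen.
model.freeze_feature_encoder()
```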
.\models\speech_encoder_decoder\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available
_import_structure = {"configuration_speech_encoder_decoder": ["SpeechEncoderDecoderConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_speech_encoder_decoder"] = ["SpeechEncoderDecoderModel"]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_speech_encoder_decoder"] = ["FlaxSpeechEncoderDecoderModel"]
if TYPE_CHECKING:
from .configuration_speech_encoder_decoder import SpeechEncoderDecoderConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_speech_encoder_decoder import SpeechEncoderDecoderModel
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_speech_encoder_decoder import FlaxSpeechEncoderDecoderModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\speech_to_text\configuration_speech_to_text.py
class Speech2TextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Speech2TextModel`]. It is used to instantiate a
Speech2Text model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Speech2Text
[facebook/s2t-small-librispeech-asr](https://huggingface.co/facebook/s2t-small-librispeech-asr) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import Speech2TextConfig, Speech2TextModel
>>> # Initializing a Speech2Text s2t_transformer_s style configuration
>>> configuration = Speech2TextConfig()
>>> # Initializing a model (with random weights) from the s2t_transformer_s style configuration
>>> model = Speech2TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "speech_to_text"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=10000,
encoder_layers=12,
encoder_ffn_dim=2048,
encoder_attention_heads=4,
decoder_layers=6,
decoder_ffn_dim=2048,
decoder_attention_heads=4,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
use_cache=True,
is_encoder_decoder=True,
activation_function="relu",
d_model=256,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=2,
scale_embedding=True,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
max_source_positions=6000,
max_target_positions=1024,
num_conv_layers=2,
conv_kernel_sizes=(5, 5),
conv_channels=1024,
input_feat_per_channel=80,
input_channels=1,
**kwargs,
):
self.vocab_size = vocab_size
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
self.max_source_positions = max_source_positions
self.max_target_positions = max_target_positions
self.num_conv_layers = num_conv_layers
self.conv_kernel_sizes = list(conv_kernel_sizes)
self.conv_channels = conv_channels
self.input_feat_per_channel = input_feat_per_channel
self.input_channels = input_channels
if len(self.conv_kernel_sizes) != self.num_conv_layers:
raise ValueError(
"Configuration for convolutional module is incorrect. "
"It is required that `len(config.conv_kernel_sizes)` == `config.num_conv_layers` "
f"but is `len(config.conv_kernel_sizes) = {len(self.conv_kernel_sizes)}`, "
f"`config.num_conv_layers = {self.num_conv_layers}`."
)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
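The constructor above requires the convolutional front-end to be consistently specified: `len(conv_kernel_sizes)` must equal `num_conv_layers`, otherwise a `ValueError` is raised. A small sketch of that check:

```python
# Small sketch of the constraint enforced by Speech2TextConfig.__init__.
from transformers import Speech2TextConfig

ok = Speech2TextConfig(num_conv_layers=2, conv_kernel_sizes=(5, 5))  # consistent: fine
try:
    Speech2TextConfig(num_conv_layers=3, conv_kernel_sizes=(5, 5))   # mismatched: raises
except ValueError as e:
    print(e)
```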
.\models\speech_to_text\convert_s2t_fairseq_to_tfms.py
import argparse
import torch
from torch import nn
from transformers import Speech2TextConfig, Speech2TextForConditionalGeneration
def remove_ignore_keys_(state_dict):
ignore_keys = [
"encoder.version",
"decoder.version",
"model.encoder.version",
"model.decoder.version",
"decoder.output_projection.weight",
"_float_tensor",
"encoder.embed_positions._float_tensor",
"decoder.embed_positions._float_tensor",
]
for k in ignore_keys:
state_dict.pop(k, None)
def rename_keys(s_dict):
keys = list(s_dict.keys())
for key in keys:
if "transformer_layers" in key:
s_dict[key.replace("transformer_layers", "layers")] = s_dict.pop(key)
elif "subsample" in key:
s_dict[key.replace("subsample", "conv")] = s_dict.pop(key)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def convert_fairseq_s2t_checkpoint_to_tfms(checkpoint_path, pytorch_dump_folder_path):
m2m_100 = torch.load(checkpoint_path, map_location="cpu")
args = m2m_100["args"]
state_dict = m2m_100["model"]
lm_head_weights = state_dict["decoder.output_projection.weight"]
remove_ignore_keys_(state_dict)
rename_keys(state_dict)
vocab_size = state_dict["decoder.embed_tokens.weight"].shape[0]
tie_embeds = args.share_decoder_input_output_embed
conv_kernel_sizes = [int(i) for i in args.conv_kernel_sizes.split(",")]
config = Speech2TextConfig(
vocab_size=vocab_size,
max_source_positions=args.max_source_positions,
max_target_positions=args.max_target_positions,
encoder_layers=args.encoder_layers,
decoder_layers=args.decoder_layers,
encoder_attention_heads=args.encoder_attention_heads,
decoder_attention_heads=args.decoder_attention_heads,
encoder_ffn_dim=args.encoder_ffn_embed_dim,
decoder_ffn_dim=args.decoder_ffn_embed_dim,
d_model=args.encoder_embed_dim,
dropout=args.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_function="relu",
num_conv_layers=len(conv_kernel_sizes),
conv_channels=args.conv_channels,
conv_kernel_sizes=conv_kernel_sizes,
input_feat_per_channel=args.input_feat_per_channel,
input_channels=args.input_channels,
tie_word_embeddings=tie_embeds,
num_beams=5,
max_length=200,
use_cache=True,
decoder_start_token_id=2,
early_stopping=True,
)
model = Speech2TextForConditionalGeneration(config)
missing, unexpected = model.model.load_state_dict(state_dict, strict=False)
if len(missing) > 0 and not set(missing) <= {
"encoder.embed_positions.weights",
"decoder.embed_positions.weights",
}:
raise ValueError(
"Only `encoder.embed_positions.weights` and `decoder.embed_positions.weights` are allowed to be missing,"
f" but all the following weights are missing {missing}"
)
if tie_embeds:
model.lm_head = make_linear_from_emb(model.model.decoder.embed_tokens)
else:
model.lm_head.weight.data = lm_head_weights
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--fairseq_path", type=str, help="Path to the fairseq model (.pt) file.")
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
args = parser.parse_args()
convert_fairseq_s2t_checkpoint_to_tfms(args.fairseq_path, args.pytorch_dump_folder_path)
.\models\speech_to_text\feature_extraction_speech_to_text.py
class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
def __init__(
self,
feature_size=80,
sampling_rate=16000,
num_mel_bins=80,
padding_value=0.0,
do_ceptral_normalize=True,
normalize_means=True,
normalize_vars=True,
**kwargs,
):
super().__init__(**kwargs)
self.feature_size = feature_size
self.sampling_rate = sampling_rate
self.num_mel_bins = num_mel_bins
self.padding_value = padding_value
self.do_ceptral_normalize = do_ceptral_normalize
self.normalize_means = normalize_means
self.normalize_vars = normalize_vars
def _extract_mel_features(self, signal: np.ndarray) -> np.ndarray:
"""
Extracts Mel-filter bank features from raw speech signal.
Args:
signal (`np.ndarray`): Raw speech signal.
Returns:
`np.ndarray`: Extracted Mel-filter bank features.
"""
return mel_filter_bank(
signal,
self.sampling_rate,
self.num_mel_bins,
)
def _apply_cmvn(self, features: np.ndarray) -> np.ndarray:
"""
Applies utterance-level cepstral mean and variance normalization (CMVN) to the extracted features.
Args:
features (`np.ndarray`): Extracted features.
Returns:
`np.ndarray`: Features after CMVN.
"""
means = np.mean(features, axis=1, keepdims=True)
variances = np.var(features, axis=1, keepdims=True)
if self.normalize_means:
features -= means
if self.normalize_vars:
features /= np.sqrt(variances + 1e-5)
return features
def _extract_features(self, signal: np.ndarray) -> BatchFeature:
"""
Extracts features from the raw speech signal.
Args:
signal (`np.ndarray`): Raw speech signal.
Returns:
`BatchFeature`: Batch of extracted features.
"""
spectrogram_feats = spectrogram(
signal,
self.sampling_rate,
window_function,
)
mel_feats = self._extract_mel_features(spectrogram_feats)
if self.do_ceptral_normalize:
mel_feats = self._apply_cmvn(mel_feats)
return BatchFeature(input_features=mel_feats, attention_mask=np.ones_like(mel_feats, dtype=np.float32))
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.num_mel_bins = num_mel_bins
self.do_ceptral_normalize = do_ceptral_normalize
self.normalize_means = normalize_means
self.normalize_vars = normalize_vars
self.return_attention_mask = True
if not is_speech_available():
mel_filters = mel_filter_bank(
num_frequency_bins=256,
num_mel_filters=self.num_mel_bins,
min_frequency=20,
max_frequency=sampling_rate // 2,
sampling_rate=sampling_rate,
norm=None,
mel_scale="kaldi",
triangularize_in_mel_space=True,
)
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
self.window = window_function(400, "povey", periodic=False)
def _extract_fbank_features(
self,
waveform: np.ndarray,
) -> np.ndarray:
"""
Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
and hence the waveform should not be normalized before feature extraction.
"""
waveform = waveform * (2**15)
if is_speech_available():
waveform = torch.from_numpy(waveform).unsqueeze(0)
features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate)
features = features.numpy()
else:
waveform = np.squeeze(waveform)
features = spectrogram(
waveform,
self.window,
frame_length=400,
hop_length=160,
fft_length=512,
power=2.0,
center=False,
preemphasis=0.97,
mel_filters=self.mel_filters,
log_mel="log",
mel_floor=1.192092955078125e-07,
remove_dc_offset=True,
).T
return features
@staticmethod
def utterance_cmvn(
x: np.ndarray,
input_length: int,
normalize_means: Optional[bool] = True,
normalize_vars: Optional[bool] = True,
padding_value: float = 0.0,
) -> np.ndarray:
if normalize_means:
mean = x[:input_length].mean(axis=0)
x = np.subtract(x, mean)
if normalize_vars:
std = x[:input_length].std(axis=0)
x = np.divide(x, std)
if input_length < x.shape[0]:
x[input_length:] = padding_value
x = x.astype(np.float32)
return x
def normalize(
self, input_features: List[np.ndarray], attention_mask: Optional[np.ndarray] = None
) -> List[np.ndarray]:
lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
return [
self.utterance_cmvn(x, n, self.normalize_means, self.normalize_vars, self.padding_value)
for x, n in zip(input_features, lengths)
]
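Utterance-level CMVN subtracts the per-utterance mean and divides by the per-utterance standard deviation computed over the valid frames only, then resets the padded tail to `padding_value`. A plain-numpy re-run of the same arithmetic on a tiny, made-up feature matrix:

```python
# Numpy re-run of the utterance_cmvn arithmetic (3 valid frames out of 4, 2 mel bins).
import numpy as np

x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [0.0, 0.0]])  # last row is padding
input_length = 3  # number of valid (non-padded) frames

mean = x[:input_length].mean(axis=0)
x = x - mean
std = x[:input_length].std(axis=0)
x = x / std
x[input_length:] = 0.0  # padding_value

print(x[:input_length].mean(axis=0))  # ~0 per mel bin after normalization
print(x[input_length])                # padding row reset to 0.0
```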
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
**kwargs,
.\models\speech_to_text\modeling_speech_to_text.py
""" PyTorch Speech2Text model."""
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_speech_to_text import Speech2TextConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Speech2TextConfig"
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/s2t-small-librispeech-asr",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
class Conv1dSubsampler(nn.Module):
"""
Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
via gated linear units (https://arxiv.org/abs/1911.08460)
"""
def __init__(self, config):
super(Conv1dSubsampler, self).__init__()
self.config = config
self.num_layers = config.num_conv_layers
self.in_channels = config.input_feat_per_channel * config.input_channels
self.mid_channels = config.conv_channels
self.out_channels = config.d_model
self.kernel_sizes = config.conv_kernel_sizes
self.conv_layers = nn.ModuleList(
nn.Conv1d(
self.in_channels if i == 0 else self.mid_channels // 2,
self.mid_channels if i < self.num_layers - 1 else self.out_channels * 2,
kernel_size=k,
stride=2,
padding=k // 2,
)
for i, k in enumerate(self.kernel_sizes)
)
def forward(self, input_features):
hidden_states = input_features.transpose(1, 2).contiguous()
for conv in self.conv_layers:
hidden_states = conv(hidden_states)
hidden_states = nn.functional.glu(hidden_states, dim=1)
hidden_states = hidden_states.transpose(1, 2).contiguous()
return hidden_states
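Each `Conv1d` in the subsampler has stride 2 and doubles the channel count so that the GLU that follows halves it back, ending at `d_model`. A shape walk-through with the default `Speech2TextConfig` (80 mel bins, two stride-2 conv layers, `d_model=256`); the import path points at the internal module walked through here:

```python
# Shape walk-through of Conv1dSubsampler with the default Speech2TextConfig.
import torch
from transformers import Speech2TextConfig
from transformers.models.speech_to_text.modeling_speech_to_text import Conv1dSubsampler

config = Speech2TextConfig()
subsampler = Conv1dSubsampler(config)

input_features = torch.randn(1, 100, 80)  # (batch, frames, mel bins)
hidden_states = subsampler(input_features)
print(hidden_states.shape)  # torch.Size([1, 25, 256]); each stride-2 conv halves the frame count
```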
class Speech2TextSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len = input_ids.size()
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
max_pos = self.padding_idx + 1 + seq_len
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
def create_position_ids_from_input_ids(
self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor representing input tensor with token IDs
padding_idx: int, index of padding token in input_ids
past_key_values_length: int, length of past key values to be considered
Returns:
torch.Tensor representing the tensor with position indices
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
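# Illustrative sketch (not part of the original source): position ids are derived from the
# token ids themselves -- padding positions keep padding_idx, real tokens count up from
# padding_idx + 1 -- and then simply index the sinusoidal table.
_pos_emb = Speech2TextSinusoidalPositionalEmbedding(num_positions=16, embedding_dim=8, padding_idx=1)
_token_ids = torch.tensor([[5, 7, 9, 1, 1]])  # made-up ids, 1 is the pad token
_position_ids = _pos_emb.create_position_ids_from_input_ids(_token_ids, padding_idx=1)
# _position_ids -> tensor([[2, 3, 4, 1, 1]])
_position_vectors = _pos_emb(_token_ids)  # shape (1, 5, 8)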
class Speech2TextAttention(nn.Module):
"""来自“Attention Is All You Need”论文的多头注意力机制"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[Speech2TextConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim必须能够被num_heads整除 (当前 `embed_dim`: {self.embed_dim}"
f" 和 `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
"""将张量形状重新排列为(bsz, num_heads, seq_len, head_dim),并转置前两个维度"""
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
"""前向传播函数,实现注意力机制的计算"""
pass
class Speech2TextEncoderLayer(nn.Module):
def __init__(self, config: Speech2TextConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
layer_head_mask: torch.Tensor,
output_attentions: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
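# Note (not part of the original source): the clamp above is a float16 overflow guard.
# torch.finfo gives the representable range that activations are pinned to when an inf/nan
# appears after the residual additions; well-behaved batches skip the clamp entirely.
_fp16_max = torch.finfo(torch.float16).max  # 65504.0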
class Speech2TextDecoderLayer(nn.Module):
def __init__(self, config: Speech2TextConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        pass
class Speech2TextPreTrainedModel(PreTrainedModel):
config_class = Speech2TextConfig
base_model_prefix = "model"
main_input_name = "input_features"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
"""
Computes the output length of the convolutional layers
"""
for i in range(self.config.num_conv_layers):
input_lengths = (input_lengths - 1) // 2 + 1
return input_lengths
def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
bsz = attention_mask.size()[0]
attention_mask = torch.zeros(
(bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long()
return attention_mask
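# Illustrative sketch (not part of the original source): every conv layer has stride 2, so the
# valid length shrinks as (length - 1) // 2 + 1 per layer; the helper above then marks the last
# valid subsampled frame and back-fills the mask with a reversed cumulative sum.
_raw_lengths = torch.tensor([100, 37])
_sub_lengths = _raw_lengths
for _ in range(2):  # assuming the default two conv layers
    _sub_lengths = (_sub_lengths - 1) // 2 + 1
# _sub_lengths -> tensor([25, 10]); the rebuilt (2, 25) mask then has 25 and 10 leading ones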
SPEECH_TO_TEXT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`Speech2TextConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SPEECH_TO_TEXT_INPUTS_DOCSTRING = r"""
"""
class Speech2TextEncoder(Speech2TextPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`Speech2TextEncoderLayer`].
Args:
config: Speech2TextConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: Speech2TextConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_source_positions
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.conv = Conv1dSubsampler(config)
self.embed_positions = Speech2TextSinusoidalPositionalEmbedding(
self.max_source_positions,
embed_dim,
self.padding_idx,
)
self.layers = nn.ModuleList([Speech2TextEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
input_features,
attention_mask=None,
head_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
pass
class Speech2TextDecoder(Speech2TextPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Speech2TextDecoderLayer`]
Args:
config: Speech2TextConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: Speech2TextConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_target_positions
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = Speech2TextSinusoidalPositionalEmbedding(
self.max_target_positions,
config.d_model,
self.padding_idx,
)
self.layers = nn.ModuleList([Speech2TextDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
        return_dict=None,
    ):
        pass
@add_start_docstrings(
"The bare Speech2Text Model outputting raw hidden-states without any specific head on top.",
SPEECH_TO_TEXT_START_DOCSTRING,
)
class Speech2TextModel(Speech2TextPreTrainedModel):
def __init__(self, config: Speech2TextConfig):
super().__init__(config)
self.encoder = Speech2TextEncoder(config)
self.decoder = Speech2TextDecoder(config)
self.post_init()
def get_input_embeddings(self):
return self.decoder.embed_tokens
def set_input_embeddings(self, value):
self.decoder.embed_tokens = value
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_features: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"The Speech2Text Model with a language modeling head. Can be used for summarization.",
SPEECH_TO_TEXT_START_DOCSTRING,
)
class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel):
base_model_prefix = "model"
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: Speech2TextConfig):
super().__init__(config)
self.model = Speech2TextModel(config)
self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False)
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_features: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ):
        pass
    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        if past_key_values is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]
return {
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
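# Illustrative sketch (not part of the original source): during beam search the cache is laid
# out with one entry per beam along dim 0, so reordering reduces to an index_select with the
# surviving beam indices. Shapes and indices below are made up.
_beam_idx = torch.tensor([2, 0, 0, 1])  # which old beam each new beam continues from
_layer_past = tuple(torch.randn(4, 8, 5, 64) for _ in range(4))  # (self k, self v, cross k, cross v)
_reordered = Speech2TextForConditionalGeneration._reorder_cache((_layer_past,), _beam_idx)
# _reordered[0][0][0] equals _layer_past[0][2]: new beam 0 inherits old beam 2's cached keys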
.\models\speech_to_text\modeling_tf_speech_to_text.py
""" TensorFlow Speech2Text model."""
from __future__ import annotations
import random
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation, glu
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPastAndCrossAttentions,
TFSeq2SeqLMOutput,
TFSeq2SeqModelOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
TFSharedEmbeddings,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_speech_to_text import Speech2TextConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Speech2TextConfig"
_CHECKPOINT_FOR_DOC = "facebook/s2t-small-librispeech-asr"
TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/s2t-small-librispeech-asr",
]
LARGE_NEGATIVE = -1e8
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
pad_token_id = tf.cast(pad_token_id, input_ids.dtype)
decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
start_tokens = tf.fill(
(shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype)
)
shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
shifted_input_ids = tf.where(
shifted_input_ids == -100,
tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)),
shifted_input_ids,
)
    # Verify that the shifted ids contain only non-negative values, and make sure the assertion
    # op actually runs by wrapping the result in an identity no-op.
    assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype))
    with tf.control_dependencies([assert_gte0]):
        shifted_input_ids = tf.identity(shifted_input_ids)
    return shifted_input_ids
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
"""
Make causal mask used for bi-directional self-attention.
"""
bsz = input_ids_shape[0]
tgt_len = input_ids_shape[1]
mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
mask_cond = tf.range(shape_list(mask)[-1])
mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)
if past_key_values_length > 0:
mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
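# Illustrative sketch (not part of the original source): _expand_mask turns a (bsz, src_len)
# padding mask of 1s and 0s into the additive (bsz, 1, tgt_len, src_len) form used by the
# attention layers (kept positions become 0.0, masked ones LARGE_NEGATIVE), while
# _make_causal_mask builds the lower-triangular decoder mask in the same additive form.
_pad_mask = tf.constant([[1.0, 1.0, 1.0, 0.0]])
_additive_mask = _expand_mask(_pad_mask)   # shape (1, 1, 4, 4); last column is -1e8
_causal_mask = _make_causal_mask([1, 3])   # shape (1, 1, 3, 3); upper triangle is -1e8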
class TFConv1dSubsampler(keras.layers.Layer):
"""
Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
via gated linear units (https://arxiv.org/abs/1911.08460)
"""
def __init__(self, config: Speech2TextConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.num_layers = config.num_conv_layers
self.in_channels = config.input_feat_per_channel * config.input_channels
self.mid_channels = config.conv_channels
self.out_channels = config.d_model
self.kernel_sizes = config.conv_kernel_sizes
self.conv_layers = [
keras.layers.Conv1D(
filters=self.mid_channels if i < self.num_layers - 1 else self.out_channels * 2,
kernel_size=k,
strides=2,
name=f"conv_layers.{i}",
)
for i, k in enumerate(self.kernel_sizes)
]
def call(self, input_features: tf.Tensor) -> tf.Tensor:
hidden_states = tf.cast(input_features, tf.float32)
for i, conv in enumerate(self.conv_layers):
pad_len = self.kernel_sizes[i] // 2
hidden_shapes = shape_list(hidden_states)
hidden_states = tf.concat(
(
tf.zeros((hidden_shapes[0], pad_len, hidden_shapes[2])),
hidden_states,
tf.zeros((hidden_shapes[0], pad_len, hidden_shapes[2])),
),
axis=1,
)
hidden_states = conv(hidden_states)
hidden_states = glu(hidden_states, axis=2)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv_layers", None) is not None:
for i, layer in enumerate(self.conv_layers):
with tf.name_scope(layer.name):
layer.build([None, None, self.in_channels] if i == 0 else [None, None, self.mid_channels // 2])
class TFSpeech2TextSinusoidalPositionalEmbedding(keras.layers.Layer):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None, **kwargs):
super().__init__(**kwargs)
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.embedding_weights = self._get_embedding(num_positions + self.offset, embedding_dim, padding_idx)
@staticmethod
def _get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None) -> tf.Tensor:
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = tf.math.log(10000.0) / (half_dim - 1)
emb = tf.math.exp(tf.range(half_dim, dtype=tf.float32) * -emb)
emb = tf.expand_dims(tf.range(num_embeddings, dtype=tf.float32), axis=1) * tf.expand_dims(emb, axis=0)
emb = tf.reshape(tf.concat([tf.math.sin(emb), tf.math.cos(emb)], axis=1), shape=[num_embeddings, -1])
if embedding_dim % 2 == 1:
emb = tf.concat([emb, tf.zeros((num_embeddings, 1))], axis=1)
if padding_idx is not None:
emb = tf.concat([emb[:padding_idx, :], tf.zeros((1, tf.shape(emb)[1])), emb[padding_idx + 1 :, :]], axis=0)
return emb
def call(self, input_ids: tf.Tensor, past_key_values_length: int = 0) -> tf.Tensor:
bsz, seq_len = shape_list(input_ids)
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
embeddings = self._get_embedding(
self.padding_idx + 1 + seq_len + self.offset + past_key_values_length, self.embedding_dim, self.padding_idx
)
return tf.reshape(tf.gather(embeddings, tf.reshape(position_ids, (-1,)), axis=0), (bsz, seq_len, -1))
@staticmethod
def create_position_ids_from_input_ids(
input_ids: tf.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
) -> tf.Tensor:
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
        Args:
            input_ids: tf.Tensor, input tensor with token ids.
        Returns:
            tf.Tensor with the computed position indices.
        """
mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=tf.int32)
incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
return tf.cast(incremental_indices, dtype=tf.int64) + padding_idx
class TFSpeech2TextAttention(keras.layers.Layer):
"""多头注意力机制,基于 'Attention Is All You Need'"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training: Optional[bool] = False,
    ):
        pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFSpeech2TextEncoderLayer(keras.layers.Layer):
def __init__(self, config: Speech2TextConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFSpeech2TextAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False
):
"""
Args:
hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size `(encoder_attention_heads,)`
training (`bool`): whether the layer is being used in training mode
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, self_attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
training=training,
)
tf.debugging.assert_equal(
shape_list(hidden_states),
shape_list(residual),
message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = self.activation_dropout(hidden_states, training=training)
hidden_states = self.fc2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFSpeech2TextDecoderLayer(keras.layers.Layer):
def __init__(self, config: Speech2TextConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFSpeech2TextAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
name="self_attn",
is_decoder=True,
)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFSpeech2TextAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
name="encoder_attn",
is_decoder=True,
)
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states,
attention_mask: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
cross_attn_layer_head_mask: tf.Tensor | None = None,
past_key_value: Tuple[tf.Tensor] | None = None,
        training=False,
    ):
        pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFSpeech2TextPreTrainedModel(TFPreTrainedModel):
config_class = Speech2TextConfig
base_model_prefix = "model"
main_input_name = "input_features"
_keys_to_ignore_on_load_unexpected = [r"encoder.embed_positions.weights"]
def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
"""
Computes the output length of the convolutional layers
"""
for _ in range(self.config.num_conv_layers):
input_lengths = (input_lengths - 1) // 2 + 1
return input_lengths
@property
def input_signature(self):
return {
"input_features": tf.TensorSpec(
(None, None, self.config.input_feat_per_channel * self.config.input_channels),
tf.float32,
name="input_features",
),
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
"decoder_input_ids": tf.TensorSpec((None, None), tf.int32, name="decoder_input_ids"),
"decoder_attention_mask": tf.TensorSpec((None, None), tf.int32, name="decoder_attention_mask"),
}
SPEECH_TO_TEXT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
"""
</Tip>
"""
SPEECH_TO_TEXT_INPUTS_DOCSTRING = r"""
"""
@keras_serializable
class TFSpeech2TextEncoder(keras.layers.Layer):
config_class = Speech2TextConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`TFSpeech2TextEncoderLayer`].
Args:
config: Speech2TextConfig
"""
def __init__(self, config: Speech2TextConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
# Dropout layer with the configured dropout probability
self.dropout = keras.layers.Dropout(config.dropout)
# LayerDrop probability taken from encoder_layerdrop
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_source_positions
self.embed_scale = tf.math.sqrt(float(embed_dim)) if config.scale_embedding else 1.0
# Convolutional subsampler that downsamples the input features
self.conv = TFConv1dSubsampler(config, name="conv")
# Sinusoidal positional embeddings
self.embed_positions = TFSpeech2TextSinusoidalPositionalEmbedding(
num_positions=config.max_source_positions,
embedding_dim=embed_dim,
padding_idx=self.padding_idx,
name="embed_positions",
)
# Stack of Transformer encoder layers
self.layers = [TFSpeech2TextEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
# Final layer normalization applied to the encoder output
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
"""
Computes the output length of the convolutional layers
"""
# Apply the stride-2 length reduction once per conv layer
for _ in range(self.config.num_conv_layers):
input_lengths = (input_lengths - 1) // 2 + 1
return input_lengths
def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
# If the attention mask has more than 2 dims, reduce it to a 2D mask first
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
# Compute the subsampled output lengths
subsampled_lengths = self._get_feat_extract_output_lengths(tf.math.reduce_sum(attention_mask, -1))
bsz = shape_list(attention_mask)[0]
# Build a new mask by marking the last valid subsampled position of each sequence with 1
indices = tf.concat(
(
tf.expand_dims(tf.range(bsz, dtype=attention_mask.dtype), -1),
tf.expand_dims(subsampled_lengths - 1, -1),
),
axis=-1,
)
attention_mask = tf.scatter_nd(indices=indices, updates=tf.ones(bsz), shape=[bsz, feature_vector_length])
# A reversed cumulative sum turns that single 1 into a prefix of 1s
attention_mask = tf.cast(tf.reverse(tf.math.cumsum(tf.reverse(attention_mask, [-1]), -1), [-1]), tf.int64)
return attention_mask
@unpack_inputs
def call(
self,
input_features=None,
attention_mask=None,
head_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
        training=False,
    ):
        pass
    # The build method below constructs the encoder's sub-layers and their parameters
def build(self, input_shape=None):
# Return early if the layer has already been built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the convolutional subsampler if present
if getattr(self, "conv", None) is not None:
# Use the sub-layer's name as the TensorFlow name scope
with tf.name_scope(self.conv.name):
self.conv.build(None)
# Build the positional embedding layer if present
if getattr(self, "embed_positions", None) is not None:
# Use the sub-layer's name as the TensorFlow name scope
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
# Build the final layer norm if present
if getattr(self, "layer_norm", None) is not None:
# Use the sub-layer's name as the TensorFlow name scope
with tf.name_scope(self.layer_norm.name):
# The layer norm expects inputs of shape [None, None, self.config.d_model]
self.layer_norm.build([None, None, self.config.d_model])
# Build each encoder layer in turn
if getattr(self, "layers", None) is not None:
for layer in self.layers:
# Use each layer's name as its name scope
with tf.name_scope(layer.name):
# Build the layer without constraining its input shape
layer.build(None)
# The keras_serializable decorator makes the layer serializable together with its config
@keras_serializable
class TFSpeech2TextDecoder(keras.layers.Layer):
# Associate the layer with Speech2TextConfig
config_class = Speech2TextConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFSpeech2TextDecoderLayer`]
Args:
config: Speech2TextConfig
"""
# Constructor taking a config plus extra keyword arguments
def __init__(self, config: Speech2TextConfig, **kwargs):
# Initialize the parent keras layer
super().__init__(**kwargs)
# Keep a reference to the config
self.config = config
# LayerDrop probability taken from decoder_layerdrop
self.layerdrop = config.decoder_layerdrop
# Index of the padding token
self.padding_idx = config.pad_token_id
# Maximum number of target positions
self.max_target_positions = config.max_target_positions
# Scale embeddings by sqrt(d_model) when configured, otherwise leave them unscaled
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
# Token embedding table, shared with the LM head
self.embed_tokens = TFSharedEmbeddings(config.vocab_size, config.d_model, name="embed_tokens")
# Sinusoidal positional embeddings
self.embed_positions = TFSpeech2TextSinusoidalPositionalEmbedding(
num_positions=config.max_target_positions,
embedding_dim=config.d_model,
padding_idx=self.padding_idx,
name="embed_positions",
)
# Stack of config.decoder_layers decoder layers
self.layers = [TFSpeech2TextDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
# Final layer normalization
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
# Dropout applied to the decoder inputs
self.dropout = keras.layers.Dropout(config.dropout)
# Accessor for the token embeddings
def get_embed_tokens(self):
return self.embed_tokens
# Setter for the token embeddings
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
# The call method below runs the decoder forward pass; unpack_inputs normalizes its arguments
@unpack_inputs
def call(
self,
input_ids=None,
inputs_embeds=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
        # The decoder forward-pass logic is omitted in this listing; see the full source for details.
        pass
    # The build method constructs each sub-layer on first use and returns early if already built
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the token embeddings if present
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
# Build the positional embeddings if present
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
# Build the final layer norm if present
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
# Build each decoder layer in turn
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
# The keras_serializable decorator makes the layer serializable together with its config
@keras_serializable
class TFSpeech2TextMainLayer(keras.layers.Layer):
# Associate the layer with Speech2TextConfig
config_class = Speech2TextConfig
# Constructor taking a config plus extra keyword arguments
def __init__(self, config: Speech2TextConfig, **kwargs):
# Initialize the parent keras layer
super().__init__(**kwargs)
# Keep a reference to the config
self.config = config
# Speech encoder
self.encoder = TFSpeech2TextEncoder(config, name="encoder")
# Text decoder
self.decoder = TFSpeech2TextDecoder(config, name="decoder")
# The input embeddings live on the decoder
def get_input_embeddings(self):
return self.decoder.embed_tokens
# Replace the decoder's token embeddings
def set_input_embeddings(self, new_embeddings):
self.decoder.embed_tokens = new_embeddings
# unpack_inputs normalizes the call arguments
@unpack_inputs
def call(
self,
input_features=None,
attention_mask=None,
decoder_input_ids=None,
decoder_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
encoder_outputs=None,
past_key_values=None,
decoder_inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
**kwargs,
):
        # The main layer's forward logic is omitted in this listing; see the full source for details.
        pass
    # The build method constructs the encoder and decoder the first time it is called
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the encoder inside its own name scope
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# Build the decoder inside its own name scope
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
# TFSpeech2TextModel exposes the bare encoder-decoder stack without any task-specific head
@add_start_docstrings(
"The bare Speech2Text Model outputting raw hidden-states without any specific head on top.",
SPEECH_TO_TEXT_START_DOCSTRING,
)
class TFSpeech2TextModel(TFSpeech2TextPreTrainedModel):
# Constructor taking a Speech2TextConfig plus optional extra arguments
def __init__(self, config: Speech2TextConfig, *inputs, **kwargs):
# Initialize the pretrained-model base class
super().__init__(config, *inputs, **kwargs)
# The main layer holds the actual encoder and decoder
self.model = TFSpeech2TextMainLayer(config, name="model")
# Expose the encoder
def get_encoder(self):
return self.model.encoder
# Expose the decoder
def get_decoder(self):
return self.model.decoder
# The call method forwards everything to the main layer and returns a tuple or a TFSeq2SeqModelOutput
@unpack_inputs
@add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSeq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_features: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
decoder_head_mask: np.ndarray | tf.Tensor | None = None,
cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
encoder_outputs: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> Union[Tuple, TFSeq2SeqModelOutput]:
# Delegate to the main layer with all arguments and flags
outputs = self.model(
input_features=input_features,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
encoder_outputs=encoder_outputs,
past_key_values=past_key_values,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Return the main layer's outputs unchanged
return outputs
# serving_output post-processes the model outputs, honoring the config's output flags
def serving_output(self, output):
# Keep the cached key/values only if the config enables the cache
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
# Decoder hidden states, only if the config asks for hidden states
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
# Decoder attentions, only if the config asks for attentions
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
# Cross attentions, only if the config asks for attentions
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
# Encoder hidden states, only if the config asks for hidden states
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
# Encoder attentions, only if the config asks for attentions
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
# Wrap everything in a TFSeq2SeqModelOutput
return TFSeq2SeqModelOutput(
last_hidden_state=output.last_hidden_state,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
# Build the wrapped main layer
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the model as built
self.built = True
# Build the main layer if present
if getattr(self, "model", None) is not None:
# Build it inside its own name scope without constraining the input shape
with tf.name_scope(self.model.name):
self.model.build(None)
# TFSpeech2TextForConditionalGeneration adds a language modeling head on top of the bare model
@add_start_docstrings(
"The Speech2Text Model with a language modeling head. Can be used for summarization.",
SPEECH_TO_TEXT_START_DOCSTRING,
)
class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCausalLanguageModelingLoss):
# Constructor taking a Speech2TextConfig
def __init__(self, config: Speech2TextConfig):
# Initialize the pretrained-model base class
super().__init__(config)
# The main layer holds the encoder-decoder stack
self.model = TFSpeech2TextMainLayer(config, name="model")
# Bias-free projection from d_model to the vocabulary, used as the language modeling head
self.lm_head = keras.layers.Dense(self.config.vocab_size, use_bias=False, name="lm_head")
# XLA generation is disabled for this model
# TODO (Joao): investigate why Speech2Text has numerical issues in XLA generate
self.supports_xla_generation = False
# Keep a reference to the config
self.config = config
# Expose the encoder
def get_encoder(self):
return self.model.encoder
# Expose the decoder
def get_decoder(self):
return self.model.decoder
# Resize the token embeddings and return the updated embedding table
def resize_token_embeddings(self, new_num_tokens: int) -> tf.Variable:
new_embeddings = super().resize_token_embeddings(new_num_tokens)
return new_embeddings
# The LM head acts as the output embeddings
def get_output_embeddings(self):
return self.lm_head
# Replace the LM head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# The call method below runs the full seq2seq forward pass and returns a TFSeq2SeqLMOutput
@unpack_inputs
@add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_features: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
decoder_head_mask: np.ndarray | tf.Tensor | None = None,
cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
encoder_outputs: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
labels: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
        **kwargs,
    ):
        pass
    # serving_output post-processes the model outputs, selectively returning tensors based on the config flags
def serving_output(self, output):
# Keep the cached key/values only if the config enables the cache
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
# Decoder hidden states, only if the config asks for hidden states
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
# Decoder attentions, only if the config asks for attentions
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
# Cross attentions, only if the config asks for attentions
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
# Encoder hidden states, only if the config asks for hidden states
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
# Encoder attentions, only if the config asks for attentions
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
# Wrap everything in a TFSeq2SeqLMOutput
return TFSeq2SeqLMOutput(
logits=output.logits,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
# Prepare the decoder inputs for generation, truncating decoder_input_ids when a cache is present
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# With a cache, only the last generated token needs to be fed to the decoder
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
# Return the generation inputs as a dict
return {
"input_features": None, # 需要传递以使 Keras.layer.__call__ 正常运行
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache, # 更改此项以避免缓存(可能用于调试)
}
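# Illustrative sketch (not part of the original source): once a cache exists, only the most
# recently generated token is fed back to the decoder; everything else is reconstructed from
# past_key_values. The ids below are made up.
_decoder_input_ids = tf.constant([[2, 14, 27, 31]])
_fed_to_decoder = _decoder_input_ids[:, -1:]  # [[31]], exactly what the slice above keeps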
# Build the model components
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the model as built
self.built = True
# Build the main layer inside its own name scope
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
# Build the LM head, which projects from d_model to the vocabulary
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.d_model])
# Map TensorFlow weight names to their PyTorch counterparts when cross-loading checkpoints
def tf_to_pt_weight_rename(self, tf_weight):
# The LM head weight is tied to the decoder token embeddings on the PyTorch side
if tf_weight == "lm_head.weight":
return tf_weight, "model.decoder.embed_tokens.weight"
else:
return (tf_weight,)
.\models\speech_to_text\processing_speech_to_text.py
"""
Speech processor class for Speech2Text
"""
import warnings
from contextlib import contextmanager
from ...processing_utils import ProcessorMixin
class Speech2TextProcessor(ProcessorMixin):
"""
Constructs a Speech2Text processor which wraps a Speech2Text feature extractor and a Speech2Text tokenizer into a single processor.
[`Speech2TextProcessor`] offers all the functionalities of [`Speech2TextFeatureExtractor`] and [`Speech2TextTokenizer`].
See the [`~Speech2TextProcessor.__call__`] and [`~Speech2TextProcessor.decode`] for more information.
Args:
    feature_extractor (`Speech2TextFeatureExtractor`):
        An instance of [`Speech2TextFeatureExtractor`]. The feature extractor is a required input.
    tokenizer (`Speech2TextTokenizer`):
        An instance of [`Speech2TextTokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "Speech2TextFeatureExtractor"
tokenizer_class = "Speech2TextTokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
[`~Speech2TextFeatureExtractor.__call__`] and returns its output. If used in the context
[`~Speech2TextProcessor.as_target_processor`] this method forwards all its arguments to Speech2TextTokenizer's
[`~Speech2TextTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Speech2TextTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Speech2TextTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
Speech2Text.
"""
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
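# Illustrative sketch (not part of the original source): a typical round trip through the
# processor. The checkpoint name matches the one referenced elsewhere in these files and must
# be available locally or via the Hub for from_pretrained to succeed.
_processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
_waveform = [0.0] * 16000  # one second of silence as a stand-in for real 16 kHz audio
_batch = _processor(audio=_waveform, sampling_rate=16000, text="hello world", return_tensors="pt")
# _batch carries "input_features" from the feature extractor and "labels" from the tokenizer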
.\models\speech_to_text\tokenization_speech_to_text.py
"""Tokenization classes for Speech2Text."""
import json
import os
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"spm_file": "sentencepiece.bpe.model",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/s2t-small-librispeech-asr": (
"https://huggingface.co/facebook/s2t-small-librispeech-asr/resolve/main/vocab.json"
),
},
"spm_file": {
"facebook/s2t-small-librispeech-asr": (
"https://huggingface.co/facebook/s2t-small-librispeech-asr/resolve/main/sentencepiece.bpe.model"
)
},
}
MAX_MODEL_INPUT_SIZES = {
"facebook/s2t-small-librispeech-asr": 1024,
}
MUSTC_LANGS = ["pt", "fr", "ru", "nl", "ro", "it", "es", "de"]
LANGUAGES = {"mustc": MUSTC_LANGS}
class Speech2TextTokenizer(PreTrainedTokenizer):
"""
Construct a Speech2Text tokenizer.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
the superclass for more information regarding such methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = MAX_MODEL_INPUT_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
spm_file,
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
unk_token="<unk>",
do_upper_case=False,
do_lower_case=False,
tgt_lang=None,
lang_codes=None,
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.do_upper_case = do_upper_case
self.do_lower_case = do_lower_case
self.encoder = load_json(vocab_file)
self.decoder = {v: k for k, v in self.encoder.items()}
self.spm_file = spm_file
self.sp_model = load_spm(spm_file, self.sp_model_kwargs)
if lang_codes is not None:
self.lang_codes = lang_codes
self.langs = LANGUAGES[lang_codes]
self.lang_tokens = [f"<lang:{lang}>" for lang in self.langs]
self.lang_code_to_id = {lang: self.sp_model.PieceToId(f"<lang:{lang}>") for lang in self.langs}
if additional_special_tokens is not None:
additional_special_tokens = self.lang_tokens + additional_special_tokens
else:
additional_special_tokens = self.lang_tokens
self._tgt_lang = tgt_lang if tgt_lang is not None else self.langs[0]
self.set_tgt_lang_special_tokens(self._tgt_lang)
else:
self.lang_code_to_id = {}
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
do_upper_case=do_upper_case,
do_lower_case=do_lower_case,
tgt_lang=tgt_lang,
lang_codes=lang_codes,
sp_model_kwargs=self.sp_model_kwargs,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
@property
def vocab_size(self) -> int:
return len(self.encoder)
def get_vocab(self) -> Dict:
vocab = self.encoder.copy()
vocab.update(self.added_tokens_encoder)
return vocab
@property
def tgt_lang(self) -> str:
return self._tgt_lang
@tgt_lang.setter
def tgt_lang(self, new_tgt_lang) -> None:
self._tgt_lang = new_tgt_lang
self.set_tgt_lang_special_tokens(new_tgt_lang)
def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
"""Reset the special tokens to the target language setting. prefix=[eos, tgt_lang_code] and suffix=[eos]."""
lang_code_id = self.lang_code_to_id[tgt_lang]
self.prefix_tokens = [lang_code_id]
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder[self.unk_token])
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the decoder."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
current_sub_tokens = []
out_string = ""
for token in tokens:
if token in self.all_special_tokens:
decoded = self.sp_model.decode(current_sub_tokens)
out_string += (decoded.upper() if self.do_upper_case else decoded) + token + " "
current_sub_tokens = []
else:
current_sub_tokens.append(token)
decoded = self.sp_model.decode(current_sub_tokens)
out_string += decoded.upper() if self.do_upper_case else decoded
return out_string.strip()
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
"""Build model inputs from a sequence by appending eos_token_id."""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1]
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
def __getstate__(self) -> Dict:
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d: Dict) -> None:
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
save_dir = Path(save_directory)
assert save_dir.is_dir(), f"{save_directory} should be a directory"
vocab_save_path = save_dir / (
(filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
)
spm_save_path = save_dir / (
(filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"]
)
save_json(self.encoder, vocab_save_path)
if os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path) and os.path.isfile(self.spm_file):
copyfile(self.spm_file, spm_save_path)
elif not os.path.isfile(self.spm_file):
with open(spm_save_path, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (str(vocab_save_path), str(spm_save_path))
def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
spm.Load(str(path))
return spm
def load_json(path: str) -> Union[Dict, List]:
with open(path, "r") as f:
return json.load(f)
def save_json(data, path: str) -> None:
with open(path, "w") as f:
json.dump(data, f, indent=2)
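To make the special-token handling above concrete, here is a small self-contained illustration (with made-up token ids, so no SentencePiece model is required) of how `build_inputs_with_special_tokens` and `get_special_tokens_mask` compose a single sequence as `prefix_tokens + tokens + [eos]`:

```python
# Hypothetical ids for illustration only.
prefix_tokens = [10003]        # e.g. the id of a "<lang:de>" language token
eos_token_id = 2
token_ids_0 = [57, 301, 88]    # sub-word ids produced by the SentencePiece model

# build_inputs_with_special_tokens (single-sequence case)
input_ids = prefix_tokens + token_ids_0 + [eos_token_id]
assert input_ids == [10003, 57, 301, 88, 2]

# get_special_tokens_mask: 1 marks special tokens (language code, eos), 0 marks content tokens
special_tokens_mask = [1] * len(prefix_tokens) + [0] * len(token_ids_0) + [1]
assert special_tokens_mask == [1, 0, 0, 0, 1]
```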
.\models\speech_to_text\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tf_available,
is_torch_available,
)
_import_structure = {
"configuration_speech_to_text": ["SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Speech2TextConfig"],
"feature_extraction_speech_to_text": ["Speech2TextFeatureExtractor"],
"processing_speech_to_text": ["Speech2TextProcessor"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_speech_to_text"] = ["Speech2TextTokenizer"]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_speech_to_text"] = [
"TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFSpeech2TextForConditionalGeneration",
"TFSpeech2TextModel",
"TFSpeech2TextPreTrainedModel",
]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_speech_to_text"] = [
"SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"Speech2TextForConditionalGeneration",
"Speech2TextModel",
"Speech2TextPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig
from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor
from .processing_speech_to_text import Speech2TextProcessor
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_speech_to_text import Speech2TextTokenizer
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_speech_to_text import (
TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFSpeech2TextForConditionalGeneration,
TFSpeech2TextModel,
TFSpeech2TextPreTrainedModel,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_speech_to_text import (
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
Speech2TextForConditionalGeneration,
Speech2TextModel,
Speech2TextPreTrainedModel,
)
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
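The `__init__.py` above registers submodules lazily and only exposes the tokenizer / TF / PyTorch classes when their optional dependencies are importable. Here is a rough stand-alone sketch of the same guarding idea, written with plain `importlib` instead of the internal `OptionalDependencyNotAvailable` / `_LazyModule` machinery (so it is an illustration of the pattern, not the actual implementation):

```python
import importlib.util

# Names that are always importable.
_import_structure = {"configuration_speech_to_text": ["Speech2TextConfig"]}

# The tokenizer requires sentencepiece; the PyTorch models require torch.
if importlib.util.find_spec("sentencepiece") is not None:
    _import_structure["tokenization_speech_to_text"] = ["Speech2TextTokenizer"]
if importlib.util.find_spec("torch") is not None:
    _import_structure["modeling_speech_to_text"] = ["Speech2TextModel"]

print(_import_structure)
```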
.\models\speech_to_text_2\configuration_speech_to_text_2.py
""" Speech2Text model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/s2t-wav2vec2-large-en-de": (
"https://huggingface.co/facebook/s2t-wav2vec2-large-en-de/resolve/main/config.json"
),
}
class Speech2Text2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Speech2Text2ForCausalLM`]. It is used to
instantiate a Speech2Text2 model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a configuration similar to that of the Speech2Text2
[facebook/s2t-wav2vec2-large-en-de](https://huggingface.co/facebook/s2t-wav2vec2-large-en-de) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Speech2TextModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often called feed-forward) layer in the decoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the pooler. If a string, `"gelu"`, `"relu"`,
`"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings and the pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layers.
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated-normal initializer for initializing all weight matrices.
See https://arxiv.org/abs/1909.11556 for further details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the LayerDrop paper (https://arxiv.org/abs/1909.11556) for
more details.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/value attentions (not used by all models).
max_target_positions (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
(e.g., 512, 1024 or 2048).
Example:
```python
>>> from transformers import Speech2Text2Config, Speech2Text2ForCausalLM
>>> # Initializing a Speech2Text2Config configuration
>>> configuration = Speech2Text2Config()
>>> # Initializing a model (with random weights) from that configuration
>>> model = Speech2Text2ForCausalLM(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
# Model type identifier
model_type = "speech_to_text_2"
# Keys to ignore at inference time
keys_to_ignore_at_inference = ["past_key_values"]
# Attribute map: aliases generic config names to the decoder-specific fields
attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"}
# Constructor for Speech2Text2Config
def __init__(
self,
vocab_size=10000,  # vocabulary size
decoder_layers=6,  # number of decoder layers
decoder_ffn_dim=2048,  # dimensionality of the decoder feed-forward ("intermediate") layer
decoder_attention_heads=4,  # number of decoder attention heads
decoder_layerdrop=0.0,  # LayerDrop probability for decoder layers
use_cache=True,  # whether to return/use the key/value cache
activation_function="relu",  # activation function
d_model=256,  # model (hidden) dimensionality
dropout=0.1,  # dropout for fully connected and attention layers
attention_dropout=0.0,  # dropout on attention probabilities
activation_dropout=0.0,  # dropout on activations inside the feed-forward layers
init_std=0.02,  # standard deviation for weight initialization
decoder_start_token_id=2,  # id of the decoder start token
scale_embedding=True,  # whether to scale embeddings by sqrt(d_model)
pad_token_id=1,  # padding token id
bos_token_id=0,  # beginning-of-sequence token id
eos_token_id=2,  # end-of-sequence token id
max_target_positions=1024,  # maximum target sequence length
**kwargs,  # additional keyword arguments
):
self.vocab_size = vocab_size
self.d_model = d_model
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.decoder_layerdrop = decoder_layerdrop
self.use_cache = use_cache
self.num_hidden_layers = decoder_layers  # expose the decoder depth under the generic name
self.scale_embedding = scale_embedding  # if True, embeddings are scaled by sqrt(d_model)
self.max_target_positions = max_target_positions
# Forward the special-token ids and any remaining kwargs to the parent constructor
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
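A quick sanity check of the constructor defaults and the `attribute_map` aliases defined above (this assumes the `transformers` library is installed; the expected values come straight from the defaults shown in the code):

```python
from transformers import Speech2Text2Config

config = Speech2Text2Config()
print(config.vocab_size, config.d_model, config.decoder_layers)       # 10000 256 6

# attribute_map lets generic names resolve to the decoder-specific fields:
print(config.hidden_size == config.d_model)                           # True
print(config.num_attention_heads == config.decoder_attention_heads)   # True
```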
.\models\speech_to_text_2\modeling_speech_to_text_2.py
""" PyTorch Speech2Text2 model."""
import copy
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, logging, replace_return_docstrings
from .configuration_speech_to_text_2 import Speech2Text2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Speech2Text2Config"
_CHECKPOINT_FOR_DOC = "facebook/s2t-wav2vec2-large-en-de"
SPEECH_TO_TEXT_2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/s2t-wav2vec2-large-en-de",
]
class Speech2Text2SinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len = input_ids.size()
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
max_pos = self.padding_idx + 1 + seq_len
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
def create_position_ids_from_input_ids(
self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids (`torch.Tensor`): Tensor of token ids; entries equal to `padding_idx` are treated as padding.
Returns:
`torch.Tensor`: Position ids of the same shape as `input_ids`.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
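The two helpers above are easier to follow numerically. Below is a small sketch that rebuilds the sinusoidal table for a toy (even) embedding size and reproduces the position-id rule: real tokens are numbered from `padding_idx + 1`, padding positions stay at `padding_idx`.

```python
import math
import torch

def sinusoidal_table(num_embeddings: int, embedding_dim: int, padding_idx: int) -> torch.Tensor:
    # Only handles even embedding_dim, for brevity.
    half_dim = embedding_dim // 2
    freq = torch.exp(torch.arange(half_dim).float() * -(math.log(10000) / (half_dim - 1)))
    angles = torch.arange(num_embeddings).float().unsqueeze(1) * freq.unsqueeze(0)
    table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    table[padding_idx, :] = 0  # the padding position gets an all-zero embedding
    return table

weights = sinusoidal_table(num_embeddings=8, embedding_dim=6, padding_idx=1)
print(weights.shape)        # torch.Size([8, 6]); row 1 is all zeros

padding_idx = 1
input_ids = torch.tensor([[5, 7, 9, 1, 1]])  # two trailing padding tokens
mask = input_ids.ne(padding_idx).int()
position_ids = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
print(position_ids)         # tensor([[2, 3, 4, 1, 1]])
```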
class Speech2Text2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[Speech2Text2Config] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# The full multi-head attention forward pass is omitted in this excerpt.
pass
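Since the forward body is not shown here, the following tiny demonstration stands in for what the `_shape` helper and the `scaling` factor set up: the hidden dimension is split into `(num_heads, head_dim)` with the head axis moved in front of the sequence axis, and queries are scaled by `head_dim ** -0.5` before the dot product.

```python
import torch

bsz, seq_len, embed_dim, num_heads = 2, 5, 8, 4
head_dim = embed_dim // num_heads

hidden_states = torch.randn(bsz, seq_len, embed_dim)
# Same reshape as Speech2Text2Attention._shape:
shaped = hidden_states.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).contiguous()
print(shaped.shape)       # torch.Size([2, 4, 5, 2]) -> (batch, heads, seq_len, head_dim)

scaling = head_dim ** -0.5
print(round(scaling, 4))  # 0.7071 for head_dim == 2
```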
class Speech2Text2DecoderLayer(nn.Module):
def __init__(self, config: Speech2Text2Config):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = Speech2Text2Attention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
if config.is_decoder:
self.encoder_attn = Speech2Text2Attention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
# The decoder-layer forward pass (self-attention, cross-attention, feed-forward) is omitted in this excerpt.
...
class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
"""
Transformer 解码器,由 config.decoder_layers 层组成。每一层是一个 Speech2Text2DecoderLayer 类的实例。
Args:
config: Speech2Text2Config,模型的配置对象
embed_tokens (nn.Embedding): 输出的嵌入层对象
"""
def __init__(self, config: Speech2Text2Config):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_target_positions
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = Speech2Text2SinusoidalPositionalEmbedding(
self.max_target_positions,
config.d_model,
self.padding_idx,
)
self.layers = nn.ModuleList([Speech2Text2DecoderLayer(config) for _ in range(config.decoder_layers)])
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
# The full decoder forward pass is omitted in this excerpt.
...
@add_start_docstrings(
"The Speech2Text2 Model with a language modeling head. Can be used for summarization.",
SPEECH_TO_TEXT_2_START_DOCSTRING,
)
class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = Speech2Text2Decoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
@add_start_docstrings(
"The Speech2Text2 Decoder with a language modeling head. Can be used as the decoder part of"
" [`EncoderDecoderModel`] and [`SpeechEncoderDecoder`].",
SPEECH_TO_TEXT_2_START_DOCSTRING,
)
class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = Speech2Text2DecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# The causal-LM forward pass (decoder call, lm_head projection, optional loss on `labels`) is omitted in this excerpt.
...
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
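To illustrate the two generation helpers above: `prepare_inputs_for_generation` feeds the decoder only the tokens not yet covered by the key/value cache, and `_reorder_cache` selects the cached states of the beams that survived the last beam-search step. A toy walkthrough with made-up shapes:

```python
import torch

# prepare_inputs_for_generation: 3 tokens are already cached, so only the newest token is kept.
input_ids = torch.tensor([[0, 11, 12, 13]])
past_length = 3
remove_prefix_length = past_length if input_ids.shape[1] > past_length else input_ids.shape[1] - 1
print(input_ids[:, remove_prefix_length:])   # tensor([[13]])

# _reorder_cache: pick cached states along the beam dimension.
past_state = torch.arange(4 * 2 * 3 * 5).float().view(4, 2, 3, 5)  # (beams, heads, seq, head_dim)
beam_idx = torch.tensor([2, 2, 0, 1])                               # beam 2 was duplicated
reordered = past_state.index_select(0, beam_idx)
print(torch.equal(reordered[0], past_state[2]))                     # True
```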
.\models\speech_to_text_2\processing_speech_to_text_2.py
"""
Speech processor class for Speech2Text2
"""
import warnings
from contextlib import contextmanager
from ...processing_utils import ProcessorMixin
class Speech2Text2Processor(ProcessorMixin):
r"""
Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
a single processor.
[`Speech2Text2Processor`] offers all the functionalities of [`AutoFeatureExtractor`] and [`Speech2Text2Tokenizer`].
See the [`~Speech2Text2Processor.__call__`] and [`~Speech2Text2Processor.decode`] for more information.
Args:
feature_extractor (`AutoFeatureExtractor`):
An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`Speech2Text2Tokenizer`):
An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "AutoFeatureExtractor"
tokenizer_class = "Speech2Text2Tokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
[`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
[`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to
Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the docstring of the above two
methods for more information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Speech2Text2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Speech2Text2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
Speech2Text2.
"""
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
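Putting the processor together, here is a minimal usage sketch. It assumes the `facebook/s2t-wav2vec2-large-en-de` checkpoint can be downloaded and uses one second of silence as a placeholder for a real 16 kHz waveform; the `labels` field is the tokenized transcription produced by the `__call__` logic shown above.

```python
import numpy as np
from transformers import Speech2Text2Processor

processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")

speech = np.zeros(16_000, dtype=np.float32)   # placeholder 16 kHz waveform
batch = processor(audio=speech, sampling_rate=16_000, text="ein Beispielsatz")

print(list(batch.keys()))                       # feature-extractor outputs plus "labels"
print(processor.batch_decode(batch["labels"]))  # round-trips the tokenized transcription
```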