Transformers Source Code Analysis (80)
.\models\mt5\modeling_tf_mt5.py
""" Tensorflow mT5 model."""
from ...utils import logging
from ..t5.modeling_tf_t5 import TFT5EncoderModel, TFT5ForConditionalGeneration, TFT5Model
from .configuration_mt5 import MT5Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "T5Config"
class TFMT5Model(TFT5Model):
r"""
This class overrides [`TFT5Model`]. Please check the superclass for the appropriate documentation alongside usage
examples.
Examples:
```
>>> from transformers import TFMT5Model, AutoTokenizer
>>> model = TFMT5Model.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="tf")
>>> labels = tokenizer(text_target=summary, return_tensors="tf")
>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
>>> hidden_states = outputs.last_hidden_state
```"""
model_type = "mt5"
config_class = MT5Config
class TFMT5ForConditionalGeneration(TFT5ForConditionalGeneration):
r"""
This class overrides [`TFT5ForConditionalGeneration`]. Please check the superclass for the appropriate
documentation alongside usage examples.
Examples:
```
>>> from transformers import TFMT5ForConditionalGeneration, AutoTokenizer
>>> model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, text_target=summary, return_tensors="tf")
>>> outputs = model(**inputs)
>>> loss = outputs.loss
```"""
model_type = "mt5"
config_class = MT5Config
class TFMT5EncoderModel(TFT5EncoderModel):
r"""
This class overrides [`TFT5EncoderModel`]. Please check the superclass for the appropriate documentation alongside
usage examples.
Examples:
```
>>> from transformers import TFMT5EncoderModel, AutoTokenizer
>>> model = TFMT5EncoderModel.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> input_ids = tokenizer(article, return_tensors="tf").input_ids
>>> outputs = model(input_ids)
>>> hidden_state = outputs.last_hidden_state
```"""
# Registered model type and configuration class for the mT5 encoder
model_type = "mt5"
config_class = MT5Config
.\models\mt5\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
if is_sentencepiece_available():
from ..t5.tokenization_t5 import T5Tokenizer
else:
from ...utils.dummy_sentencepiece_objects import T5Tokenizer
MT5Tokenizer = T5Tokenizer
if is_tokenizers_available():
from ..t5.tokenization_t5_fast import T5TokenizerFast
else:
from ...utils.dummy_tokenizers_objects import T5TokenizerFast
MT5TokenizerFast = T5TokenizerFast
_import_structure = {"configuration_mt5": ["MT5Config", "MT5OnnxConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mt5"] = [
"MT5EncoderModel",
"MT5ForConditionalGeneration",
"MT5ForQuestionAnswering",
"MT5ForSequenceClassification",
"MT5ForTokenClassification",
"MT5Model",
"MT5PreTrainedModel",
"MT5Stack",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_mt5"] = ["TFMT5EncoderModel", "TFMT5ForConditionalGeneration", "TFMT5Model"]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_mt5"] = ["FlaxMT5EncoderModel", "FlaxMT5ForConditionalGeneration", "FlaxMT5Model"]
if TYPE_CHECKING:
from .configuration_mt5 import MT5Config, MT5OnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mt5 import (
MT5EncoderModel,
MT5ForConditionalGeneration,
MT5ForQuestionAnswering,
MT5ForSequenceClassification,
MT5ForTokenClassification,
MT5Model,
MT5PreTrainedModel,
MT5Stack,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_mt5 import TFMT5EncoderModel, TFMT5ForConditionalGeneration, TFMT5Model
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_mt5 import FlaxMT5EncoderModel, FlaxMT5ForConditionalGeneration, FlaxMT5Model
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
extra_objects={"MT5Tokenizer": MT5Tokenizer, "MT5TokenizerFast": MT5TokenizerFast},
module_spec=__spec__,
)
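One practical consequence of the `extra_objects` mapping above: `MT5Tokenizer` is exposed eagerly as a plain alias of `T5Tokenizer` rather than through the lazy-import machinery. A quick sanity check (a minimal sketch, assuming `sentencepiece` is installed so the real tokenizer class is picked up):

```
from transformers import MT5Tokenizer, T5Tokenizer

# mT5 reuses the T5 tokenizer unchanged, so the two names point at the same class
print(MT5Tokenizer is T5Tokenizer)  # True
```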
.\models\musicgen\configuration_musicgen.py
""" MusicGen model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import AutoConfig
logger = logging.get_logger(__name__)
MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/musicgen-small": "https://huggingface.co/facebook/musicgen-small/resolve/main/config.json",
}
class MusicgenDecoderConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MusicgenDecoder`]. It is used to instantiate a
MusicGen decoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the MusicGen
[facebook/musicgen-small](https://huggingface.co/facebook/musicgen-small) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 2048):
MusicgenDecoder 模型的词汇表大小,定义了在调用 `MusicgenDecoder` 时输入 `inputs_ids` 可表示的不同标记数量。
hidden_size (`int`, *optional*, defaults to 1024):
层和池化层的维度。
num_hidden_layers (`int`, *optional*, defaults to 24):
解码器层的数量。
num_attention_heads (`int`, *optional*, defaults to 16):
Transformer 块中每个注意力层的注意力头数量。
ffn_dim (`int`, *optional*, defaults to 4096):
Transformer 块中“中间”(通常称为前馈)层的维度。
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
解码器和池化器中的非线性激活函数(函数或字符串)。支持的字符串包括 `"gelu"`, `"relu"`, `"silu"` 和 `"gelu_new"`。
dropout (`float`, *optional*, defaults to 0.1):
嵌入层、文本编码器和池化器中所有全连接层的 dropout 概率。
attention_dropout (`float`, *optional*, defaults to 0.0):
注意力概率的 dropout 比率。
activation_dropout (`float`, *optional*, defaults to 0.0):
全连接层内部激活的 dropout 比率。
max_position_embeddings (`int`, *optional*, defaults to 2048):
模型可能使用的最大序列长度。通常设置为一个很大的值(例如 512、1024 或 2048)。
initializer_factor (`float`, *optional*, defaults to 0.02):
用于初始化所有权重矩阵的截断正态初始化器的标准差。
layerdrop (`float`, *optional*, defaults to 0.0):
解码器的 LayerDrop 概率。详细信息请参阅 LayerDrop 论文(见 https://arxiv.org/abs/1909.11556)。
scale_embedding (`bool`, *optional*, defaults to `False`):
是否通过 sqrt(hidden_size) 缩放嵌入。
use_cache (`bool`, *optional*, defaults to `True`):
模型是否应返回最后的 key/values 注意力(并非所有模型都使用)。
num_codebooks (`int`, *optional*, defaults to 4):
转发到模型的并行码书数量。
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
是否应绑定输入和输出词嵌入。
audio_channels (`int`, *optional*, defaults to 1):
音频数据中的通道数。单声道为 1,立体声为 2。立体声模型生成左/右输出通道的单独音频流,单声道模型生成单一音频流输出。
model_type = "musicgen_decoder"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=2048,
max_position_embeddings=2048,
num_hidden_layers=24,
ffn_dim=4096,
num_attention_heads=16,
layerdrop=0.0,
use_cache=True,
activation_function="gelu",
hidden_size=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
initializer_factor=0.02,
scale_embedding=False,
num_codebooks=4,
audio_channels=1,
pad_token_id=2048,
bos_token_id=2048,
eos_token_id=None,
tie_word_embeddings=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.ffn_dim = ffn_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.initializer_factor = initializer_factor
self.layerdrop = layerdrop
self.use_cache = use_cache
self.scale_embedding = scale_embedding
self.num_codebooks = num_codebooks
if audio_channels not in [1, 2]:
raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {audio_channels} channels.")
self.audio_channels = audio_channels
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
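To see the constructor in action, here is a minimal sketch (not part of the original file): the defaults mirror the facebook/musicgen-small decoder, and the `audio_channels` check rejects anything other than mono or stereo.

```
from transformers import MusicgenDecoderConfig

config = MusicgenDecoderConfig()  # defaults correspond to facebook/musicgen-small
print(config.hidden_size, config.num_hidden_layers, config.num_codebooks)  # 1024 24 4

try:
    MusicgenDecoderConfig(audio_channels=3)  # neither mono (1) nor stereo (2)
except ValueError as err:
    print(err)  # Expected 1 (mono) or 2 (stereo) audio channels, got 3 channels.
```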
class MusicgenConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MusicgenModel`]. It is used to instantiate a
MusicGen model according to the specified arguments, defining the text encoder, audio encoder and MusicGen decoder
configs.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
kwargs (*optional*):
Dictionary of keyword arguments. Notably:
- **text_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the text encoder config.
- **audio_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the audio encoder config.
- **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
the decoder config.
Example:
```
>>> from transformers import (
... MusicgenConfig,
... MusicgenDecoderConfig,
... T5Config,
... EncodecConfig,
... MusicgenForConditionalGeneration,
... )
>>> # Initializing text encoder, audio encoder, and decoder model configurations
>>> text_encoder_config = T5Config()
>>> audio_encoder_config = EncodecConfig()
>>> decoder_config = MusicgenDecoderConfig()
>>> configuration = MusicgenConfig.from_sub_models_config(
... text_encoder_config, audio_encoder_config, decoder_config
... )
>>> # Initializing a MusicgenForConditionalGeneration (with random weights) from the facebook/musicgen-small style configuration
>>> model = MusicgenForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> config_text_encoder = model.config.text_encoder
>>> config_audio_encoder = model.config.audio_encoder
>>> config_decoder = model.config.decoder
>>> # Saving the model, including its configuration
>>> model.save_pretrained("musicgen-model")
>>> # loading model and config from pretrained folder
>>> musicgen_config = MusicgenConfig.from_pretrained("musicgen-model")
>>> model = MusicgenForConditionalGeneration.from_pretrained("musicgen-model", config=musicgen_config)
```"""
# The model_type class attribute identifies configurations of this class as the 'musicgen' model type.
model_type = "musicgen"
is_composition = True
def __init__(self, **kwargs):
super().__init__(**kwargs)
if "text_encoder" not in kwargs or "audio_encoder" not in kwargs or "decoder" not in kwargs:
raise ValueError("Config has to be initialized with text_encoder, audio_encoder and decoder config")
text_encoder_config = kwargs.pop("text_encoder")
text_encoder_model_type = text_encoder_config.pop("model_type")
audio_encoder_config = kwargs.pop("audio_encoder")
audio_encoder_model_type = audio_encoder_config.pop("model_type")
decoder_config = kwargs.pop("decoder")
self.text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder_config)
self.audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder_config)
self.decoder = MusicgenDecoderConfig(**decoder_config)
self.is_encoder_decoder = True
@classmethod
def from_sub_models_config(
cls,
text_encoder_config: PretrainedConfig,
audio_encoder_config: PretrainedConfig,
decoder_config: MusicgenDecoderConfig,
**kwargs,
):
r"""
Instantiate a [`MusicgenConfig`] (or a derived class) from text encoder, audio encoder and decoder
configurations.
Returns:
[`MusicgenConfig`]: An instance of a configuration object
"""
return cls(
text_encoder=text_encoder_config.to_dict(),
audio_encoder=audio_encoder_config.to_dict(),
decoder=decoder_config.to_dict(),
**kwargs,
)
@property
def sampling_rate(self):
return self.audio_encoder.sampling_rate
.\models\musicgen\convert_musicgen_transformers.py
import argparse
from pathlib import Path
from typing import Dict, OrderedDict, Tuple
import torch
from audiocraft.models import MusicGen
from transformers import (
AutoFeatureExtractor,
AutoTokenizer,
EncodecModel,
MusicgenDecoderConfig,
MusicgenForConditionalGeneration,
MusicgenProcessor,
T5EncoderModel,
)
from transformers.models.musicgen.modeling_musicgen import MusicgenForCausalLM
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
EXPECTED_MISSING_KEYS = ["model.decoder.embed_positions.weights"]
def rename_keys(name):
"""根据预定义规则重命名模型状态字典中的键名。
Args:
name (str): 原始的键名字符串。
Returns:
str: 重命名后的键名字符串。
"""
if "emb" in name:
name = name.replace("emb", "model.decoder.embed_tokens")
if "transformer" in name:
name = name.replace("transformer", "model.decoder")
if "cross_attention" in name:
name = name.replace("cross_attention", "encoder_attn")
if "linear1" in name:
name = name.replace("linear1", "fc1")
if "linear2" in name:
name = name.replace("linear2", "fc2")
if "norm1" in name:
name = name.replace("norm1", "self_attn_layer_norm")
if "norm_cross" in name:
name = name.replace("norm_cross", "encoder_attn_layer_norm")
if "norm2" in name:
name = name.replace("norm2", "final_layer_norm")
if "out_norm" in name:
name = name.replace("out_norm", "model.decoder.layer_norm")
if "linears" in name:
name = name.replace("linears", "lm_heads")
if "condition_provider.conditioners.description.output_proj" in name:
name = name.replace("condition_provider.conditioners.description.output_proj", "enc_to_dec_proj")
return name
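To make the renaming rules concrete, a small sketch tracing a few representative key names through `rename_keys` (the example keys are illustrative; real checkpoints follow the same naming scheme):

```
# Illustrative fairseq-style keys and the Hugging Face names they map to
for fairseq_key in [
    "emb.0.weight",
    "transformer.layers.0.self_attn.in_proj_weight",
    "transformer.layers.0.linear1.weight",
    "out_norm.weight",
    "linears.0.weight",
]:
    print(fairseq_key, "->", rename_keys(fairseq_key))
# emb.0.weight -> model.decoder.embed_tokens.0.weight
# transformer.layers.0.self_attn.in_proj_weight -> model.decoder.layers.0.self_attn.in_proj_weight
# transformer.layers.0.linear1.weight -> model.decoder.layers.0.fc1.weight
# out_norm.weight -> model.decoder.layer_norm.weight
# linears.0.weight -> lm_heads.0.weight
```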
def rename_state_dict(state_dict: OrderedDict, hidden_size: int) -> Tuple[Dict, Dict]:
"""根据 Hugging Face 模块名称规则重命名 fairseq Musicgen 的状态字典,并将其分成解码器(LM)状态字典和编码器-解码器投影的状态字典。
Args:
state_dict (OrderedDict): 原始的 fairseq Musicgen 状态字典。
hidden_size (int): 隐藏层大小。
Returns:
Tuple[Dict, Dict]: 重命名后的解码器状态字典和编码器-解码器投影状态字典的元组。
"""
keys = list(state_dict.keys())
enc_dec_proj_state_dict = {}
for key in keys:
val = state_dict.pop(key)
key = rename_keys(key)
if "in_proj_weight" in key:
state_dict[key.replace("in_proj_weight", "q_proj.weight")] = val[:hidden_size, :]
state_dict[key.replace("in_proj_weight", "k_proj.weight")] = val[hidden_size : 2 * hidden_size, :]
state_dict[key.replace("in_proj_weight", "v_proj.weight")] = val[-hidden_size:, :]
elif "enc_to_dec_proj" in key:
enc_dec_proj_state_dict[key[len("enc_to_dec_proj.") :]] = val
else:
state_dict[key] = val
return state_dict, enc_dec_proj_state_dict
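The `in_proj_weight` branch is the interesting part: fairseq stores the query/key/value projection as one fused `(3 * hidden_size, hidden_size)` matrix, which gets split row-wise into the three separate matrices Hugging Face expects. A minimal sketch with a dummy tensor (the key name and `hidden_size=4` are illustrative):

```
from collections import OrderedDict

import torch

hidden_size = 4
fused = torch.randn(3 * hidden_size, hidden_size)  # stacked [q; k; v] rows
state_dict = OrderedDict({"model.decoder.layers.0.self_attn.in_proj_weight": fused})

state_dict, enc_dec_proj = rename_state_dict(state_dict, hidden_size=hidden_size)
print(sorted(state_dict))
# ['model.decoder.layers.0.self_attn.k_proj.weight',
#  'model.decoder.layers.0.self_attn.q_proj.weight',
#  'model.decoder.layers.0.self_attn.v_proj.weight']
print(state_dict["model.decoder.layers.0.self_attn.q_proj.weight"].shape)  # torch.Size([4, 4])
```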
def decoder_config_from_checkpoint(checkpoint: str) -> MusicgenDecoderConfig:
if checkpoint == "small" or checkpoint == "facebook/musicgen-stereo-small":
hidden_size = 1024
num_hidden_layers = 24
num_attention_heads = 16
elif checkpoint == "medium" or checkpoint == "facebook/musicgen-stereo-medium":
hidden_size = 1536
num_hidden_layers = 48
num_attention_heads = 24
elif checkpoint == "large" or checkpoint == "facebook/musicgen-stereo-large":
hidden_size = 2048
num_hidden_layers = 48
num_attention_heads = 32
else:
raise ValueError(
"Checkpoint should be one of `['small', 'medium', 'large']` for the mono checkpoints, "
"or `['facebook/musicgen-stereo-small', 'facebook/musicgen-stereo-medium', 'facebook/musicgen-stereo-large']` "
f"for the stereo checkpoints, got {checkpoint}."
)
if "stereo" in checkpoint:
audio_channels = 2
num_codebooks = 8
else:
audio_channels = 1
num_codebooks = 4
config = MusicgenDecoderConfig(
hidden_size=hidden_size,
ffn_dim=hidden_size * 4,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_codebooks=num_codebooks,
audio_channels=audio_channels,
)
return config
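For reference, what the branching above produces (a quick check using the function as defined):

```
config = decoder_config_from_checkpoint("medium")
print(config.hidden_size, config.ffn_dim, config.num_hidden_layers)  # 1536 6144 48
print(config.audio_channels, config.num_codebooks)  # 1 4  (mono)

stereo = decoder_config_from_checkpoint("facebook/musicgen-stereo-small")
print(stereo.audio_channels, stereo.num_codebooks)  # 2 8  (stereo)
```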
@torch.no_grad()
def convert_musicgen_checkpoint(
checkpoint, pytorch_dump_folder=None, repo_id=None, device="cpu", safe_serialization=False
):
fairseq_model = MusicGen.get_pretrained(checkpoint, device=device)
decoder_config = decoder_config_from_checkpoint(checkpoint)
decoder_state_dict = fairseq_model.lm.state_dict()
decoder_state_dict, enc_dec_proj_state_dict = rename_state_dict(
decoder_state_dict, hidden_size=decoder_config.hidden_size
)
text_encoder = T5EncoderModel.from_pretrained("google-t5/t5-base")
audio_encoder = EncodecModel.from_pretrained("facebook/encodec_32khz")
decoder = MusicgenForCausalLM(decoder_config).eval()
missing_keys, unexpected_keys = decoder.load_state_dict(decoder_state_dict, strict=False)
for key in missing_keys.copy():
if key.startswith(("text_encoder", "audio_encoder")) or key in EXPECTED_MISSING_KEYS:
missing_keys.remove(key)
if len(missing_keys) > 0:
raise ValueError(f"Missing key(s) in state_dict: {missing_keys}")
if len(unexpected_keys) > 0:
raise ValueError(f"Unexpected key(s) in state_dict: {unexpected_keys}")
model = MusicgenForConditionalGeneration(text_encoder=text_encoder, audio_encoder=audio_encoder, decoder=decoder)
model.enc_to_dec_proj.load_state_dict(enc_dec_proj_state_dict)
input_ids = torch.arange(0, 2 * decoder_config.num_codebooks, dtype=torch.long).reshape(2, -1)
decoder_input_ids = input_ids.reshape(2 * decoder_config.num_codebooks, -1)
with torch.no_grad():
logits = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
if logits.shape != (2 * decoder_config.num_codebooks, 1, 2048):
raise ValueError("Incorrect shape for logits")
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
feature_extractor = AutoFeatureExtractor.from_pretrained(
"facebook/encodec_32khz", padding_side="left", feature_size=decoder_config.audio_channels
)
processor = MusicgenProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
model.generation_config.decoder_start_token_id = 2048
model.generation_config.pad_token_id = 2048
model.generation_config.max_length = int(30 * audio_encoder.config.frame_rate)
model.generation_config.do_sample = True
model.generation_config.guidance_scale = 3.0
if pytorch_dump_folder is not None:
Path(pytorch_dump_folder).mkdir(exist_ok=True)
logger.info(f"Saving model {checkpoint} to {pytorch_dump_folder}")
model.save_pretrained(pytorch_dump_folder, safe_serialization=safe_serialization)
processor.save_pretrained(pytorch_dump_folder)
if repo_id:
logger.info(f"Pushing model {checkpoint} to {repo_id}")
model.push_to_hub(repo_id, safe_serialization=safe_serialization)
processor.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint",
default="small",
type=str,
help="Checkpoint size of the MusicGen model you'd like to convert. Can be one of: "
"`['small', 'medium', 'large']` for the mono checkpoints, or "
"`['facebook/musicgen-stereo-small', 'facebook/musicgen-stereo-medium', 'facebook/musicgen-stereo-large']` "
"for the stereo checkpoints.",
)
parser.add_argument(
"--pytorch_dump_folder",
required=True,
default=None,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
parser.add_argument(
"--device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda."
)
parser.add_argument(
"--safe_serialization",
action="store_true",
help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).",
)
args = parser.parse_args()
convert_musicgen_checkpoint(args.checkpoint, args.pytorch_dump_folder, args.push_to_hub, args.device, args.safe_serialization)
.\models\musicgen\modeling_musicgen.py
@dataclass
class MusicgenUnconditionalInput(ModelOutput):
"""
Args:
encoder_outputs (`Tuple[torch.FloatTensor]` of length 1, with tensor shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the text encoder model.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Encoder attention mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1 for tokens that are **not masked**, 0 for tokens that are **masked**.
guidance_scale (`float`, *optional*):
Guidance scale for classifier-free guidance, setting the balance between the conditional logits (predicted
from the prompts) and the unconditional logits (predicted without prompts).
"""
# Output-style dataclass holding the null inputs used for unconditional generation; all fields default to None
encoder_outputs: Tuple[torch.FloatTensor] = None
attention_mask: torch.LongTensor = None
guidance_scale: float = None
# Copied from transformers.models.encoder_decoder.modeling_encoder_decoder.shift_tokens_right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
# Create an all-zero tensor with the same shape as the input to hold the right-shifted ids
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
# Copy every token except the last one position to the right
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
if decoder_start_token_id is None:
raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
# Put the decoder start token id in the first position of the shifted tensor
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
# Replace any remaining -100 values in the shifted tensor with pad_token_id
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
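A tiny numeric sketch of the shift (the token values and `pad_token_id=0`, `decoder_start_token_id=2` are arbitrary choices for illustration):

```
import torch

input_ids = torch.tensor([[5, 6, 7], [8, -100, -100]])
shifted = shift_tokens_right(input_ids, pad_token_id=0, decoder_start_token_id=2)
# Row 0: [5, 6, 7] -> [2, 5, 6]; row 1: [8, -100, -100] -> [2, 8, -100] -> trailing -100 replaced by pad
print(shifted)  # tensor([[2, 5, 6], [2, 8, 0]])
```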
class MusicgenSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int):
super().__init__()
self.embedding_dim = embedding_dim
# Build the sinusoidal weights via make_weights
self.make_weights(num_positions, embedding_dim)
def make_weights(self, num_embeddings: int, embedding_dim: int):
# Generate the sinusoidal positional-embedding weights
emb_weights = self.get_embedding(num_embeddings, embedding_dim)
if hasattr(self, "weights"):
# In the forward pass, put the weights on the correct dtype and device of the existing parameter
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int):
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
# Half of the embedding dimension is used for cos, the other half for sin
half_dim = embedding_dim // 2
# Compute the log-spaced frequencies
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
# Build the sinusoidal table by concatenating the cos and sin components
emb = torch.cat([torch.cos(emb), torch.sin(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
# Zero-pad the last column if embedding_dim is odd
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
return emb.to(torch.get_default_dtype())
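A quick shape check of the table built above (a sketch calling the static method directly): each row is the embedding for one position, with the cosine components in the first half and the sine components in the second half.

```
emb = MusicgenSinusoidalPositionalEmbedding.get_embedding(num_embeddings=10, embedding_dim=6)
print(emb.shape)  # torch.Size([10, 6])
# Position 0: cos(0) = 1 in the first half, sin(0) = 0 in the second half
print(emb[0])  # tensor([1., 1., 1., 0., 0., 0.])
```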
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
# Unpack batch size, number of codebooks, and sequence length from the input
bsz, codebooks, seq_len = input_ids.size()
# Create the position ids from the input token ids, offset by the past key/values length
position_ids = (torch.arange(seq_len) + past_key_values_length).to(input_ids.device)
# Expand the weight table if the sequence is longer than the current table
if seq_len > self.weights.size(0):
self.make_weights(seq_len + self.offset, self.embedding_dim)
# Select the weights for the given positions and detach them from the graph
return self.weights.index_select(0, position_ids.view(-1)).detach()
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Musicgen
class MusicgenAttention(nn.Module):
"""Multi-headed attention from the 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[MusicgenConfig] = None,
):
super().__init__()
# Store the model hyper-parameters
self.embed_dim = embed_dim  # embedding dimension
self.num_heads = num_heads  # number of attention heads
self.dropout = dropout  # dropout probability
self.head_dim = embed_dim // num_heads  # dimension of each attention head
self.config = config  # configuration object
# embed_dim must be divisible by num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5  # attention scaling factor
self.is_decoder = is_decoder  # whether this is decoder attention
self.is_causal = is_causal  # whether the attention is causal
# Linear projection layers
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # key projection
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # value projection
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # query projection
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # output projection
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# Reshape to (bsz, num_heads, seq_len, head_dim) for multi-head attention
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# Forward pass implementing the attention computation
# hidden_states: input hidden states
# key_value_states: key/value states for cross-attention (optional)
# past_key_value: cached key/value states (optional)
# attention_mask: attention mask (optional)
# layer_head_mask: per-head mask for this layer (optional)
# output_attentions: whether to return the attention weights
# 1. Project the hidden states to queries, keys and values
query = self.q_proj(hidden_states)
key = self.k_proj(key_value_states if key_value_states is not None else hidden_states)
value = self.v_proj(key_value_states if key_value_states is not None else hidden_states)
# 2. Reshape the tensors for parallel multi-head attention
query = self._shape(query, query.size(1), query.size(0))
key = self._shape(key, key.size(1), key.size(0))
value = self._shape(value, value.size(1), value.size(0))
# 3. Compute scaled attention scores and normalize them
attn_weights = torch.matmul(query, key.transpose(-1, -2))
attn_weights *= self.scaling
if attention_mask is not None:
attn_weights += attention_mask
attn_probs = nn.functional.softmax(attn_weights, dim=-1)
attn_probs = nn.functional.dropout(attn_probs, p=self.dropout, training=self.training)
# 4. Weighted sum of the values with the attention probabilities
attn_output = torch.matmul(attn_probs, value)
# 5. Merge the heads back into the original (bsz, seq_len, embed_dim) shape
attn_output = attn_output.transpose(1, 2).contiguous().view(attn_output.size(0), attn_output.size(2), -1)
# 6. Final output projection
attn_output = self.out_proj(attn_output)
return attn_output
# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
class MusicgenPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Use MusicgenDecoderConfig as the configuration class
config_class = MusicgenDecoderConfig
# Prefix for the base model weights
base_model_prefix = "model"
# Gradient checkpointing is supported
supports_gradient_checkpointing = True
# Modules that must not be split across devices
_no_split_modules = ["MusicgenDecoderLayer", "MusicgenAttention"]
def _init_weights(self, module):
# Read the initializer standard deviation from the config
std = self.config.initializer_factor
# Linear and 1D-convolution layers
if isinstance(module, (nn.Linear, nn.Conv1d)):
# Initialize the weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=std)
# Zero-initialize the bias, if present
if module.bias is not None:
module.bias.data.zero_()
# Embedding layers
elif isinstance(module, nn.Embedding):
# Initialize the weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=std)
# Zero out the row of the padding index, if one is set
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
MUSICGEN_START_DOCSTRING = r"""
The Musicgen model was proposed in [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by
Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi, Alexandre Défossez. It is an
encoder decoder transformer trained on the task of conditional music generation.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MusicgenConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MUSICGEN_INPUTS_DOCSTRING = r"""
"""
MUSICGEN_DECODER_INPUTS_DOCSTRING = r"""
"""
class MusicgenDecoder(MusicgenPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MusicgenDecoderLayer`]
"""
def __init__(self, config: MusicgenDecoderConfig):
super().__init__(config)
# Dropout probability
self.dropout = config.dropout
# LayerDrop probability
self.layerdrop = config.layerdrop
# Maximum target sequence length
self.max_target_positions = config.max_position_embeddings
# Hidden size of the model
self.d_model = config.hidden_size
# Number of parallel codebooks
self.num_codebooks = config.num_codebooks
# Embedding scale: sqrt(hidden_size) if scale_embedding is enabled, otherwise 1.0
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
# One embedding table per codebook; vocab_size + 1 to account for the special pad/BOS token
embed_dim = config.vocab_size + 1
self.embed_tokens = nn.ModuleList(
[nn.Embedding(embed_dim, config.hidden_size) for _ in range(config.num_codebooks)]
)
# Sinusoidal positional embeddings
self.embed_positions = MusicgenSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.hidden_size,
)
# Stack of config.num_hidden_layers MusicgenDecoderLayer modules
self.layers = nn.ModuleList([MusicgenDecoderLayer(config) for _ in range(config.num_hidden_layers)])
# Final layer norm over the hidden size
self.layer_norm = nn.LayerNorm(config.hidden_size)
# Gradient checkpointing is off by default
self.gradient_checkpointing = False
# Run the remaining initialization (weight init and final processing)
self.post_init()
def get_input_embeddings(self):
# Return the list of per-codebook embedding tables
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
# Forward pass; the argument documentation lives in MUSICGEN_DECODER_INPUTS_DOCSTRING
@add_start_docstrings_to_model_forward(MUSICGEN_DECODER_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# add_start_docstrings attaches the shared description: the bare Musicgen decoder model outputting raw
# hidden states without any specific head on top (MUSICGEN_START_DOCSTRING holds the detailed text).
@add_start_docstrings(
"The bare Musicgen decoder model outputting raw hidden-states without any specific head on top.",
MUSICGEN_START_DOCSTRING,
)
class MusicgenModel(MusicgenPreTrainedModel):
def __init__(self, config: MusicgenDecoderConfig):
super().__init__(config)
# The underlying MusicgenDecoder stack
self.decoder = MusicgenDecoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the decoder's embedding tables
return self.decoder.embed_tokens
def set_input_embeddings(self, value):
self.decoder.embed_tokens = value
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(MUSICGEN_DECODER_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
# Fall back to the config defaults for any output flag that was not explicitly set
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
decoder_outputs = self.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_attention_mask=encoder_attention_mask,
encoder_hidden_states=encoder_hidden_states,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Return a plain tuple when return_dict is disabled
if not return_dict:
return decoder_outputs
# Otherwise wrap the decoder outputs together with past key/values and cross-attentions
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
hidden_states=decoder_outputs.hidden_states,
attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
)
# Decorator adds the shared docstring describing the decoder LM
@add_start_docstrings(
"The MusicGen decoder model with a language modelling head on top.",
MUSICGEN_START_DOCSTRING,
)
class MusicgenForCausalLM(MusicgenPreTrainedModel):
def __init__(self, config: MusicgenDecoderConfig):
super().__init__(config)
# Base MusicgenModel (decoder stack)
self.model = MusicgenModel(config)
# One language-model head per codebook
self.num_codebooks = config.num_codebooks
self.lm_heads = nn.ModuleList(
[nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(config.num_codebooks)]
)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the decoder's embedding tables
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
# Return the list of LM heads
return self.lm_heads
def set_output_embeddings(self, new_embeddings):
self.lm_heads = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@add_start_docstrings_to_model_forward(MUSICGEN_DECODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Signature of the decoder LM forward pass, with the usual optional inputs and output-control flags
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
Returns:
"""
# Decide whether to return a ModelOutput or a plain tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the base model
outputs = self.model(
input_ids,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states from the decoder
hidden_states = outputs[0]
# Apply each codebook's LM head and stack the logits along a new codebook dimension
lm_logits = torch.stack([head(hidden_states) for head in self.lm_heads], dim=1)
loss = None
# Training is not implemented for Musicgen, so labels are rejected
if labels is not None:
raise NotImplementedError("Training is not implemented for Musicgen.")
# Fold the codebook dimension into the batch dimension:
# (bsz, num_codebooks, seq_len, vocab_size) -> (bsz * num_codebooks, seq_len, vocab_size)
lm_logits = lm_logits.reshape(-1, *lm_logits.shape[2:])
# Plain tuple output when return_dict is disabled
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a CausalLMOutputWithCrossAttentions
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
def prepare_inputs_for_generation(
self,
input_ids,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=True,
delay_pattern_mask=None,
guidance_scale=None,
**kwargs,
):
# Build a delay pattern mask if none was provided
if delay_pattern_mask is None:
input_ids, delay_pattern_mask = self.build_delay_pattern_mask(
input_ids,
pad_token_id=self.generation_config.pad_token_id,
max_length=self.generation_config.max_length,
)
# Apply the delay pattern mask to the input token ids
input_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
# For classifier-free guidance (guidance_scale > 1) duplicate the decoder inputs across the batch
# dimension (the two halves are split again before sampling)
if guidance_scale is not None and guidance_scale > 1:
input_ids = input_ids.repeat((2, 1))
if attention_mask is not None:
attention_mask = attention_mask.repeat((2, 1))
# With cached past key/values, only the last token id needs to be passed
if past_key_values is not None:
input_ids = input_ids[:, -1:]
# Return the keyword arguments consumed by the generation loop
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"encoder_hidden_states": encoder_hidden_states,
"encoder_attention_mask": encoder_attention_mask,
"head_mask": head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def apply_delay_pattern_mask(input_ids, decoder_pad_token_mask):
"""Apply a delay pattern mask to the decoder input ids, only preserving predictions where
the mask is set to -1, and otherwise setting to the value detailed in the mask."""
# Sequence length of the input token ids
seq_len = input_ids.shape[-1]
# Crop the decoder pad token mask to the current sequence length
decoder_pad_token_mask = decoder_pad_token_mask[..., :seq_len]
# Keep the predictions where the mask is -1; elsewhere use the value given by the mask
input_ids = torch.where(decoder_pad_token_mask == -1, input_ids, decoder_pad_token_mask)
return input_ids
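A small numeric sketch of the mask semantics (values are illustrative): wherever the mask holds `-1` the model's own token is kept, everywhere else the token is overwritten with the mask's value (typically the pad token id, 2048 for MusicGen).

```
import torch

input_ids = torch.tensor([[10, 11, 12, 13]])
mask = torch.tensor([[2048, -1, -1, 2048, 2048]])  # longer masks are cropped to seq_len
out = MusicgenForCausalLM.apply_delay_pattern_mask(input_ids, mask)
print(out)  # tensor([[2048,   11,   12, 2048]])
```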
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
synced_gpus: Optional[bool] = None,
streamer: Optional["BaseStreamer"] = None,
**kwargs,
@add_start_docstrings(
"The composite MusicGen model with a text encoder, audio encoder and Musicgen decoder, "
"for music generation tasks with one or both of text and audio prompts.",
MUSICGEN_START_DOCSTRING,
)
class MusicgenForConditionalGeneration(PreTrainedModel):
# Configuration class for the composite model
config_class = MusicgenConfig
# Prefix for the base model weights
base_model_prefix = "encoder_decoder"
# Main input name used by generate()
main_input_name = "input_ids"
# Gradient checkpointing is supported
supports_gradient_checkpointing = True
def __init__(
self,
config: Optional[MusicgenConfig] = None,
text_encoder: Optional[PreTrainedModel] = None,
audio_encoder: Optional[PreTrainedModel] = None,
decoder: Optional[MusicgenForCausalLM] = None,
):
# Constructor: takes a MusicgenConfig, or alternatively the text encoder, audio encoder and decoder sub-models
def tie_weights(self):
# Tie the text encoder and decoder weights when the config requests it
if self.config.tie_encoder_decoder:
decoder_base_model_prefix = self.decoder.base_model_prefix
self._tie_encoder_decoder_weights(
self.text_encoder, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix
)
def get_audio_encoder(self):
return self.audio_encoder
def get_text_encoder(self):
return self.text_encoder
def get_encoder(self):
# The text encoder computes the encoder hidden states for generation
return self.get_text_encoder()
def get_decoder(self):
return self.decoder
def get_input_embeddings(self):
# Input embeddings are those of the text encoder
return self.text_encoder.get_input_embeddings()
def get_output_embeddings(self):
# Output embeddings are those of the decoder
return self.decoder.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
return self.decoder.set_output_embeddings(new_embeddings)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r"""
Example:
```
>>> from transformers import MusicgenForConditionalGeneration
>>> model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
```"""
# Fast initialization is not yet supported for composite models
if kwargs.get("_fast_init", False):
logger.warning(
"Fast initialization is currently not supported for MusicgenForConditionalGeneration. "
"Falling back to slow initialization..."
)
kwargs["_fast_init"] = False
return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
@classmethod
def from_sub_models_pretrained(
cls,
text_encoder_pretrained_model_name_or_path: str = None,
audio_encoder_pretrained_model_name_or_path: str = None,
decoder_pretrained_model_name_or_path: str = None,
*model_args,
**kwargs,
):
# Instantiates the composite model from pretrained text encoder, audio encoder and decoder checkpoints
@add_start_docstrings_to_model_forward(MUSICGEN_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,  # input token ids
attention_mask: Optional[torch.BoolTensor] = None,  # encoder attention mask
input_values: Optional[torch.FloatTensor] = None,  # raw audio input values
padding_mask: Optional[torch.BoolTensor] = None,  # padding mask for the audio input
decoder_input_ids: Optional[torch.LongTensor] = None,  # decoder input token ids
decoder_attention_mask: Optional[torch.BoolTensor] = None,  # decoder attention mask
encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,  # precomputed encoder outputs
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,  # cached key/value states
inputs_embeds: Optional[torch.FloatTensor] = None,  # encoder input embeddings
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,  # decoder input embeddings
labels: Optional[torch.LongTensor] = None,  # target labels
use_cache: Optional[bool] = None,  # whether to use the key/value cache
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput dict
**kwargs,  # any additional keyword arguments
):
pass  # placeholder; the forward body is not shown in this excerpt
# Prepares the generation-loop inputs for the composite model
def prepare_inputs_for_generation(
self,
decoder_input_ids,  # decoder input token ids (required)
past_key_values=None,  # cached key/value states
attention_mask=None,  # encoder attention mask
head_mask=None,  # encoder head mask
decoder_attention_mask=None,  # decoder attention mask
decoder_head_mask=None,  # decoder head mask
cross_attn_head_mask=None,  # cross-attention head mask
use_cache=None,  # whether to use the key/value cache
encoder_outputs=None,  # precomputed encoder outputs
decoder_delay_pattern_mask=None,  # delay pattern mask for the decoder
guidance_scale=None,  # classifier-free guidance scale
**kwargs,  # any additional keyword arguments
):
# Build a decoder delay pattern mask if none was provided
if decoder_delay_pattern_mask is None:
decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
decoder_input_ids,
self.generation_config.pad_token_id,
max_length=self.generation_config.max_length,
)
# Apply the delay pattern mask to the decoder input ids
decoder_input_ids = self.decoder.apply_delay_pattern_mask(decoder_input_ids, decoder_delay_pattern_mask)
# For classifier-free guidance (guidance_scale > 1) duplicate the decoder inputs across the batch
# dimension (the two halves are split again before sampling)
if guidance_scale is not None and guidance_scale > 1:
decoder_input_ids = decoder_input_ids.repeat((2, 1))
if decoder_attention_mask is not None:
decoder_attention_mask = decoder_attention_mask.repeat((2, 1))
# With cached past key/values, strip the prefix that has already been processed
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input id
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to the old behavior: keep only the final id
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
# Return the inputs and masks consumed by the generation loop
return {
"input_ids": None,  # input_ids are not needed once encoder_outputs are defined
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def _prepare_decoder_input_ids_for_generation(
self,
batch_size: int,
model_input_name: str,
model_kwargs: Dict[str, torch.Tensor],
decoder_start_token_id: int = None,
bos_token_id: int = None,
device: torch.device = None,
) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
"""Prepares `decoder_input_ids` for generation with encoder-decoder models"""
# 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
# we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
# Retrieve `decoder_input_ids` from `model_kwargs` and remove it from the dictionary
decoder_input_ids = model_kwargs.pop("decoder_input_ids")
elif "input_ids" in model_kwargs and model_input_name != "input_ids":
# If `input_ids` is found in `model_kwargs` and it's not the main input name, assign it to `decoder_input_ids`
decoder_input_ids = model_kwargs.pop("input_ids")
else:
# If neither `decoder_input_ids` nor `input_ids` are provided, initialize `decoder_input_ids` as None
decoder_input_ids = None
# 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that.
# Get the special token ID to start `decoder_input_ids` sequence
decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
if device is None:
device = self.device
# Create a tensor to initialize `decoder_input_ids` starting with `decoder_start_token_id`
decoder_input_ids_start = (
torch.ones((batch_size * self.decoder.num_codebooks, 1), dtype=torch.long, device=device)
* decoder_start_token_id
)
# If no `decoder_input_ids` provided by the user, use `decoder_input_ids_start`
if decoder_input_ids is None:
decoder_input_ids = decoder_input_ids_start
# If user-provided `decoder_input_ids` does not start with `decoder_start_token_id`, prepend it
elif (decoder_input_ids[..., 0] != decoder_start_token_id).all().item():
decoder_input_ids = torch.cat([decoder_input_ids_start, decoder_input_ids], dim=-1)
# Adjust `decoder_attention_mask` if provided along with `decoder_input_ids`
if "decoder_attention_mask" in model_kwargs:
decoder_attention_mask = model_kwargs["decoder_attention_mask"]
decoder_attention_mask = torch.cat(
(torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
dim=-1,
)
model_kwargs["decoder_attention_mask"] = decoder_attention_mask
return decoder_input_ids, model_kwargs
# Signature restored from context: this helper prepares the text-encoder kwargs for generation
def _prepare_text_encoder_kwargs_for_generation(
self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None, guidance_scale: Optional[float] = None
) -> Dict[str, Any]:
# 1. Get the text encoder
encoder = self.get_text_encoder()
# 2. Prepare the encoder kwargs from the model kwargs, dropping decoder-specific arguments
irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
encoder_kwargs = {
argument: value
for argument, value in model_kwargs.items()
if not any(argument.startswith(p) for p in irrelevant_prefix)
}
# Inspect the encoder's forward signature
encoder_signature = set(inspect.signature(encoder.forward).parameters)
encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
# If the encoder does not accept wildcard kwargs, filter out anything not in its signature
if not encoder_accepts_wildcard:
encoder_kwargs = {
argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
}
# 3. Make sure the encoder returns a `ModelOutput`
model_input_name = model_input_name if model_input_name is not None else self.text_encoder.main_input_name
encoder_kwargs["return_dict"] = True
encoder_kwargs[model_input_name] = inputs_tensor
# Run the encoder forward pass and keep the last hidden state
last_hidden_state = encoder(**encoder_kwargs).last_hidden_state
# For classifier-free guidance (guidance_scale > 1) append a "null" all-zeros input to the encoder hidden states
if guidance_scale is not None and guidance_scale > 1:
last_hidden_state = torch.concatenate([last_hidden_state, torch.zeros_like(last_hidden_state)], dim=0)
if "attention_mask" in model_kwargs:
model_kwargs["attention_mask"] = torch.concatenate(
[model_kwargs["attention_mask"], torch.zeros_like(model_kwargs["attention_mask"])], dim=0
)
# Store the encoder output as a BaseModelOutput on the model kwargs
model_kwargs["encoder_outputs"] = BaseModelOutput(last_hidden_state=last_hidden_state)
return model_kwargs
def _prepare_audio_encoder_kwargs_for_generation(
self, input_values, model_kwargs, model_input_name: Optional[str] = None
):
raise NotImplementedError("This method is not implemented yet.")
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
# Build the decoder input ids by shifting the labels one position to the right
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def resize_token_embeddings(self, *args, **kwargs):
# Resizing the embeddings through the composite model directly is not supported; use the sub-models
raise NotImplementedError(
"Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the"
" respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
" model.decoder.resize_token_embeddings(...))"
)
def _maybe_initialize_input_ids_for_generation(
self,
inputs: Optional[torch.Tensor] = None,
bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
) -> torch.LongTensor:
"""Initializes input ids for generation, if necessary."""
# If inputs were already provided, return them as-is
if inputs is not None:
return inputs
# Check whether `encoder_outputs` are present in `model_kwargs`
encoder_outputs = model_kwargs.get("encoder_outputs")
if encoder_outputs is not None:
# Make dummy input_ids filled with the value -100, as a sanity check ensuring they won't be used for encoding
shape = encoder_outputs[0].size()[:-1]
return torch.ones(shape, dtype=torch.long, device=self.device) * -100
# If no `input_ids` are provided, a `bos_token_id` is required
if bos_token_id is None:
raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
# If there is some tensor in `model_kwargs`, infer the batch size from it. This is helpful for
# soft-prompting and decoder-based multimodal implementations.
batch_size = 1
for value in model_kwargs.values():
if isinstance(value, torch.Tensor):
batch_size = value.shape[0]
break
# Create a (batch_size, 1) tensor filled with bos_token_id on self.device
return torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * bos_token_id
# Helper to build null inputs for unconditional generation, so the model can run without a feature extractor or tokenizer
def get_unconditional_inputs(self, num_samples=1):
"""
Helper function to get null inputs for unconditional generation, enabling the model to be used without the
feature extractor or tokenizer.
Args:
num_samples (int, *optional*):
Number of audio samples to unconditionally generate.
max_new_tokens (int, *optional*):
Number of tokens to generate for each sample. More tokens means longer audio samples, at the expense of
longer inference (since more audio tokens need to be generated per sample).
Example:
```
>>> from transformers import MusicgenForConditionalGeneration
>>> model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
>>>
>>> unconditional_inputs = model.get_unconditional_inputs(num_samples=1)
>>> audio_samples = model.generate(**unconditional_inputs, max_new_tokens=256)
```"""
# All-zeros tensor standing in for the text-encoder hidden states, shape (num_samples, 1, hidden_size)
last_hidden_state = torch.zeros(
(num_samples, 1, self.config.text_encoder.hidden_size), device=self.device, dtype=self.dtype
)
# All-zeros attention mask of shape (num_samples, 1): every position is masked out
attention_mask = torch.zeros((num_samples, 1), device=self.device, dtype=torch.long)
# Package the null inputs in a MusicgenUnconditionalInput
return MusicgenUnconditionalInput(
encoder_outputs=(last_hidden_state,),  # encoder output containing the null hidden states
attention_mask=attention_mask,  # all-zeros mask: attend to nothing
guidance_scale=1.0,  # guidance scale of 1.0, i.e. no classifier-free guidance
)
.\models\musicgen\processing_musicgen.py
"""
Text/audio processor class for MusicGen
"""
from typing import List, Optional
import numpy as np
from ...processing_utils import ProcessorMixin
from ...utils import to_numpy
class MusicgenProcessor(ProcessorMixin):
r"""
Constructs a MusicGen processor which wraps an EnCodec feature extractor and a T5 tokenizer into a single processor
class.
[`MusicgenProcessor`] offers all the functionalities of [`EncodecFeatureExtractor`] and [`T5Tokenizer`]. See
[`~MusicgenProcessor.__call__`] and [`~MusicgenProcessor.decode`] for more information.
Args:
feature_extractor (`EncodecFeatureExtractor`):
An instance of [`EncodecFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`T5Tokenizer`):
An instance of [`T5Tokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "EncodecFeatureExtractor"
tokenizer_class = ("T5Tokenizer", "T5TokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
def __call__(self, *args, **kwargs):
"""
Forwards the `audio` argument to EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`] and the `text`
argument to [`~T5Tokenizer.__call__`]. Please refer to the docstring of the above two methods for more
information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if text is not None:
inputs = self.tokenizer(text, **kwargs)
if audio is not None:
audio_inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if audio is None:
return inputs
elif text is None:
return audio_inputs
else:
inputs["input_values"] = audio_inputs["input_values"]
if "padding_mask" in audio_inputs:
inputs["padding_mask"] = audio_inputs["padding_mask"]
return inputs
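Typical text-only usage of the processor (a short sketch; the facebook/musicgen-small repository hosts a matching processor configuration): the call is routed to the T5 tokenizer, so the result carries `input_ids` and `attention_mask`.

```
from transformers import MusicgenProcessor

processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
inputs = processor(text=["80s pop track with bassy drums and synth"], padding=True, return_tensors="pt")
print(list(inputs.keys()))  # ['input_ids', 'attention_mask']
```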
def batch_decode(self, *args, **kwargs):
"""
This method is used to decode either batches of audio outputs from the MusicGen model, or batches of token ids
from the tokenizer. In the case of decoding token ids, this method forwards all its arguments to T5Tokenizer's
[`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information.
"""
audio_values = kwargs.pop("audio", None)
padding_mask = kwargs.pop("padding_mask", None)
if len(args) > 0:
audio_values = args[0]
args = args[1:]
if audio_values is not None:
return self._decode_audio(audio_values, padding_mask=padding_mask)
else:
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to T5Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def _decode_audio(self, audio_values, padding_mask: Optional[np.ndarray] = None) -> List[np.ndarray]:
"""
This method strips any padding from the audio values to return a list of numpy audio arrays.
"""
audio_values = to_numpy(audio_values)
bsz, channels, seq_len = audio_values.shape
if padding_mask is None:
return list(audio_values)
padding_mask = to_numpy(padding_mask)
difference = seq_len - padding_mask.shape[-1]
padding_value = 1 - self.feature_extractor.padding_value
padding_mask = np.pad(padding_mask, ((0, 0), (0, difference)), "constant", constant_values=padding_value)
audio_values = audio_values.tolist()
for i in range(bsz):
sliced_audio = np.asarray(audio_values[i])[
padding_mask[i][None, :] != self.feature_extractor.padding_value
]
audio_values[i] = sliced_audio.reshape(channels, -1)
return audio_values
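# A toy sketch of the masking trick above (assuming EnCodec's default padding_value of
# 0.0): positions where the mask differs from the padding value are kept, which strips
# the trailing padding.
import numpy as np

audio = np.arange(6, dtype=np.float32).reshape(1, 1, 6)  # (bsz=1, channels=1, seq_len=6)
mask = np.array([[1, 1, 1, 1, 0, 0]])                    # last two samples are padding
kept = np.asarray(audio[0])[mask[0][None, :] != 0]
print(kept.reshape(1, -1))                               # [[0. 1. 2. 3.]]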
.\models\musicgen\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_musicgen": [
"MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP",
"MusicgenConfig",
"MusicgenDecoderConfig",
],
"processing_musicgen": ["MusicgenProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_musicgen"] = [
"MUSICGEN_PRETRAINED_MODEL_ARCHIVE_LIST",
"MusicgenForConditionalGeneration",
"MusicgenForCausalLM",
"MusicgenModel",
"MusicgenPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_musicgen import (
MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP,
MusicgenConfig,
MusicgenDecoderConfig,
)
from .processing_musicgen import MusicgenProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_musicgen import (
MUSICGEN_PRETRAINED_MODEL_ARCHIVE_LIST,
MusicgenForCausalLM,
MusicgenForConditionalGeneration,
MusicgenModel,
MusicgenPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\musicgen_melody\configuration_musicgen_melody.py
""" Musicgen Melody model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import AutoConfig
logger = logging.get_logger(__name__)
MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/musicgen-melody": "https://huggingface.co/facebook/musicgen-melody/resolve/main/config.json",
}
class MusicgenMelodyDecoderConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`MusicgenMelodyDecoder`]. It is used to instantiate a
Musicgen Melody decoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Musicgen Melody
[facebook/musicgen-melody](https://huggingface.co/facebook/musicgen-melody) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "musicgen_melody_decoder"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=2048,
max_position_embeddings=2048,
num_hidden_layers=24,
ffn_dim=4096,
num_attention_heads=16,
layerdrop=0.0,
use_cache=True,
activation_function="gelu",
hidden_size=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
initializer_factor=0.02,
scale_embedding=False,
num_codebooks=4,
audio_channels=1,
pad_token_id=2048,
bos_token_id=2048,
eos_token_id=None,
tie_word_embeddings=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.ffn_dim = ffn_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.initializer_factor = initializer_factor
self.layerdrop = layerdrop
self.use_cache = use_cache
self.scale_embedding = scale_embedding
self.num_codebooks = num_codebooks
if audio_channels not in [1, 2]:
raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {audio_channels} channels.")
self.audio_channels = audio_channels
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
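# A minimal instantiation sketch (values are illustrative, mirroring the stereo checkpoints):
config = MusicgenMelodyDecoderConfig(hidden_size=1536, num_codebooks=8, audio_channels=2)
print(config.ffn_dim, config.vocab_size)  # 4096 2048 (defaults are retained)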
class MusicgenMelodyConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MusicgenMelodyModel`]. It is used to instantiate a
Musicgen Melody model according to the specified arguments, defining the text encoder, audio encoder and Musicgen Melody decoder
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the Musicgen Melody
[facebook/musicgen-melody](https://huggingface.co/facebook/musicgen-melody) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_chroma (`int`, *optional*, defaults to 12): Number of chroma bins to use.
chroma_length (`int`, *optional*, defaults to 235):
Maximum chroma duration if audio is used to condition the model. Corresponds to the maximum duration used during training.
kwargs (*optional*):
Dictionary of keyword arguments. Notably:
- **text_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the text encoder config.
- **audio_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the audio encoder config.
- **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
the decoder config.
Example:
```
>>> from transformers import (
... MusicgenMelodyConfig,
... MusicgenMelodyDecoderConfig,
... T5Config,
... EncodecConfig,
... MusicgenMelodyForConditionalGeneration,
... )
>>> # Initializing text encoder, audio encoder, and decoder model configurations
>>> text_encoder_config = T5Config()
>>> audio_encoder_config = EncodecConfig()
>>> decoder_config = MusicgenMelodyDecoderConfig()
>>> configuration = MusicgenMelodyConfig.from_sub_models_config(
... text_encoder_config, audio_encoder_config, decoder_config
... )
>>> # Initializing a MusicgenMelodyForConditionalGeneration (with random weights) from the facebook/musicgen-melody style configuration
>>> model = MusicgenMelodyForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> config_text_encoder = model.config.text_encoder
>>> config_audio_encoder = model.config.audio_encoder
>>> config_decoder = model.config.decoder
>>> # Saving the model, including its configuration
>>> model.save_pretrained("musicgen_melody-model")
>>> # loading model and config from pretrained folder
>>> musicgen_melody_config = MusicgenMelodyConfig.from_pretrained("musicgen_melody-model")
>>> model = MusicgenMelodyForConditionalGeneration.from_pretrained("musicgen_melody-model", config=musicgen_melody_config)
```"""
# Model type identifier for this configuration
model_type = "musicgen_melody"
# Mark this configuration as a composition of several sub-model configurations
is_composition = True
def __init__(
self,
num_chroma=12,
chroma_length=235,
**kwargs,
):
super().__init__(**kwargs)
# Raise unless the text_encoder, audio_encoder and decoder configs were all provided
if "text_encoder" not in kwargs or "audio_encoder" not in kwargs or "decoder" not in kwargs:
raise ValueError("Config has to be initialized with text_encoder, audio_encoder and decoder config")
# Pop the text encoder config and its model type
text_encoder_config = kwargs.pop("text_encoder")
text_encoder_model_type = text_encoder_config.pop("model_type")
# Pop the audio encoder config and its model type
audio_encoder_config = kwargs.pop("audio_encoder")
audio_encoder_model_type = audio_encoder_config.pop("model_type")
# Pop the decoder config
decoder_config = kwargs.pop("decoder")
# Instantiate the sub-configs: AutoConfig resolves the encoder types, the decoder uses its dedicated class
self.text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder_config)
self.audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder_config)
self.decoder = MusicgenMelodyDecoderConfig(**decoder_config)
self.is_encoder_decoder = False
# Store the number of chroma bins and the maximum chroma length
self.num_chroma = num_chroma
self.chroma_length = chroma_length
@classmethod
def from_sub_models_config(
cls,
text_encoder_config: PretrainedConfig,
audio_encoder_config: PretrainedConfig,
decoder_config: MusicgenMelodyDecoderConfig,
**kwargs,
):
r"""
从文本编码器、音频编码器和解码器配置实例化一个 MusicgenMelodyConfig(或其派生类)。
Returns:
[`MusicgenMelodyConfig`]: 配置对象的一个实例
"""
# 使用给定的配置实例化当前类的对象
return cls(
text_encoder=text_encoder_config.to_dict(),
audio_encoder=audio_encoder_config.to_dict(),
decoder=decoder_config.to_dict(),
**kwargs,
)
@property
# This is a property because you may want to change the audio codec model dynamically
def sampling_rate(self):
# Return the sampling rate of the audio encoder
return self.audio_encoder.sampling_rate
.\models\musicgen_melody\convert_musicgen_melody_transformers.py
"""Convert Musicgen Melody checkpoints from the original repository."""
import argparse
from pathlib import Path
from typing import Dict, OrderedDict, Tuple
import torch
from audiocraft.models import MusicGen
from transformers import (
AutoTokenizer,
EncodecModel,
T5EncoderModel,
)
from transformers.models.musicgen_melody.configuration_musicgen_melody import MusicgenMelodyDecoderConfig
from transformers.models.musicgen_melody.feature_extraction_musicgen_melody import MusicgenMelodyFeatureExtractor
from transformers.models.musicgen_melody.modeling_musicgen_melody import (
MusicgenMelodyForCausalLM,
MusicgenMelodyForConditionalGeneration,
)
from transformers.models.musicgen_melody.processing_musicgen_melody import MusicgenMelodyProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
EXPECTED_MISSING_KEYS = ["model.decoder.embed_positions.weights"]
EXPECTED_ADDITIONAL_KEYS = ["condition_provider.conditioners.self_wav.chroma.spec.window"]
def rename_keys(name):
if "emb" in name:
name = name.replace("emb", "model.decoder.embed_tokens")
if "transformer" in name:
name = name.replace("transformer", "model.decoder")
if "cross_attention" in name:
name = name.replace("cross_attention", "encoder_attn")
if "linear1" in name:
name = name.replace("linear1", "fc1")
if "linear2" in name:
name = name.replace("linear2", "fc2")
if "norm1" in name:
name = name.replace("norm1", "self_attn_layer_norm")
if "norm_cross" in name:
name = name.replace("norm_cross", "encoder_attn_layer_norm")
if "norm2" in name:
name = name.replace("norm2", "final_layer_norm")
if "out_norm" in name:
name = name.replace("out_norm", "model.decoder.layer_norm")
if "linears" in name:
name = name.replace("linears", "lm_heads")
if "condition_provider.conditioners.description.output_proj" in name:
name = name.replace("condition_provider.conditioners.description.output_proj", "enc_to_dec_proj")
if "condition_provider.conditioners.self_wav.output_proj" in name:
name = name.replace("condition_provider.conditioners.self_wav.output_proj", "audio_enc_to_dec_proj")
return name
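# A quick sanity check of the renaming rules above (the key is illustrative):
print(rename_keys("transformer.layers.0.linear1.weight"))
# -> model.decoder.layers.0.fc1.weight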
def rename_state_dict(state_dict: OrderedDict, hidden_size: int) -> Tuple[Dict, Dict]:
"""Function that takes the fairseq MusicgenMelody state dict and renames it according to the HF
module names. It further partitions the state dict into the decoder (LM) state dict, and that for the
text encoder projection and for the audio encoder projection."""
keys = list(state_dict.keys())
enc_dec_proj_state_dict = {}
audio_enc_to_dec_proj_state_dict = {}
for key in keys:
val = state_dict.pop(key)
key = rename_keys(key)
if "in_proj_weight" in key:
state_dict[key.replace("in_proj_weight", "q_proj.weight")] = val[:hidden_size, :]
state_dict[key.replace("in_proj_weight", "k_proj.weight")] = val[hidden_size : 2 * hidden_size, :]
state_dict[key.replace("in_proj_weight", "v_proj.weight")] = val[-hidden_size:, :]
elif "audio_enc_to_dec_proj" in key:
audio_enc_to_dec_proj_state_dict[key[len("audio_enc_to_dec_proj.") :]] = val
elif "enc_to_dec_proj" in key:
enc_dec_proj_state_dict[key[len("enc_to_dec_proj.") :]] = val
else:
state_dict[key] = val
return state_dict, enc_dec_proj_state_dict, audio_enc_to_dec_proj_state_dict
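# A toy sketch of the fused-QKV split above: audiocraft stores `in_proj_weight` as a
# single (3 * hidden_size, hidden_size) matrix that is cut into equal q/k/v blocks.
import torch

hidden_size = 4
fused = torch.randn(3 * hidden_size, hidden_size)
q = fused[:hidden_size, :]
k = fused[hidden_size : 2 * hidden_size, :]
v = fused[-hidden_size:, :]
assert q.shape == k.shape == v.shape == (hidden_size, hidden_size)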
def decoder_config_from_checkpoint(checkpoint: str) -> MusicgenMelodyDecoderConfig:
if checkpoint == "facebook/musicgen-melody" or checkpoint == "facebook/musicgen-stereo-melody":
hidden_size = 1536
num_hidden_layers = 48
num_attention_heads = 24
elif checkpoint == "facebook/musicgen-melody-large" or checkpoint == "facebook/musicgen-stereo-melody-large":
hidden_size = 2048
num_hidden_layers = 48
num_attention_heads = 32
else:
raise ValueError(
"Checkpoint should be one of `['facebook/musicgen-melody', 'facebook/musicgen-melody-large']` for the mono checkpoints, "
"or `['facebook/musicgen-stereo-melody', 'facebook/musicgen-stereo-melody-large']` "
f"for the stereo checkpoints, got {checkpoint}."
)
if "stereo" in checkpoint:
audio_channels = 2
num_codebooks = 8
else:
audio_channels = 1
num_codebooks = 4
config = MusicgenMelodyDecoderConfig(
hidden_size=hidden_size,
ffn_dim=hidden_size * 4,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_codebooks=num_codebooks,
audio_channels=audio_channels,
)
return config
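# A quick usage sketch of the checkpoint-to-config mapping above:
config = decoder_config_from_checkpoint("facebook/musicgen-stereo-melody")
print(config.hidden_size, config.num_codebooks, config.audio_channels)  # 1536 8 2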
@torch.no_grad()
def convert_musicgen_melody_checkpoint(
checkpoint, pytorch_dump_folder=None, repo_id=None, device="cpu", test_same_output=False
):
fairseq_model = MusicGen.get_pretrained(checkpoint, device=device)  # use the `device` argument, not the global `args`
decoder_config = decoder_config_from_checkpoint(checkpoint)
decoder_state_dict = fairseq_model.lm.state_dict()
decoder_state_dict, enc_dec_proj_state_dict, audio_enc_to_dec_proj_state_dict = rename_state_dict(
decoder_state_dict, hidden_size=decoder_config.hidden_size
)
text_encoder = T5EncoderModel.from_pretrained("t5-base")
audio_encoder = EncodecModel.from_pretrained("facebook/encodec_32khz")
decoder = MusicgenMelodyForCausalLM(decoder_config).eval()
missing_keys, unexpected_keys = decoder.load_state_dict(decoder_state_dict, strict=False)
for key in missing_keys.copy():
if key.startswith(("text_encoder", "audio_encoder")) or key in EXPECTED_MISSING_KEYS:
missing_keys.remove(key)
for key in unexpected_keys.copy():
if key in EXPECTED_ADDITIONAL_KEYS:
unexpected_keys.remove(key)
if len(missing_keys) > 0:
raise ValueError(f"Missing key(s) in state_dict: {missing_keys}")
if len(unexpected_keys) > 0:
raise ValueError(f"Unexpected key(s) in state_dict: {unexpected_keys}")
model = MusicgenMelodyForConditionalGeneration(
text_encoder=text_encoder, audio_encoder=audio_encoder, decoder=decoder
).to(device)
model.enc_to_dec_proj.load_state_dict(enc_dec_proj_state_dict)
model.audio_enc_to_dec_proj.load_state_dict(audio_enc_to_dec_proj_state_dict)
input_ids = torch.arange(0, 2 * decoder_config.num_codebooks, dtype=torch.long).reshape(2, -1).to(device)
decoder_input_ids = input_ids.reshape(2 * decoder_config.num_codebooks, -1).to(device)
with torch.no_grad():
logits = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
output_length = 1 + input_ids.shape[1] + model.config.chroma_length
if logits.shape != (2 * decoder_config.num_codebooks, output_length, 2048):
raise ValueError("Incorrect shape for logits")
tokenizer = AutoTokenizer.from_pretrained("t5-base")
feature_extractor = MusicgenMelodyFeatureExtractor()
processor = MusicgenMelodyProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
model.generation_config.decoder_start_token_id = 2048
model.generation_config.pad_token_id = 2048
model.generation_config.max_length = int(30 * audio_encoder.config.frame_rate)
model.generation_config.do_sample = True
model.generation_config.guidance_scale = 3.0
if test_same_output:
decoder_input_ids = torch.ones_like(decoder_input_ids).to(device) * model.generation_config.pad_token_id
with torch.no_grad():
decoder_input_ids = decoder_input_ids[: decoder_config.num_codebooks]
inputs = processor(text=["gen"], return_tensors="pt", padding=True).to(device)
logits = model(**inputs, decoder_input_ids=decoder_input_ids).logits
attributes, prompt_tokens = fairseq_model._prepare_tokens_and_attributes(["gen"], None)
original_logits = fairseq_model.lm.forward(
decoder_input_ids.reshape(1, decoder_config.num_codebooks, -1), attributes
)
torch.testing.assert_close(
original_logits.squeeze(2).reshape(decoder_config.num_codebooks, -1),
logits[:, -1],
rtol=1e-5,
atol=5e-5,
)
if pytorch_dump_folder is not None:
Path(pytorch_dump_folder).mkdir(exist_ok=True)
logger.info(f"Saving model {checkpoint} to {pytorch_dump_folder}")
model.save_pretrained(pytorch_dump_folder)
processor.save_pretrained(pytorch_dump_folder)
if repo_id:
logger.info(f"Pushing model {checkpoint} to {repo_id}")
model.push_to_hub(repo_id, create_pr=True)
processor.push_to_hub(repo_id, create_pr=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint",
default="facebook/musicgen-melody",
type=str,
help="Checkpoint size of the Musicgen Melody model you'd like to convert. Can be one of: "
"`['facebook/musicgen-melody', 'facebook/musicgen-melody-large']` for the mono checkpoints, or "
"`['facebook/musicgen-stereo-melody', 'facebook/musicgen-stereo-melody-large']` "
"for the stereo checkpoints.",
)
parser.add_argument(
"--pytorch_dump_folder",
default=None,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub",
default="musicgen-melody",
type=str,
help="Where to upload the converted model on the 🤗 hub.",
)
parser.add_argument(
"--device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda."
)
parser.add_argument("--test_same_output", default=False, type=bool, help="If `True`, test if same output logits.")
args = parser.parse_args()
convert_musicgen_melody_checkpoint(
args.checkpoint, args.pytorch_dump_folder, args.push_to_hub, args.device, args.test_same_output
)
.\models\musicgen_melody\feature_extraction_musicgen_melody.py
"""
Feature extractor class for Musicgen Melody
"""
import copy
from typing import Any, Dict, List, Optional, Union
import numpy as np
from ...audio_utils import chroma_filter_bank
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import TensorType, is_torch_available, is_torchaudio_available, logging
if is_torch_available():
import torch
if is_torchaudio_available():
import torchaudio
logger = logging.get_logger(__name__)
class MusicgenMelodyFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a MusicgenMelody feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.
This class extracts chroma features from audio processed by [Demucs](https://github.com/adefossez/demucs/tree/main) or
directly from raw audio waveform.
"""
model_input_names = ["input_features"]
def __init__(
self,
feature_size=12,
sampling_rate=32000,
hop_length=4096,
chunk_length=30,
n_fft=16384,
num_chroma=12,
padding_value=0.0,
return_attention_mask=False,
stem_indices=[3, 2],
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
return_attention_mask=return_attention_mask,
**kwargs,
)
self.n_fft = n_fft
self.hop_length = hop_length
self.chunk_length = chunk_length
self.n_samples = chunk_length * sampling_rate
self.sampling_rate = sampling_rate
self.chroma_filters = torch.from_numpy(
chroma_filter_bank(sampling_rate=sampling_rate, num_frequency_bins=n_fft, tuning=0, num_chroma=num_chroma)
).float()
self.spectrogram = torchaudio.transforms.Spectrogram(
n_fft=n_fft, win_length=n_fft, hop_length=hop_length, power=2, center=True, pad=0, normalized=True
)
self.stem_indices = stem_indices
def _torch_extract_fbank_features(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute the chroma spectrogram of the provided audio using the torchaudio spectrogram implementation and the librosa chroma features.
"""
wav_length = waveform.shape[-1]
if wav_length < self.n_fft:
pad = self.n_fft - wav_length
rest = 0 if pad % 2 == 0 else 1
waveform = torch.nn.functional.pad(waveform, (pad // 2, pad // 2 + rest), "constant", 0)
spec = self.spectrogram(waveform).squeeze(1)
raw_chroma = torch.einsum("cf, ...ft->...ct", self.chroma_filters, spec)
norm_chroma = torch.nn.functional.normalize(raw_chroma, p=float("inf"), dim=-2, eps=1e-6)
norm_chroma = norm_chroma.transpose(1, 2)
idx = norm_chroma.argmax(-1, keepdim=True)
norm_chroma[:] = 0
norm_chroma.scatter_(dim=-1, index=idx, value=1)
return norm_chroma
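# A toy sketch of the argmax quantization at the end of this method: each frame is
# reduced to a one-hot chroma vector.
import torch

chroma = torch.tensor([[[0.2, 0.9, 0.1], [0.5, 0.1, 0.4]]])  # (batch, time, num_chroma)
idx = chroma.argmax(-1, keepdim=True)
chroma[:] = 0
chroma.scatter_(dim=-1, index=idx, value=1)
print(chroma)  # tensor([[[0., 1., 0.], [1., 0., 0.]]])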
def _extract_stem_indices(self, audio, sampling_rate=None):
"""
Extracts stems from the output of the [Demucs](https://github.com/adefossez/demucs/tree/main) audio separation model,
then converts to mono-channel and resample to the feature extractor sampling rate.
Args:
audio (`torch.Tensor` of shape `(batch_size, num_stems, channel_size, audio_length)`):
The output of the Demucs model to be processed.
sampling_rate (`int`, *optional*):
Demucs sampling rate. If not specified, defaults to `44000`.
"""
sampling_rate = 44000 if sampling_rate is None else sampling_rate
wav = audio[:, torch.tensor(self.stem_indices)]
wav = wav.sum(1)
wav = wav.mean(dim=1, keepdim=True)
if sampling_rate != self.sampling_rate:
wav = torchaudio.functional.resample(
wav, sampling_rate, self.sampling_rate, rolloff=0.945, lowpass_filter_width=24
)
wav = wav.squeeze(1)
return wav
def __call__(
self,
audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
truncation: bool = True,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = None,
padding: Optional[str] = True,
max_length: Optional[int] = None,
sampling_rate: Optional[int] = None,
**kwargs,
):
"""
调用函数,用于处理音频数据并返回处理后的结果。
Parameters:
audio (Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]):
输入的音频数据,可以是 numpy 数组,列表或嵌套列表形式。
truncation (bool, optional):
是否对音频进行截断,默认为 True。
pad_to_multiple_of (Optional[int], optional):
可选参数,对音频进行填充的倍数。
return_tensors (Optional[Union[str, TensorType]], optional):
可选参数,指定返回的数据类型,如字符串或张量类型。
return_attention_mask (Optional[bool], optional):
可选参数,是否返回注意力掩码。
padding (Optional[str], optional):
可选参数,是否进行填充,默认为 True。
max_length (Optional[int], optional):
可选参数,最大长度限制。
sampling_rate (Optional[int], optional):
可选参数,采样率。
**kwargs:
其他关键字参数。
Returns:
返回处理后的音频数据或特征。
"""
pass
def to_dict(self) -> Dict[str, Any]:
"""
将当前实例序列化为 Python 字典。
Returns:
`Dict[str, Any]`: 包含所有配置实例属性的字典。
"""
output = copy.deepcopy(self.__dict__)
output["feature_extractor_type"] = self.__class__.__name__
if "mel_filters" in output:
del output["mel_filters"]
if "window" in output:
del output["window"]
if "chroma_filters" in output:
del output["chroma_filters"]
if "spectrogram" in output:
del output["spectrogram"]
return output
.\models\musicgen_melody\modeling_musicgen_melody.py
""" PyTorch Musicgen Melody model."""
import copy
import inspect
import math
import random
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...generation.configuration_utils import GenerationConfig
from ...generation.logits_process import ClassifierFreeGuidanceLogitsProcessor, LogitsProcessorList
from ...generation.stopping_criteria import StoppingCriteriaList
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPast,
ModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ..auto.configuration_auto import AutoConfig
from ..auto.modeling_auto import AutoModel, AutoModelForTextEncoding
from .configuration_musicgen_melody import MusicgenMelodyConfig, MusicgenMelodyDecoderConfig
if TYPE_CHECKING:
from ...generation.streamers import BaseStreamer
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "MusicgenMelodyConfig"
_CHECKPOINT_FOR_DOC = "facebook/musicgen-melody"
MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/musicgen-melody",
]
@dataclass
class MusicgenMelodyOutputWithPast(ModelOutput):
"""
Base class for Musicgen Melody autoregressive outputs.
"""
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
语言建模损失(在提供 `labels` 时返回)。
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
语言建模头的预测分数(SoftMax 之前的每个词汇标记的分数)。
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, 当传递 `use_cache=True` 或 `config.use_cache=True` 时返回):
长度为 `config.n_layers` 的 `tuple(torch.FloatTensor)` 的元组,每个元组包含 2 个张量,形状为
`(batch_size, num_heads, sequence_length, embed_size_per_head)`。
包含预计算的隐藏状态(在自注意力块中的键和值),可用于加速顺序解码。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当传递 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
`torch.FloatTensor` 的元组(如果模型有嵌入层则包含嵌入层的输出 + 每层的输出),形状为 `(batch_size, sequence_length, hidden_size)`。
每层模型的隐藏状态以及可选的初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, 当传递 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
`torch.FloatTensor` 的元组(每个层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
自注意力头中注意力 softmax 后的注意力权重,用于计算自注意力头中加权平均值。
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
条件隐藏状态序列,表示文本编码器输出和音频编码器输出的投影连接。
作为条件信号使用。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_hidden_states: Optional[torch.FloatTensor] = None
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
if decoder_start_token_id is None:
raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
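# A toy check of the shift above (2048 is the MusicGen pad/decoder-start token id used
# later in this file; the example values are illustrative):
import torch

labels = torch.tensor([[5, 6, 7]])
print(shift_tokens_right(labels, pad_token_id=2048, decoder_start_token_id=2048))
# tensor([[2048,    5,    6]])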
class MusicgenMelodySinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int):
super().__init__()
self.embedding_dim = embedding_dim
self.make_weights(num_positions, embedding_dim)
def make_weights(self, num_embeddings: int, embedding_dim: int):
emb_weights = self.get_embedding(num_embeddings, embedding_dim)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int):
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.cos(emb), torch.sin(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
return emb.to(torch.get_default_dtype())
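# A quick shape check of the sinusoidal table built above (a usage sketch; `get_embedding`
# is a staticmethod, so it can be called without instantiating the module):
weights = MusicgenMelodySinusoidalPositionalEmbedding.get_embedding(num_embeddings=10, embedding_dim=8)
print(weights.shape)  # torch.Size([10, 8])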
@torch.no_grad()
def forward(self, inputs_embeds: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len, _ = inputs_embeds.size()
position_ids = (torch.arange(seq_len) + past_key_values_length).to(inputs_embeds.device)
if seq_len > self.weights.size(0):
self.make_weights(seq_len + self.offset, self.embedding_dim)
return self.weights.index_select(0, position_ids.view(-1)).detach()
class MusicgenMelodyAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[MusicgenMelodyConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
"""重塑张量形状以适应多头注意力的输入要求"""
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
"""
执行注意力机制的前向传播
Args:
hidden_states: 输入的隐藏状态张量
key_value_states: 可选的键值状态张量(用于encoder-decoder注意力)
past_key_value: 可选的过去的键值对(用于加速Transformer解码器的计算)
attention_mask: 可选的注意力掩码张量
layer_head_mask: 可选的层级头掩码张量(用于控制每个头的选择性)
output_attentions: 是否输出注意力权重
Returns:
tuple:
- attention_output: 经过注意力机制后的输出张量
- attention_weights: 注意力权重(如果output_attentions为True时)
"""
pass
class MusicgenMelodyDecoderLayer(nn.Module):
def __init__(self, config: MusicgenMelodyDecoderConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = MusicgenMelodyAttention(
embed_dim=self.embed_dim,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
bias=False,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=False)
self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=False)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size `(attention_heads,)`.
past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class MusicgenMelodyPreTrainedModel(PreTrainedModel):
"""
用于处理权重初始化、下载和加载预训练模型的抽象类。
Attributes:
config_class: 与该模型相关的配置类 MusicgenMelodyDecoderConfig
base_model_prefix: 模型的基础名称前缀为 "model"
supports_gradient_checkpointing: 支持梯度检查点
_no_split_modules: 不需要拆分的模块列表,包括 "MusicgenMelodyDecoderLayer" 和 "MusicgenMelodyAttention"
"""
def _init_weights(self, module):
"""
初始化给定模块的权重。
Args:
module: 要初始化权重的模块
Notes:
根据模块类型不同,使用配置的初始化因子初始化权重和偏置。
"""
std = self.config.initializer_factor
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
MUSICGEN_MELODY_START_DOCSTRING = r"""
The Musicgen Melody model was proposed in [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet et al. It is a decoder-only transformer for conditional music generation.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its models (such as downloading or saving a model, resizing the input embeddings, pruning heads etc.).
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and behavior.
Parameters:
config ([`MusicgenMelodyConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MUSICGEN_MELODY_INPUTS_DOCSTRING = r"""
"""
MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING = r"""
"""
class MusicgenMelodyDecoder(MusicgenMelodyPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MusicgenMelodyDecoderLayer`].
"""
def __init__(self, config: MusicgenMelodyDecoderConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.layerdrop
self.max_target_positions = config.max_position_embeddings
self.d_model = config.hidden_size
self.num_codebooks = config.num_codebooks
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
embed_dim = config.vocab_size + 1
self.embed_tokens = nn.ModuleList(
[nn.Embedding(embed_dim, config.hidden_size) for _ in range(config.num_codebooks)]
)
self.embed_positions = MusicgenMelodySinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.hidden_size,
)
self.layers = nn.ModuleList([MusicgenMelodyDecoderLayer(config) for _ in range(config.num_hidden_layers)])
self.layer_norm = nn.LayerNorm(config.hidden_size)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass  # body elided in this walkthrough
@add_start_docstrings(
"The bare MusicgenMelody decoder model outputting raw hidden-states without any specific head on top.",
MUSICGEN_MELODY_START_DOCSTRING,
)
class MusicgenMelodyModel(MusicgenMelodyPreTrainedModel):
def __init__(self, config: MusicgenMelodyDecoderConfig):
super().__init__(config)
self.decoder = MusicgenMelodyDecoder(config)
self.post_init()
def get_input_embeddings(self):
return self.decoder.embed_tokens
def set_input_embeddings(self, value):
self.decoder.embed_tokens = value
def get_decoder(self):
return self.decoder
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
decoder_outputs = self.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return decoder_outputs
return BaseModelOutputWithPast(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
hidden_states=decoder_outputs.hidden_states,
attentions=decoder_outputs.attentions,
)
@add_start_docstrings(
"The Musicgen Melody decoder model with a language modelling head on top.",
MUSICGEN_MELODY_START_DOCSTRING,
)
class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel):
def __init__(self, config: MusicgenMelodyDecoderConfig):
super().__init__(config)
self.model = MusicgenMelodyModel(config)
self.num_codebooks = config.num_codebooks
self.lm_heads = nn.ModuleList(
[nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(config.num_codebooks)]
)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_heads
def set_output_embeddings(self, new_embeddings):
self.lm_heads = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@add_start_docstrings_to_model_forward(MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MusicgenMelodyOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
) -> Union[Tuple, MusicgenMelodyOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
Returns:
Tuple or MusicgenMelodyOutputWithPast: Depending on `return_dict`, returns either a tuple or an instance
of `MusicgenMelodyOutputWithPast`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
lm_logits = torch.stack([head(hidden_states) for head in self.lm_heads], dim=1)
loss = None
if labels is not None:
raise NotImplementedError("Training is not implemented for MusicgenMelody.")
lm_logits = lm_logits.reshape(-1, *lm_logits.shape[2:])
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return MusicgenMelodyOutputWithPast(
loss=loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
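# A toy sketch of the per-codebook head stacking in `forward` above: K linear heads yield
# (bsz, K, seq_len, vocab) logits, after which the batch and codebook axes are merged.
import torch

bsz, seq_len, hidden, vocab, num_codebooks = 2, 3, 4, 5, 2
hidden_states = torch.randn(bsz, seq_len, hidden)
heads = [torch.nn.Linear(hidden, vocab, bias=False) for _ in range(num_codebooks)]
lm_logits = torch.stack([head(hidden_states) for head in heads], dim=1)  # (2, 2, 3, 5)
print(lm_logits.reshape(-1, *lm_logits.shape[2:]).shape)  # torch.Size([4, 3, 5])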
def prepare_inputs_for_generation(
self,
input_ids,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
past_key_values=None,
use_cache=True,
delay_pattern_mask=None,
guidance_scale=None,
**kwargs,
):
"""
Prepare inputs for the generation process, tailored for Music generation tasks.
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The input token IDs.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding tokens.
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Hidden states from the encoder.
encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid attending to encoder padding tokens.
head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask for the attention heads.
past_key_values (tuple of `torch.Tensor` of shape `(batch_size, num_heads, past_sequence_length, embed_size_per_head)`):
Cached key and value states for fast decoding.
use_cache (bool, *optional*):
Whether to use the caching mechanism for fast decoding.
delay_pattern_mask (`torch.Tensor`, *optional*):
Mask encoding the per-codebook delay pattern applied to the decoder input ids during generation.
guidance_scale (float, *optional*):
Scaling factor for guidance during generation.
Returns:
dict: Dictionary containing prepared inputs for the generation process.
"""
if delay_pattern_mask is None:
input_ids, delay_pattern_mask = self.build_delay_pattern_mask(
input_ids,
pad_token_id=self.generation_config.pad_token_id,
max_length=self.generation_config.max_length,
)
input_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
if guidance_scale is not None and guidance_scale > 1:
input_ids = input_ids.repeat((2, 1))
if attention_mask is not None:
attention_mask = attention_mask.repeat((2, 1))
if encoder_hidden_states is not None:
encoder_hidden_states = torch.concatenate(
[encoder_hidden_states, torch.zeros_like(encoder_hidden_states)], dim=0
)
if encoder_attention_mask is not None:
encoder_attention_mask = torch.concatenate(
[encoder_attention_mask, torch.zeros_like(encoder_attention_mask)], dim=0
)
if past_key_values is not None:
input_ids = input_ids[:, -1:]
encoder_hidden_states = None
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"encoder_hidden_states": encoder_hidden_states,
"encoder_attention_mask": encoder_attention_mask,
"head_mask": head_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def apply_delay_pattern_mask(input_ids, decoder_pad_token_mask):
"""Apply a delay pattern mask to the decoder input ids, only preserving predictions where
the mask is set to -1, and otherwise setting to the value detailed in the mask."""
seq_len = input_ids.shape[-1]
decoder_pad_token_mask = decoder_pad_token_mask[..., :seq_len]
input_ids = torch.where(decoder_pad_token_mask == -1, input_ids, decoder_pad_token_mask)
return input_ids
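# A toy check of the mask semantics above: -1 entries keep the model's token, any other
# value overwrites it (typically with the pad token id). A usage sketch:
import torch

input_ids = torch.tensor([[10, 11, 12]])
mask = torch.tensor([[-1, 2048, -1]])
print(MusicgenMelodyForCausalLM.apply_delay_pattern_mask(input_ids, mask))
# tensor([[  10, 2048,   12]])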
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
synced_gpus: Optional[bool] = None,
streamer: Optional["BaseStreamer"] = None,
**kwargs,
):
pass  # body elided in this walkthrough
@add_start_docstrings(
"The composite Musicgen Melody model with a text and audio conditional models, a MusicgenMelody decoder and an audio encoder, "
"for music generation tasks with one or both of text and audio prompts.",
MUSICGEN_MELODY_START_DOCSTRING,
"""
text_encoder (`Optional[PreTrainedModel]`, *optional*): Text encoder.
audio_encoder (`Optional[PreTrainedModel]`, *optional*): Audio code decoder.
decoder (`Optional[MusicgenMelodyForCausalLM]`, *optional*): MusicGen Melody decoder used to generate audio codes.
"""
)
class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
config_class = MusicgenMelodyConfig
main_input_name = "input_ids"
supports_gradient_checkpointing = True
def __init__(
self,
config: MusicgenMelodyConfig = None,
text_encoder: Optional[PreTrainedModel] = None,
audio_encoder: Optional[PreTrainedModel] = None,
decoder: Optional[MusicgenMelodyForCausalLM] = None,
):
if config is None and None in (text_encoder, audio_encoder, decoder):
raise ValueError(
"Either a configuration has to be provided, or all three of text encoder, audio encoder and Musicgen Melody decoder."
)
if config is None:
config = MusicgenMelodyConfig.from_sub_models_config(
text_encoder.config, audio_encoder.config, decoder.config
)
else:
if not isinstance(config, self.config_class):
raise ValueError(f"Config: {config} has to be of type {self.config_class}")
super().__init__(config)
if text_encoder is None:
text_encoder = AutoModelForTextEncoding.from_config(config.text_encoder)
if audio_encoder is None:
audio_encoder = AutoModel.from_config(config.audio_encoder)
if decoder is None:
decoder = MusicgenMelodyForCausalLM(config.decoder)
self.text_encoder = text_encoder
self.audio_encoder = audio_encoder
self.decoder = decoder
self.text_encoder.config = self.config.text_encoder
self.audio_encoder.config = self.config.audio_encoder
self.decoder.config = self.config.decoder
if self.text_encoder.get_output_embeddings() is not None:
raise ValueError(
f"The encoder {self.text_encoder} should not have a LM Head. Please use a model without and LM Head"
)
if self.text_encoder.config.hidden_size != self.decoder.config.hidden_size:
self.enc_to_dec_proj = nn.Linear(self.text_encoder.config.hidden_size, self.decoder.config.hidden_size)
if self.config.num_chroma != self.decoder.config.hidden_size:
self.audio_enc_to_dec_proj = nn.Linear(self.config.num_chroma, self.decoder.config.hidden_size)
self.post_init()
def _init_weights(self, module):
std = self.decoder.config.initializer_factor
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
def tie_weights(self):
if self.config.tie_encoder_decoder:
decoder_base_model_prefix = self.decoder.base_model_prefix
self._tie_encoder_decoder_weights(
self.text_encoder, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix
)
def get_text_encoder(self):
return self.text_encoder
def get_encoder(self):
return self.get_text_encoder()
def get_decoder(self):
return self.decoder
def get_input_embeddings(self):
return self.text_encoder.get_input_embeddings()
def get_output_embeddings(self):
return self.decoder.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
return self.decoder.set_output_embeddings(new_embeddings)
@classmethod
def from_sub_models_pretrained(
cls,
text_encoder_pretrained_model_name_or_path: str = None,
audio_encoder_pretrained_model_name_or_path: str = None,
decoder_pretrained_model_name_or_path: str = None,
*model_args,
**kwargs,
):
pass  # body elided in this walkthrough
@add_start_docstrings_to_model_forward(MUSICGEN_MELODY_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MusicgenMelodyOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.BoolTensor] = None,
input_features: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
pass  # body elided in this walkthrough
def prepare_inputs_for_generation(
self,
decoder_input_ids,
encoder_hidden_states=None,
past_key_values=None,
attention_mask=None,
decoder_attention_mask=None,
decoder_head_mask=None,
use_cache=None,
decoder_delay_pattern_mask=None,
guidance_scale=None,
**kwargs,
):
if decoder_delay_pattern_mask is None:
decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
decoder_input_ids,
self.generation_config.pad_token_id,
max_length=self.generation_config.max_length,
)
decoder_input_ids = self.decoder.apply_delay_pattern_mask(decoder_input_ids, decoder_delay_pattern_mask)
if guidance_scale is not None and guidance_scale > 1:
decoder_input_ids = decoder_input_ids.repeat((2, 1))
if decoder_attention_mask is not None:
decoder_attention_mask = decoder_attention_mask.repeat((2, 1))
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
encoder_hidden_states = None
return {
"input_ids": None,
"encoder_hidden_states": encoder_hidden_states,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"decoder_head_mask": decoder_head_mask,
"use_cache": use_cache,
}
def _prepare_decoder_input_ids_for_generation(
self,
batch_size: int,
model_input_name: str,
model_kwargs: Dict[str, torch.Tensor],
decoder_start_token_id: int = None,
bos_token_id: int = None,
device: torch.device = None,
) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
"""为使用编码器-解码器模型生成准备 `decoder_input_ids`"""
if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
decoder_input_ids = model_kwargs.pop("decoder_input_ids")
elif "input_ids" in model_kwargs and model_input_name != "input_ids":
decoder_input_ids = model_kwargs.pop("input_ids")
else:
decoder_input_ids = None
decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
if device is None:
device = self.device
decoder_input_ids_start = (
torch.ones((batch_size * self.decoder.num_codebooks, 1), dtype=torch.long, device=device)
* decoder_start_token_id
)
if decoder_input_ids is None:
decoder_input_ids = decoder_input_ids_start
elif (decoder_input_ids[..., 0] != decoder_start_token_id).all().item():
decoder_input_ids = torch.cat([decoder_input_ids_start, decoder_input_ids], dim=-1)
if "decoder_attention_mask" in model_kwargs:
decoder_attention_mask = model_kwargs["decoder_attention_mask"]
decoder_attention_mask = torch.cat(
(torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
dim=-1,
)
model_kwargs["decoder_attention_mask"] = decoder_attention_mask
return decoder_input_ids, model_kwargs
def _prepare_encoder_hidden_states_kwargs_for_generation(
self,
inputs_tensor: torch.Tensor,
model_kwargs,
model_input_name: Optional[str] = None,
guidance_scale: Optional[float] = None,
):
"""为生成准备编码器隐藏状态的参数"""
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
"""根据标签准备解码器的输入ids"""
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def resize_token_embeddings(self, *args, **kwargs):
"""调整标记嵌入大小的方法,通过 EncoderDecoderModel 直接不支持。请使用包装对象的相应方法(model.encoder.resize_token_embeddings(...) 或 model.decoder.resize_token_embeddings(...))"""
raise NotImplementedError(
"Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the"
" respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
" model.decoder.resize_token_embeddings(...))"
)
def _maybe_initialize_input_ids_for_generation(
self,
inputs: Optional[torch.Tensor] = None,
bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
) -> torch.LongTensor:
"""Initializes input ids for generation, if necessary."""
if inputs is not None:
return inputs
if bos_token_id is None:
raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
batch_size = 1
for value in model_kwargs.values():
if isinstance(value, torch.Tensor):
batch_size = value.shape[0]
break
return torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * bos_token_id
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
synced_gpus: Optional[bool] = None,
streamer: Optional["BaseStreamer"] = None,
**kwargs,
):
"""Generates sequences using the model."""
def _update_model_kwargs_for_generation(
self,
outputs: ModelOutput,
model_kwargs: Dict[str, Any],
is_encoder_decoder: bool = False,
standardize_cache_format: bool = False,
model_inputs: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""Updates model keyword arguments for generation."""
model_kwargs["past_key_values"] = self._extract_past_from_model_output(
outputs, standardize_cache_format=standardize_cache_format
)
if getattr(outputs, "state", None) is not None:
model_kwargs["state"] = outputs.state
if "token_type_ids" in model_kwargs:
token_type_ids = model_kwargs["token_type_ids"]
model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
if "decoder_attention_mask" in model_kwargs:
decoder_attention_mask = model_kwargs["decoder_attention_mask"]
model_kwargs["decoder_attention_mask"] = torch.cat(
[decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))],
dim=-1,
)
return model_kwargs
.\models\musicgen_melody\processing_musicgen_melody.py
"""
Text/audio processor class for MusicGen Melody
"""
from typing import List, Optional
import numpy as np
from ...processing_utils import ProcessorMixin
from ...utils import to_numpy
class MusicgenMelodyProcessor(ProcessorMixin):
r"""
Constructs a MusicGen Melody processor which wraps a MusicGen Melody feature extractor - for raw audio waveform processing - and a T5 tokenizer into a single processor
class.
[`MusicgenMelodyProcessor`] offers all the functionalities of [`MusicgenMelodyFeatureExtractor`] and [`T5Tokenizer`]. See
[`~MusicgenMelodyProcessor.__call__`] and [`~MusicgenMelodyProcessor.decode`] for more information.
Args:
feature_extractor (`MusicgenMelodyFeatureExtractor`):
An instance of [`MusicgenMelodyFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`T5Tokenizer`):
An instance of [`T5Tokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "MusicgenMelodyFeatureExtractor"
tokenizer_class = ("T5Tokenizer", "T5TokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
"""
Retrieves decoder prompt IDs from the tokenizer.
Args:
task (str, optional): Task identifier. Defaults to None.
language (str, optional): Language identifier. Defaults to None.
no_timestamps (bool, optional): Flag indicating whether to exclude timestamps. Defaults to True.
Returns:
List[int]: List of decoder prompt IDs based on the provided task, language, and timestamps preferences.
"""
return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
def __call__(self, audio=None, text=None, **kwargs):
"""
Main method to prepare one or several sequence(s) and audio(s) for the model. If `audio` is not `None`, the `audio` and `kwargs` arguments are forwarded to MusicgenMelodyFeatureExtractor's [`~MusicgenMelodyFeatureExtractor.__call__`] to pre-process the audio. If `text` is not `None`, the `text` and `kwargs` arguments are forwarded to PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`]. Please refer to the docstrings of those two methods for more information.
Args:
    audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
        The audio or batch of audios to be prepared. Each audio can be a NumPy array or a PyTorch tensor. In the NumPy array/PyTorch tensor case, each audio should be a mono or stereo signal of shape (T,), where T is the sample length of the audio.
    text (`str`, `List[str]`, `List[List[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as a list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    kwargs (*optional*):
        Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the tokenizer.
Returns:
    [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
    - **input_ids** -- List of token ids to be fed to the model. Returned when `text` is not `None`.
    - **input_features** -- Audio input features to be fed to the model. Returned when `audio` is not `None`.
    - **attention_mask** -- List of token indices specifying which tokens the model should attend to when `text` is not `None`. When only `audio` is specified, returns the timestamps attention mask.
"""
sampling_rate = kwargs.pop("sampling_rate", None)
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if text is not None:
inputs = self.tokenizer(text, **kwargs)
if audio is not None:
audio_inputs = self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs)
if text is None:
return audio_inputs
elif audio is None:
return inputs
else:
inputs["input_features"] = audio_inputs["input_features"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
Decodes either batches of audio outputs from the MusicGen Melody model, or batches of token ids from the
tokenizer, in which case this method forwards all its arguments to T5Tokenizer's [`~PreTrainedTokenizer.batch_decode`].
"""
audio_values = kwargs.pop("audio", None)
attention_mask = kwargs.pop("attention_mask", None)
if len(args) > 0:
audio_values = args[0]
args = args[1:]
if audio_values is not None:
return self._decode_audio(audio_values, attention_mask=attention_mask)
else:
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to T5Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def _decode_audio(self, audio_values, attention_mask: Optional[np.ndarray] = None) -> List[np.ndarray]:
"""
This method strips any padding from the audio values to return a list of numpy audio arrays.
"""
audio_values = to_numpy(audio_values)
bsz, channels, seq_len = audio_values.shape
if attention_mask is None:
return list(audio_values)
attention_mask = to_numpy(attention_mask)
difference = seq_len - attention_mask.shape[-1]
padding_value = 1 - self.feature_extractor.padding_value
attention_mask = np.pad(attention_mask, ((0, 0), (0, difference)), "constant", constant_values=padding_value)
audio_values = audio_values.tolist()
for i in range(bsz):
sliced_audio = np.asarray(audio_values[i])[
attention_mask[i][None, :] != self.feature_extractor.padding_value
]
audio_values[i] = sliced_audio.reshape(channels, -1)
return audio_values
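A toy check of the stripping logic above, assuming a feature extractor whose `padding_value` is 0 (an assumption for illustration): the mask is extended with the *non*-padding value so that newly generated samples are kept, while positions that were padding in the prompt are dropped.
```
import numpy as np

audio = np.arange(6, dtype=np.float32).reshape(1, 1, 6)  # (bsz, channels, seq_len)
mask = np.array([[1, 1, 1, 0]])                          # last prompt position was padding
padding_value = 0                                        # assumed feature_extractor.padding_value
# extend the mask to seq_len with the non-padding value (1 - padding_value)
mask = np.pad(mask, ((0, 0), (0, 6 - mask.shape[-1])), "constant", constant_values=1 - padding_value)
kept = audio[0][mask[0][None, :] != padding_value].reshape(1, -1)
print(kept)  # [[0. 1. 2. 4. 5.]] -- the sample at the padded position (index 3) is removed
```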
def get_unconditional_inputs(self, num_samples=1, return_tensors="pt"):
"""
Helper function to get null inputs for unconditional generation, enabling the model to be used without the
feature extractor or tokenizer.
Args:
num_samples (int, *optional*):
Number of audio samples to unconditionally generate.
Example:
```
>>> from transformers import MusicgenMelodyForConditionalGeneration, MusicgenMelodyProcessor
>>> model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody")
>>> # get the unconditional (or 'null') inputs for the model
>>> processor = MusicgenMelodyProcessor.from_pretrained("facebook/musicgen-melody")
>>> unconditional_inputs = processor.get_unconditional_inputs(num_samples=1)
>>> audio_samples = model.generate(**unconditional_inputs, max_new_tokens=256)
```"""
inputs = self.tokenizer([""] * num_samples, return_tensors=return_tensors, return_attention_mask=True)
inputs["attention_mask"][:] = 0
return inputs
.\models\musicgen_melody\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_torchaudio_available,
)
_import_structure = {
"configuration_musicgen_melody": [
"MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP",
"MusicgenMelodyConfig",
"MusicgenMelodyDecoderConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_musicgen_melody"] = [
"MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST",
"MusicgenMelodyForConditionalGeneration",
"MusicgenMelodyForCausalLM",
"MusicgenMelodyModel",
"MusicgenMelodyPreTrainedModel",
]
try:
if not is_torchaudio_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_musicgen_melody"] = ["MusicgenMelodyFeatureExtractor"]
_import_structure["processing_musicgen_melody"] = ["MusicgenMelodyProcessor"]
if TYPE_CHECKING:
from .configuration_musicgen_melody import (
MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP,
MusicgenMelodyConfig,
MusicgenMelodyDecoderConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_musicgen_melody import (
MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST,
MusicgenMelodyForCausalLM,
MusicgenMelodyForConditionalGeneration,
MusicgenMelodyModel,
MusicgenMelodyPreTrainedModel,
)
try:
if not is_torchaudio_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_musicgen_melody import MusicgenMelodyFeatureExtractor
from .processing_musicgen_melody import MusicgenMelodyProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mvp\configuration_mvp.py
""" MVP model configuration"""
import warnings
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MVP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/config.json",
}
class MvpConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MvpModel`]. It is used to instantiate an MVP model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the MVP [RUCAIBox/mvp](https://huggingface.co/RUCAIBox/mvp)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import MvpConfig, MvpModel
>>> # Initializing a MVP RUCAIBox/mvp style configuration
>>> configuration = MvpConfig()
>>> # Initializing a model (with random weights) from the RUCAIBox/mvp style configuration
>>> model = MvpModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "mvp"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=50267,
max_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=12,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
activation_function="gelu",
d_model=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
classifier_dropout=0.0,
scale_embedding=False,
use_cache=True,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
is_encoder_decoder=True,
decoder_start_token_id=2,
forced_eos_token_id=2,
use_prompt=False,
prompt_length=100,
prompt_mid_dim=800,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
self.use_prompt = use_prompt
self.prompt_length = prompt_length
self.prompt_mid_dim = prompt_mid_dim
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
self.forced_bos_token_id = self.bos_token_id
warnings.warn(
f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
"The config can simply be saved and uploaded again to be fixed."
)
.\models\mvp\modeling_mvp.py
""" PyTorch MVP model."""
import copy
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqQuestionAnsweringModelOutput,
Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_mvp import MvpConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "RUCAIBox/mvp"
_CONFIG_FOR_DOC = "MvpConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
MVP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"RUCAIBox/mvp",
"RUCAIBox/mvp-data-to-text",
"RUCAIBox/mvp-open-dialog",
"RUCAIBox/mvp-question-answering",
"RUCAIBox/mvp-question-generation",
"RUCAIBox/mvp-story",
"RUCAIBox/mvp-summarization",
"RUCAIBox/mvp-task-dialog",
"RUCAIBox/mtl-data-to-text",
"RUCAIBox/mtl-multi-task",
"RUCAIBox/mtl-open-dialog",
"RUCAIBox/mtl-question-answering",
"RUCAIBox/mtl-question-generation",
"RUCAIBox/mtl-story",
"RUCAIBox/mtl-summarization",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
Args:
input_ids (torch.Tensor): Tensor of input ids.
pad_token_id (int): The id of the padding token in the model's configuration.
decoder_start_token_id (int): The id of the decoder's start token.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
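A quick worked example of the shift: labels `[A, B, C]` become decoder inputs `[<s>, A, B]`, and any `-100` ignore-index that moves into the inputs is rewritten to the pad id.
```
import torch

labels = torch.tensor([[10, -100, 11]])
shifted = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
print(shifted)  # tensor([[ 2, 10,  1]]) -- decoder start prepended, -100 replaced by pad
```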
class MvpLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
"""`input_ids' shape is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids.shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
).expand(bsz, -1)
return super().forward(positions + self.offset)
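The `offset = 2` follows BART-style learned positions: two extra rows are reserved at the front of the table, so position 0 actually reads row 2. A small shape check:
```
import torch

emb = MvpLearnedPositionalEmbedding(num_embeddings=8, embedding_dim=4)
print(emb.weight.shape)  # torch.Size([10, 4]) -- 8 positions + 2 reserved offset rows
out = emb(torch.zeros(1, 3, dtype=torch.long))  # only the (bsz, seq_len) shape of the ids is used
print(out.shape)  # torch.Size([1, 3, 4]) -- embeddings for positions 0..2 (rows 2..4)
```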
class MvpAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
attn_prompt: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
pass
class MvpEncoderLayer(nn.Module):
def __init__(self, config: MvpConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = MvpAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: torch.FloatTensor,
layer_head_mask: torch.FloatTensor,
self_attn_prompt: torch.FloatTensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
`(2, encoder_attention_heads, pro_len, head_dim)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
attn_prompt=self_attn_prompt,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
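The final clamp is a common fp16 safeguard: activations that have saturated to the float16 ceiling are pulled just inside the representable range so subsequent layers do not overflow to `inf`. A toy illustration:
```
import torch

x = torch.tensor([65504.0, -65504.0, 1.0], dtype=torch.float16)  # 65504 is the float16 max
clamp_value = torch.finfo(torch.float16).max - 1000
clamped = torch.clamp(x, min=-clamp_value, max=clamp_value)
print(clamped.abs().max() < 65504)  # tensor(True) -- safely inside the fp16 range
```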
class MvpDecoderLayer(nn.Module):
def __init__(self, config: MvpConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = MvpAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = MvpAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
self_attn_prompt: Optional[torch.Tensor] = None,
cross_attn_prompt: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class MvpClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(
self,
input_dim: int,
inner_dim: int,
num_classes: int,
pooler_dropout: float,
):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class MvpPrompt(nn.Module):
"""Layer-wise prompt for encoder or decoder."""
def __init__(self, config, num_layers, num_heads):
super().__init__()
self.prompt_length = config.prompt_length
self.num_layers = num_layers
self.num_heads = num_heads
self.head_dim = config.d_model // num_heads
self.dropout = nn.Dropout(p=config.dropout)
self.prompt_embedding = nn.Embedding(config.prompt_length, config.d_model)
self.prompt_trans = nn.Sequential(
nn.Linear(config.d_model, config.prompt_mid_dim),
nn.GELU(),
nn.Linear(config.prompt_mid_dim, num_layers * 2 * config.d_model),
)
def forward(self, prompt_ids: torch.Tensor) -> Tuple[torch.Tensor]:
prompt = self.prompt_trans(self.prompt_embedding(prompt_ids))
prompt = prompt.view(self.prompt_length, self.num_layers * 2, self.num_heads, self.head_dim)
prompt = self.dropout(prompt)
prompt = prompt.permute([1, 2, 0, 3]).split(2)
return prompt
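A shape walk-through of the prompt module with hypothetical toy config values: the forward returns one `(key, value)` stack of shape `(2, num_heads, prompt_length, head_dim)` per layer.
```
import torch

class ToyConfig:  # hypothetical stand-in exposing only the fields MvpPrompt reads
    prompt_length, d_model, prompt_mid_dim, dropout = 4, 8, 16, 0.0

prompt = MvpPrompt(ToyConfig(), num_layers=3, num_heads=2)
out = prompt(torch.arange(4))  # prompt ids 0..prompt_length-1
print(len(out), out[0].shape)  # 3 torch.Size([2, 2, 4, 4]) -- (kv, heads, prompt_len, head_dim)
```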
class MvpPreTrainedModel(PreTrainedModel):
config_class = MvpConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
@property
def dummy_inputs(self):
pad_token = self.config.pad_token_id
input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
dummy_inputs = {
"attention_mask": input_ids.ne(pad_token),
"input_ids": input_ids,
}
return dummy_inputs
MVP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MvpConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MVP_INPUTS_DOCSTRING = r"""
Placeholder for inputs documentation.
"""
MVP_CONDITIONAL_GENERATION_EXAMPLE = r"""
Example of summarization:
Fine-tuning a model
```
>>> import torch
>>> from transformers import AutoTokenizer, MvpForConditionalGeneration
>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")
>>> inputs = tokenizer(
... "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
... return_tensors="pt",
... )
>>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]
>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```
Inference after the model fine-tuned
```
>>> with torch.no_grad():
... generated_ids = model.generate(**inputs)
>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
```
"""
MVP_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example of single-label classification:
Fine-tuning a model on `num_labels` classes
```
>>> import torch
>>> from transformers import AutoTokenizer, MvpForSequenceClassification
>>> num_labels = 2
>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)
>>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor(1)
>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```
Inference after the model fine-tuned
```
>>> with torch.no_grad():
...     logits = model(**inputs).logits
>>> predicted_class_id = logits.argmax()
```
"""
MVP_QUESTION_ANSWERING_SAMPLE = r"""
Example:
Fine-tuning a model for extractive question answering; the model also supports generative question answering
using [`MvpForConditionalGeneration`]
```
>>> import torch
>>> from transformers import AutoTokenizer, MvpForQuestionAnswering
>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")
>>> inputs = tokenizer(
... "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
... return_tensors="pt",
... )
>>> target_start_index = torch.tensor([18])
>>> target_end_index = torch.tensor([19])
>>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
>>> loss.backward()
```
Inference after the model fine-tuned
```
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> answer_start_index = outputs.start_logits.argmax()
>>> answer_end_index = outputs.end_logits.argmax()
>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
>>> predict_answer = tokenizer.decode(predict_answer_tokens)
```
"""
class MvpEncoder(MvpPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`MvpEncoderLayer`].
Args:
config: MvpConfig
embed_tokens (nn.Embedding): output embedding
use_prompt (bool): whether to use prompt
"""
def __init__(
self, config: MvpConfig, embed_tokens: Optional[nn.Embedding] = None, use_prompt: Optional[bool] = False
):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
self.embed_positions = MvpLearnedPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
)
self.layers = nn.ModuleList([MvpEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.use_prompt = use_prompt
if use_prompt:
self.prompt_length = config.prompt_length
self.self_attn_prompt = MvpPrompt(
config,
config.encoder_layers,
config.encoder_attention_heads,
)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
class MvpDecoder(MvpPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`].
Args:
    config: MvpConfig
    embed_tokens (nn.Embedding): output embedding
    use_prompt (bool): whether to use prompt
"""
def __init__(
self, config: MvpConfig, embed_tokens: Optional[nn.Embedding] = None, use_prompt: Optional[bool] = False
):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = MvpLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
self.layers = nn.ModuleList([MvpDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.use_prompt = use_prompt
if use_prompt:
self.prompt_length = config.prompt_length
self.self_attn_prompt = MvpPrompt(
config,
config.decoder_layers,
config.decoder_attention_heads,
)
self.cross_attn_prompt = MvpPrompt(
config,
config.decoder_layers,
config.decoder_attention_heads,
)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
@add_start_docstrings(
"The bare MVP Model outputting raw hidden-states without any specific head on top.",
MVP_START_DOCSTRING,
)
class MvpModel(MvpPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: MvpConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.use_prompt = config.use_prompt
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = MvpEncoder(config, self.shared, config.use_prompt)
self.decoder = MvpDecoder(config, self.shared, config.use_prompt)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def set_lightweight_tuning(self):
assert self.use_prompt, "If you want to use lightweight tuning, make sure that `use_prompt=True`."
self.requires_grad_(False)
self.encoder.self_attn_prompt.requires_grad_(True)
self.decoder.self_attn_prompt.requires_grad_(True)
self.decoder.cross_attn_prompt.requires_grad_(True)
@add_start_docstrings_to_model_forward(MVP_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqModelOutput]:
...
@add_start_docstrings(
"The MVP Model with a language modeling head. Can be used for various text generation tasks.", MVP_START_DOCSTRING
)
class MvpForConditionalGeneration(MvpPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: MvpConfig):
super().__init__(config)
self.model = MvpModel(config)
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
self._resize_final_logits_bias(new_num_tokens)
return new_embeddings
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
self.register_buffer("final_logits_bias", new_bias)
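The bias buffer tracks the vocabulary size: shrinking slices it, growing pads with zeros so that new tokens start with no logit offset. A minimal sketch of both branches:
```
import torch

bias = torch.tensor([[0.5, -0.5, 0.1]])       # pretend final_logits_bias for a vocab of 3
print(bias[:, :2])                            # shrink 3 -> 2: tensor([[ 0.5000, -0.5000]])
extra = torch.zeros((1, 5 - bias.shape[-1]))  # grow 3 -> 5: pad with zeros
print(torch.cat([bias, extra], dim=1))        # tensor([[ 0.5000, -0.5000,  0.1000,  0.0000,  0.0000]])
```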
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_lightweight_tuning(self):
self.model.set_lightweight_tuning()
self.lm_head.requires_grad_(False)
@add_start_docstrings_to_model_forward(MVP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(MVP_CONDITIONAL_GENERATION_EXAMPLE)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
A [`Seq2SeqLMOutput`] if `return_dict=True`, otherwise a plain tuple with the language modeling outputs.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if use_cache:
logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
use_cache = False
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
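A toy check of the beam-search reordering: with `beam_idx = [1, 0]`, each cached key/value tensor has its batch dimension permuted so that beam 0 now carries what beam 1 had cached.
```
import torch

layer_past = (torch.tensor([[0.0], [1.0]]), torch.tensor([[10.0], [11.0]]))  # (key, value) for 2 beams
beam_idx = torch.tensor([1, 0])
reordered = tuple(p.index_select(0, beam_idx.to(p.device)) for p in layer_past)
print(reordered[0])  # tensor([[1.], [0.]]) -- beams swapped along the batch dimension
```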
@add_start_docstrings(
"""
Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
""",
MVP_START_DOCSTRING,
)
class MvpForSequenceClassification(MvpPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: MvpConfig, **kwargs):
super().__init__(config, **kwargs)
self.model = MvpModel(config)
self.classification_head = MvpClassificationHead(
config.d_model,
config.d_model,
config.num_labels,
config.classifier_dropout,
)
self.post_init()
def set_lightweight_tuning(self):
self.model.set_lightweight_tuning()
self.classification_head.requires_grad_(False)
@add_start_docstrings_to_model_forward(MVP_INPUTS_DOCSTRING)
@add_end_docstrings(MVP_SEQUENCE_CLASSIFICATION_SAMPLE)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
...
@add_start_docstrings(
"""
MVP Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer
on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MVP_START_DOCSTRING,
)
class MvpForQuestionAnswering(MvpPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.model = MvpModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
def set_lightweight_tuning(self):
self.model.set_lightweight_tuning()
self.qa_outputs.requires_grad_(False)
@add_start_docstrings_to_model_forward(MVP_INPUTS_DOCSTRING)
@add_end_docstrings(MVP_QUESTION_ANSWERING_SAMPLE)
def forward(
self,
input_ids: torch.Tensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
class MvpDecoderWrapper(MvpPreTrainedModel):
"""
This wrapper class is a helper class that allows pretrained checkpoints to be loaded correctly when the causal language model is used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = MvpDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
class MvpForCausalLM(MvpPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = MvpDecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
def set_lightweight_tuning(self):
self.model.set_lightweight_tuning()
self.lm_head.requires_grad_(False)
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
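A small numeric check of the prefix trimming above: with 3 positions already cached and 4 ids accumulated, only the single new token is fed on this step.
```
import torch

input_ids = torch.tensor([[5, 6, 7, 8]])
past_length = 3
remove_prefix_length = past_length if input_ids.shape[1] > past_length else input_ids.shape[1] - 1
print(input_ids[:, remove_prefix_length:])  # tensor([[8]]) -- only the newest token is processed
```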
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past