Transformers Source Code Analysis (80)
.\models\mt5\modeling_tf_mt5.py
""" Tensorflow mT5 model."""
from ...utils import logging
from ..t5.modeling_tf_t5 import TFT5EncoderModel, TFT5ForConditionalGeneration, TFT5Model
from .configuration_mt5 import MT5Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "T5Config"
class TFMT5Model(TFT5Model):
r"""
This class overrides [`TFT5Model`]. Please check the superclass for the appropriate documentation alongside usage
examples.
Examples:
```
>>> from transformers import TFMT5Model, AutoTokenizer
>>> model = TFMT5Model.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, return_tensors="tf")
>>> labels = tokenizer(text_target=summary, return_tensors="tf")
>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
>>> hidden_states = outputs.last_hidden_state
```"""
model_type = "mt5"
config_class = MT5Config
class TFMT5ForConditionalGeneration(TFT5ForConditionalGeneration):
r"""
This class overrides [`TFT5ForConditionalGeneration`]. Please check the superclass for the appropriate
documentation alongside usage examples.
Examples:
```
>>> from transformers import TFMT5ForConditionalGeneration, AutoTokenizer
>>> model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, text_target=summary, return_tensors="tf")
>>> outputs = model(**inputs)
>>> loss = outputs.loss
```"""
model_type = "mt5"
config_class = MT5Config
class TFMT5EncoderModel(TFT5EncoderModel):
r"""
This class overrides [`TFT5EncoderModel`]. Please check the superclass for the appropriate documentation alongside
usage examples.
Examples:
```
>>> from transformers import TFMT5EncoderModel, AutoTokenizer
>>> model = TFMT5EncoderModel.from_pretrained("google/mt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> input_ids = tokenizer(article, return_tensors="tf").input_ids
>>> outputs = model(input_ids)
>>> hidden_state = outputs.last_hidden_state
```"""
# Registered model type and configuration class for the mT5 encoder
model_type = "mt5"
config_class = MT5Config
.\models\mt5\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
if is_sentencepiece_available():
from ..t5.tokenization_t5 import T5Tokenizer
else:
from ...utils.dummy_sentencepiece_objects import T5Tokenizer
MT5Tokenizer = T5Tokenizer
if is_tokenizers_available():
from ..t5.tokenization_t5_fast import T5TokenizerFast
else:
from ...utils.dummy_tokenizers_objects import T5TokenizerFast
MT5TokenizerFast = T5TokenizerFast
_import_structure = {"configuration_mt5": ["MT5Config", "MT5OnnxConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mt5"] = [
"MT5EncoderModel",
"MT5ForConditionalGeneration",
"MT5ForQuestionAnswering",
"MT5ForSequenceClassification",
"MT5ForTokenClassification",
"MT5Model",
"MT5PreTrainedModel",
"MT5Stack",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_mt5"] = ["TFMT5EncoderModel", "TFMT5ForConditionalGeneration", "TFMT5Model"]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_mt5"] = ["FlaxMT5EncoderModel", "FlaxMT5ForConditionalGeneration", "FlaxMT5Model"]
if TYPE_CHECKING:
from .configuration_mt5 import MT5Config, MT5OnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mt5 import (
MT5EncoderModel,
MT5ForConditionalGeneration,
MT5ForQuestionAnswering,
MT5ForSequenceClassification,
MT5ForTokenClassification,
MT5Model,
MT5PreTrainedModel,
MT5Stack,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_mt5 import TFMT5EncoderModel, TFMT5ForConditionalGeneration, TFMT5Model
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_mt5 import FlaxMT5EncoderModel, FlaxMT5ForConditionalGeneration, FlaxMT5Model
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
extra_objects={"MT5Tokenizer": MT5Tokenizer, "MT5TokenizerFast": MT5TokenizerFast},
module_spec=__spec__,
)
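One practical consequence of the `extra_objects` mapping above: `MT5Tokenizer` is exposed eagerly as a plain alias of `T5Tokenizer` rather than through the lazy-import machinery. A quick sanity check (a minimal sketch, assuming `sentencepiece` is installed so the real tokenizer class is picked up):

```
from transformers import MT5Tokenizer, T5Tokenizer

# mT5 reuses the T5 tokenizer unchanged, so the two names point at the same class
print(MT5Tokenizer is T5Tokenizer)  # True
```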
.\models\musicgen\configuration_musicgen.py
""" MusicGen model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import AutoConfig
logger = logging.get_logger(__name__)
MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/musicgen-small": "https://huggingface.co/facebook/musicgen-small/resolve/main/config.json",
}
class MusicgenDecoderConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MusicgenDecoder`]. It is used to instantiate a
MusicGen decoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the MusicGen
[facebook/musicgen-small](https://huggingface.co/facebook/musicgen-small) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 2048):
MusicgenDecoder 模型的词汇表大小,定义了在调用 `MusicgenDecoder` 时输入 `inputs_ids` 可表示的不同标记数量。
hidden_size (`int`, *optional*, defaults to 1024):
层和池化层的维度。
num_hidden_layers (`int`, *optional*, defaults to 24):
解码器层的数量。
num_attention_heads (`int`, *optional*, defaults to 16):
Transformer 块中每个注意力层的注意力头数量。
ffn_dim (`int`, *optional*, defaults to 4096):
Transformer 块中“中间”(通常称为前馈)层的维度。
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
解码器和池化器中的非线性激活函数(函数或字符串)。支持的字符串包括 `"gelu"`, `"relu"`, `"silu"` 和 `"gelu_new"`。
dropout (`float`, *optional*, defaults to 0.1):
嵌入层、文本编码器和池化器中所有全连接层的 dropout 概率。
attention_dropout (`float`, *optional*, defaults to 0.0):
注意力概率的 dropout 比率。
activation_dropout (`float`, *optional*, defaults to 0.0):
全连接层内部激活的 dropout 比率。
max_position_embeddings (`int`, *optional*, defaults to 2048):
模型可能使用的最大序列长度。通常设置为一个很大的值(例如 512、1024 或 2048)。
initializer_factor (`float`, *optional*, defaults to 0.02):
用于初始化所有权重矩阵的截断正态初始化器的标准差。
layerdrop (`float`, *optional*, defaults to 0.0):
解码器的 LayerDrop 概率。详细信息请参阅 LayerDrop 论文(见 https://arxiv.org/abs/1909.11556)。
scale_embedding (`bool`, *optional*, defaults to `False`):
是否通过 sqrt(hidden_size) 缩放嵌入。
use_cache (`bool`, *optional*, defaults to `True`):
模型是否应返回最后的 key/values 注意力(并非所有模型都使用)。
num_codebooks (`int`, *optional*, defaults to 4):
转发到模型的并行码书数量。
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
是否应绑定输入和输出词嵌入。
audio_channels (`int`, *optional*, defaults to 1):
音频数据中的通道数。单声道为 1,立体声为 2。立体声模型生成左/右输出通道的单独音频流,单声道模型生成单一音频流输出。
model_type = "musicgen_decoder"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=2048,
max_position_embeddings=2048,
num_hidden_layers=24,
ffn_dim=4096,
num_attention_heads=16,
layerdrop=0.0,
use_cache=True,
activation_function="gelu",
hidden_size=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
initializer_factor=0.02,
scale_embedding=False,
num_codebooks=4,
audio_channels=1,
pad_token_id=2048,
bos_token_id=2048,
eos_token_id=None,
tie_word_embeddings=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.ffn_dim = ffn_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.initializer_factor = initializer_factor
self.layerdrop = layerdrop
self.use_cache = use_cache
self.scale_embedding = scale_embedding
self.num_codebooks = num_codebooks
if audio_channels not in [1, 2]:
raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {audio_channels} channels.")
self.audio_channels = audio_channels
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
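To see the constructor in action, here is a minimal sketch (not part of the original file): the defaults mirror the facebook/musicgen-small decoder, and the `audio_channels` check rejects anything other than mono or stereo.

```
from transformers import MusicgenDecoderConfig

config = MusicgenDecoderConfig()  # defaults correspond to facebook/musicgen-small
print(config.hidden_size, config.num_hidden_layers, config.num_codebooks)  # 1024 24 4

try:
    MusicgenDecoderConfig(audio_channels=3)  # neither mono (1) nor stereo (2)
except ValueError as err:
    print(err)  # Expected 1 (mono) or 2 (stereo) audio channels, got 3 channels.
```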
class MusicgenConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MusicgenModel`]. It is used to instantiate a
MusicGen model according to the specified arguments, defining the text encoder, audio encoder and MusicGen decoder
configs.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
kwargs (*optional*):
Dictionary of keyword arguments. Notably:
- **text_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the text encoder config.
- **audio_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the audio encoder config.
- **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
the decoder config.
Example:
```
>>> from transformers import (
... MusicgenConfig,
... MusicgenDecoderConfig,
... T5Config,
... EncodecConfig,
... MusicgenForConditionalGeneration,
... )
>>> # Initializing text encoder, audio encoder, and decoder model configurations
>>> text_encoder_config = T5Config()
>>> audio_encoder_config = EncodecConfig()
>>> decoder_config = MusicgenDecoderConfig()
>>> configuration = MusicgenConfig.from_sub_models_config(
... text_encoder_config, audio_encoder_config, decoder_config
... )
>>> # Initializing a MusicgenForConditionalGeneration (with random weights) from the facebook/musicgen-small style configuration
>>> model = MusicgenForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> config_text_encoder = model.config.text_encoder
>>> config_audio_encoder = model.config.audio_encoder
>>> config_decoder = model.config.decoder
>>> # Saving the model, including its configuration
>>> model.save_pretrained("musicgen-model")
>>> # loading model and config from pretrained folder
>>> musicgen_config = MusicgenConfig.from_pretrained("musicgen-model")
>>> model = MusicgenForConditionalGeneration.from_pretrained("musicgen-model", config=musicgen_config)
```"""
# The model_type class attribute identifies configurations of this class as the 'musicgen' model type.
model_type = "musicgen"
is_composition = True
def __init__(self, **kwargs):
super().__init__(**kwargs)
if "text_encoder" not in kwargs or "audio_encoder" not in kwargs or "decoder" not in kwargs:
raise ValueError("Config has to be initialized with text_encoder, audio_encoder and decoder config")
text_encoder_config = kwargs.pop("text_encoder")
text_encoder_model_type = text_encoder_config.pop("model_type")
audio_encoder_config = kwargs.pop("audio_encoder")
audio_encoder_model_type = audio_encoder_config.pop("model_type")
decoder_config = kwargs.pop("decoder")
self.text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder_config)
self.audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder_config)
self.decoder = MusicgenDecoderConfig(**decoder_config)
self.is_encoder_decoder = True
@classmethod
def from_sub_models_config(
cls,
text_encoder_config: PretrainedConfig,
audio_encoder_config: PretrainedConfig,
decoder_config: MusicgenDecoderConfig,
**kwargs,
):
r"""
Instantiate a [`MusicgenConfig`] (or a derived class) from text encoder, audio encoder and decoder
configurations.
Returns:
[`MusicgenConfig`]: An instance of a configuration object
"""
return cls(
text_encoder=text_encoder_config.to_dict(),
audio_encoder=audio_encoder_config.to_dict(),
decoder=decoder_config.to_dict(),
**kwargs,
)
@property
def sampling_rate(self):
return self.audio_encoder.sampling_rate
.\models\musicgen\convert_musicgen_transformers.py
import argparse
from pathlib import Path
from typing import Dict, OrderedDict, Tuple
import torch
from audiocraft.models import MusicGen
from transformers import (
AutoFeatureExtractor,
AutoTokenizer,
EncodecModel,
MusicgenDecoderConfig,
MusicgenForConditionalGeneration,
MusicgenProcessor,
T5EncoderModel,
)
from transformers.models.musicgen.modeling_musicgen import MusicgenForCausalLM
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
EXPECTED_MISSING_KEYS = ["model.decoder.embed_positions.weights"]
def rename_keys(name):
"""根据预定义规则重命名模型状态字典中的键名。
Args:
name (str): 原始的键名字符串。
Returns:
str: 重命名后的键名字符串。
"""
if "emb" in name:
name = name.replace("emb", "model.decoder.embed_tokens")
if "transformer" in name:
name = name.replace("transformer", "model.decoder")
if "cross_attention" in name:
name = name.replace("cross_attention", "encoder_attn")
if "linear1" in name:
name = name.replace("linear1", "fc1")
if "linear2" in name:
name = name.replace("linear2", "fc2")
if "norm1" in name:
name = name.replace("norm1", "self_attn_layer_norm")
if "norm_cross" in name:
name = name.replace("norm_cross", "encoder_attn_layer_norm")
if "norm2" in name:
name = name.replace("norm2", "final_layer_norm")
if "out_norm" in name:
name = name.replace("out_norm", "model.decoder.layer_norm")
if "linears" in name:
name = name.replace("linears", "lm_heads")
if "condition_provider.conditioners.description.output_proj" in name:
name = name.replace("condition_provider.conditioners.description.output_proj", "enc_to_dec_proj")
return name
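To make the renaming rules concrete, a small sketch tracing a few representative key names through `rename_keys` (the example keys are illustrative; real checkpoints follow the same naming scheme):

```
# Illustrative fairseq-style keys and the Hugging Face names they map to
for fairseq_key in [
    "emb.0.weight",
    "transformer.layers.0.self_attn.in_proj_weight",
    "transformer.layers.0.linear1.weight",
    "out_norm.weight",
    "linears.0.weight",
]:
    print(fairseq_key, "->", rename_keys(fairseq_key))
# emb.0.weight -> model.decoder.embed_tokens.0.weight
# transformer.layers.0.self_attn.in_proj_weight -> model.decoder.layers.0.self_attn.in_proj_weight
# transformer.layers.0.linear1.weight -> model.decoder.layers.0.fc1.weight
# out_norm.weight -> model.decoder.layer_norm.weight
# linears.0.weight -> lm_heads.0.weight
```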
def rename_state_dict(state_dict: OrderedDict, hidden_size: int) -> Tuple[Dict, Dict]:
"""根据 Hugging Face 模块名称规则重命名 fairseq Musicgen 的状态字典,并将其分成解码器(LM)状态字典和编码器-解码器投影的状态字典。
Args:
state_dict (OrderedDict): 原始的 fairseq Musicgen 状态字典。
hidden_size (int): 隐藏层大小。
Returns:
Tuple[Dict, Dict]: 重命名后的解码器状态字典和编码器-解码器投影状态字典的元组。
"""
keys = list(state_dict.keys())
enc_dec_proj_state_dict = {}
for key in keys:
val = state_dict.pop(key)
key = rename_keys(key)
if "in_proj_weight" in key:
state_dict[key.replace("in_proj_weight", "q_proj.weight")] = val[:hidden_size, :]
state_dict[key.replace("in_proj_weight", "k_proj.weight")] = val[hidden_size : 2 * hidden_size, :]
state_dict[key.replace("in_proj_weight", "v_proj.weight")] = val[-hidden_size:, :]
elif "enc_to_dec_proj" in key:
enc_dec_proj_state_dict[key[len("enc_to_dec_proj.") :]] = val
else:
state_dict[key] = val
return state_dict, enc_dec_proj_state_dict
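The `in_proj_weight` branch is the interesting part: fairseq stores the query/key/value projection as one fused `(3 * hidden_size, hidden_size)` matrix, which gets split row-wise into the three separate matrices Hugging Face expects. A minimal sketch with a dummy tensor (the key name and `hidden_size=4` are illustrative):

```
from collections import OrderedDict

import torch

hidden_size = 4
fused = torch.randn(3 * hidden_size, hidden_size)  # stacked [q; k; v] rows
state_dict = OrderedDict({"model.decoder.layers.0.self_attn.in_proj_weight": fused})

state_dict, enc_dec_proj = rename_state_dict(state_dict, hidden_size=hidden_size)
print(sorted(state_dict))
# ['model.decoder.layers.0.self_attn.k_proj.weight',
#  'model.decoder.layers.0.self_attn.q_proj.weight',
#  'model.decoder.layers.0.self_attn.v_proj.weight']
print(state_dict["model.decoder.layers.0.self_attn.q_proj.weight"].shape)  # torch.Size([4, 4])
```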
def decoder_config_from_checkpoint(checkpoint: str) -> MusicgenDecoderConfig:
if checkpoint == "small" or checkpoint == "facebook/musicgen-stereo-small":
hidden_size = 1024
num_hidden_layers = 24
num_attention_heads = 16
elif checkpoint == "medium" or checkpoint == "facebook/musicgen-stereo-medium":
hidden_size = 1536
num_hidden_layers = 48
num_attention_heads = 24
elif checkpoint == "large" or checkpoint == "facebook/musicgen-stereo-large":
hidden_size = 2048
num_hidden_layers = 48
num_attention_heads = 32
else:
raise ValueError(
"Checkpoint should be one of `['small', 'medium', 'large']` for the mono checkpoints, "
"or `['facebook/musicgen-stereo-small', 'facebook/musicgen-stereo-medium', 'facebook/musicgen-stereo-large']` "
f"for the stereo checkpoints, got {checkpoint}."
)
if "stereo" in checkpoint:
audio_channels = 2
num_codebooks = 8
else:
audio_channels = 1
num_codebooks = 4
config = MusicgenDecoderConfig(
hidden_size=hidden_size,
ffn_dim=hidden_size * 4,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_codebooks=num_codebooks,
audio_channels=audio_channels,
)
return config
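For reference, what the branching above produces (a quick check using the function as defined):

```
config = decoder_config_from_checkpoint("medium")
print(config.hidden_size, config.ffn_dim, config.num_hidden_layers)  # 1536 6144 48
print(config.audio_channels, config.num_codebooks)  # 1 4  (mono)

stereo = decoder_config_from_checkpoint("facebook/musicgen-stereo-small")
print(stereo.audio_channels, stereo.num_codebooks)  # 2 8  (stereo)
```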
@torch.no_grad()
def convert_musicgen_checkpoint(
checkpoint, pytorch_dump_folder=None, repo_id=None, device="cpu", safe_serialization=False
):
fairseq_model = MusicGen.get_pretrained(checkpoint, device=device)
decoder_config = decoder_config_from_checkpoint(checkpoint)
decoder_state_dict = fairseq_model.lm.state_dict()
decoder_state_dict, enc_dec_proj_state_dict = rename_state_dict(
decoder_state_dict, hidden_size=decoder_config.hidden_size
)
text_encoder = T5EncoderModel.from_pretrained("google-t5/t5-base")
audio_encoder = EncodecModel.from_pretrained("facebook/encodec_32khz")
decoder = MusicgenForCausalLM(decoder_config).eval()
missing_keys, unexpected_keys = decoder.load_state_dict(decoder_state_dict, strict=False)
for key in missing_keys.copy():
if key.startswith(("text_encoder", "audio_encoder")) or key in EXPECTED_MISSING_KEYS:
missing_keys.remove(key)
if len(missing_keys) > 0:
raise ValueError(f"Missing key(s) in state_dict: {missing_keys}")
if len(unexpected_keys) > 0:
raise ValueError(f"Unexpected key(s) in state_dict: {unexpected_keys}")
model = MusicgenForConditionalGeneration(text_encoder=text_encoder, audio_encoder=audio_encoder, decoder=decoder)
model.enc_to_dec_proj.load_state_dict(enc_dec_proj_state_dict)
input_ids = torch.arange(0, 2 * decoder_config.num_codebooks, dtype=torch.long).reshape(2, -1)
decoder_input_ids = input_ids.reshape(2 * decoder_config.num_codebooks, -1)
with torch.no_grad():
logits = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
if logits.shape != (2 * decoder_config.num_codebooks, 1, 2048):
raise ValueError("Incorrect shape for logits")
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
feature_extractor = AutoFeatureExtractor.from_pretrained(
"facebook/encodec_32khz", padding_side="left", feature_size=decoder_config.audio_channels
)
processor = MusicgenProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
model.generation_config.decoder_start_token_id = 2048
model.generation_config.pad_token_id = 2048
model.generation_config.max_length = int(30 * audio_encoder.config.frame_rate)
model.generation_config.do_sample = True
model.generation_config.guidance_scale = 3.0
if pytorch_dump_folder is not None:
Path(pytorch_dump_folder).mkdir(exist_ok=True)
logger.info(f"Saving model {checkpoint} to {pytorch_dump_folder}")
model.save_pretrained(pytorch_dump_folder, safe_serialization=safe_serialization)
processor.save_pretrained(pytorch_dump_folder)
if repo_id:
logger.info(f"Pushing model {checkpoint} to {repo_id}")
model.push_to_hub(repo_id, safe_serialization=safe_serialization)
processor.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint",
default="small",
type=str,
help="Checkpoint size of the MusicGen model you'd like to convert. Can be one of: "
"`['small', 'medium', 'large']` for the mono checkpoints, or "
"`['facebook/musicgen-stereo-small', 'facebook/musicgen-stereo-medium', 'facebook/musicgen-stereo-large']` "
"for the stereo checkpoints.",
)
parser.add_argument(
"--pytorch_dump_folder",
required=True,
default=None,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
parser.add_argument(
"--device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda."
)
parser.add_argument(
"--safe_serialization",
action="store_true",
help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).",
)
args = parser.parse_args()
convert_musicgen_checkpoint(args.checkpoint, args.pytorch_dump_folder, args.push_to_hub, args.device, args.safe_serialization)
.\models\musicgen\modeling_musicgen.py
@dataclass
class MusicgenUnconditionalInput(ModelOutput):
"""
Args:
encoder_outputs (`Tuple[torch.FloatTensor]` of length 1, with tensor shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the text encoder model.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Encoder attention mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1 for tokens that are **not masked**, 0 for tokens that are **masked**.
guidance_scale (`float`, *optional*):
Guidance scale for classifier-free guidance, setting the balance between the conditional logits (predicted
from the prompts) and the unconditional logits (predicted without prompts).
"""
# Output-style dataclass holding the null inputs used for unconditional generation; all fields default to None
encoder_outputs: Tuple[torch.FloatTensor] = None
attention_mask: torch.LongTensor = None
guidance_scale: float = None
# Copied from transformers.models.encoder_decoder.modeling_encoder_decoder.shift_tokens_right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
# Create an all-zero tensor with the same shape as the input to hold the right-shifted ids
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
# Copy every token except the last one position to the right
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
if decoder_start_token_id is None:
raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
# Put the decoder start token id in the first position of the shifted tensor
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
# Replace any remaining -100 values in the shifted tensor with pad_token_id
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
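A tiny numeric sketch of the shift (the token values and `pad_token_id=0`, `decoder_start_token_id=2` are arbitrary choices for illustration):

```
import torch

input_ids = torch.tensor([[5, 6, 7], [8, -100, -100]])
shifted = shift_tokens_right(input_ids, pad_token_id=0, decoder_start_token_id=2)
# Row 0: [5, 6, 7] -> [2, 5, 6]; row 1: [8, -100, -100] -> [2, 8, -100] -> trailing -100 replaced by pad
print(shifted)  # tensor([[2, 5, 6], [2, 8, 0]])
```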
class MusicgenSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int):
super().__init__()
self.embedding_dim = embedding_dim
# Build the sinusoidal weights via make_weights
self.make_weights(num_positions, embedding_dim)
def make_weights(self, num_embeddings: int, embedding_dim: int):
# Generate the sinusoidal positional-embedding weights
emb_weights = self.get_embedding(num_embeddings, embedding_dim)
if hasattr(self, "weights"):
# In the forward pass, put the weights on the correct dtype and device of the existing parameter
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int):
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
# Half of the embedding dimension is used for cos, the other half for sin
half_dim = embedding_dim // 2
# Compute the log-spaced frequencies
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
# Build the sinusoidal table by concatenating the cos and sin components
emb = torch.cat([torch.cos(emb), torch.sin(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
# Zero-pad the last column if embedding_dim is odd
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
return emb.to(torch.get_default_dtype())
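A quick shape check of the table built above (a sketch calling the static method directly): each row is the embedding for one position, with the cosine components in the first half and the sine components in the second half.

```
emb = MusicgenSinusoidalPositionalEmbedding.get_embedding(num_embeddings=10, embedding_dim=6)
print(emb.shape)  # torch.Size([10, 6])
# Position 0: cos(0) = 1 in the first half, sin(0) = 0 in the second half
print(emb[0])  # tensor([1., 1., 1., 0., 0., 0.])
```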
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
# Unpack batch size, number of codebooks, and sequence length from the input
bsz, codebooks, seq_len = input_ids.size()
# Create the position ids from the input token ids, offset by the past key/values length
position_ids = (torch.arange(seq_len) + past_key_values_length).to(input_ids.device)
# Expand the weight table if the sequence is longer than the current table
if seq_len > self.weights.size(0):
self.make_weights(seq_len + self.offset, self.embedding_dim)
# Select the weights for the given positions and detach them from the graph
return self.weights.index_select(0, position_ids.view(-1)).detach()
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Musicgen
class MusicgenAttention(nn.Module):
"""Multi-headed attention from the 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[MusicgenConfig] = None,
):
super().__init__()
# Store the model hyper-parameters
self.embed_dim = embed_dim  # embedding dimension
self.num_heads = num_heads  # number of attention heads
self.dropout = dropout  # dropout probability
self.head_dim = embed_dim // num_heads  # dimension of each attention head
self.config = config  # configuration object
# embed_dim must be divisible by num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5  # attention scaling factor
self.is_decoder = is_decoder  # whether this is decoder attention
self.is_causal = is_causal  # whether the attention is causal
# Linear projection layers
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # key projection
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # value projection
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # query projection
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)  # output projection
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# Reshape to (bsz, num_heads, seq_len, head_dim) for multi-head attention
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# Forward pass implementing the attention computation
# hidden_states: input hidden states
# key_value_states: key/value states for cross-attention (optional)
# past_key_value: cached key/value states (optional)
# attention_mask: attention mask (optional)
# layer_head_mask: per-head mask for this layer (optional)
# output_attentions: whether to return the attention weights
# 1. Project the hidden states to queries, keys and values
query = self.q_proj(hidden_states)
key = self.k_proj(key_value_states if key_value_states is not None else hidden_states)
value = self.v_proj(key_value_states if key_value_states is not None else hidden_states)
# 2. Reshape the tensors for parallel multi-head attention
query = self._shape(query, query.size(1), query.size(0))
key = self._shape(key, key.size(1), key.size(0))
value = self._shape(value, value.size(1), value.size(0))
# 3. Compute scaled attention scores and normalize them
attn_weights = torch.matmul(query, key.transpose(-1, -2))
attn_weights *= self.scaling
if attention_mask is not None:
attn_weights += attention_mask
attn_probs = nn.functional.softmax(attn_weights, dim=-1)
attn_probs = nn.functional.dropout(attn_probs, p=self.dropout, training=self.training)
# 4. Weighted sum of the values with the attention probabilities
attn_output = torch.matmul(attn_probs, value)
# 5. Merge the heads back into the original (bsz, seq_len, embed_dim) shape
attn_output = attn_output.transpose(1, 2).contiguous().view(attn_output.size(0), attn_output.size(2), -1)
# 6. Final output projection
attn_output = self.out_proj(attn_output)
return attn_output
# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
class MusicgenPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Use MusicgenDecoderConfig as the configuration class
config_class = MusicgenDecoderConfig
# Prefix for the base model weights
base_model_prefix = "model"
# Gradient checkpointing is supported
supports_gradient_checkpointing = True
# Modules that must not be split across devices
_no_split_modules = ["MusicgenDecoderLayer", "MusicgenAttention"]
def _init_weights(self, module):
# Read the initializer standard deviation from the config
std = self.config.initializer_factor
# Linear and 1D-convolution layers
if isinstance(module, (nn.Linear, nn.Conv1d)):
# Initialize the weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=std)
# Zero-initialize the bias, if present
if module.bias is not None:
module.bias.data.zero_()
# Embedding layers
elif isinstance(module, nn.Embedding):
# Initialize the weights from a normal distribution
module.weight.data.normal_(mean=0.0, std=std)
# Zero out the row of the padding index, if one is set
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
MUSICGEN_START_DOCSTRING = r"""
The Musicgen model was proposed in [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by
Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi, Alexandre Défossez. It is an
encoder decoder transformer trained on the task of conditional music generation.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MusicgenConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MUSICGEN_INPUTS_DOCSTRING = r"""
"""
MUSICGEN_DECODER_INPUTS_DOCSTRING = r"""
"""
class MusicgenDecoder(MusicgenPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MusicgenDecoderLayer`]
"""
def __init__(self, config: MusicgenDecoderConfig):
super().__init__(config)
# Dropout probability
self.dropout = config.dropout
# LayerDrop probability
self.layerdrop = config.layerdrop
# Maximum target sequence length
self.max_target_positions = config.max_position_embeddings
# Hidden size of the model
self.d_model = config.hidden_size
# Number of parallel codebooks
self.num_codebooks = config.num_codebooks
# Embedding scale: sqrt(hidden_size) if scale_embedding is enabled, otherwise 1.0
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
# One embedding table per codebook; vocab_size + 1 to account for the special pad/BOS token
embed_dim = config.vocab_size + 1
self.embed_tokens = nn.ModuleList(
[nn.Embedding(embed_dim, config.hidden_size) for _ in range(config.num_codebooks)]
)
# Sinusoidal positional embeddings
self.embed_positions = MusicgenSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.hidden_size,
)
# Stack of config.num_hidden_layers MusicgenDecoderLayer modules
self.layers = nn.ModuleList([MusicgenDecoderLayer(config) for _ in range(config.num_hidden_layers)])
# Final layer norm over the hidden size
self.layer_norm = nn.LayerNorm(config.hidden_size)
# Gradient checkpointing is off by default
self.gradient_checkpointing = False
# Run the remaining initialization (weight init and final processing)
self.post_init()
def get_input_embeddings(self):
# Return the list of per-codebook embedding tables
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
# Forward pass; the argument documentation lives in MUSICGEN_DECODER_INPUTS_DOCSTRING
@add_start_docstrings_to_model_forward(MUSICGEN_DECODER_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# add_start_docstrings attaches the shared description: the bare Musicgen decoder model outputting raw
# hidden states without any specific head on top (MUSICGEN_START_DOCSTRING holds the detailed text).
@add_start_docstrings(
"The bare Musicgen decoder model outputting raw hidden-states without any specific head on top.",
MUSICGEN_START_DOCSTRING,
)
class MusicgenModel(MusicgenPreTrainedModel):
def __init__(self, config: MusicgenDecoderConfig):
super().__init__(config)
# The underlying MusicgenDecoder stack
self.decoder = MusicgenDecoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the decoder's embedding tables
return self.decoder.embed_tokens
def set_input_embeddings(self, value):
self.decoder.embed_tokens = value
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(MUSICGEN_DECODER_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
# Fall back to the config defaults for any output flag that was not explicitly set
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
decoder_outputs = self.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_attention_mask=encoder_attention_mask,
encoder_hidden_states=encoder_hidden_states,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Return a plain tuple when return_dict is disabled
if not return_dict:
return decoder_outputs
# Otherwise wrap the decoder outputs together with past key/values and cross-attentions
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
hidden_states=decoder_outputs.hidden_states,
attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
)
# Decorator adds the shared docstring describing the decoder LM
@add_start_docstrings(
"The MusicGen decoder model with a language modelling head on top.",
MUSICGEN_START_DOCSTRING,
)
class MusicgenForCausalLM(MusicgenPreTrainedModel):
def __init__(self, config: MusicgenDecoderConfig):
super().__init__(config)
# Base MusicgenModel (decoder stack)
self.model = MusicgenModel(config)
# One language-model head per codebook
self.num_codebooks = config.num_codebooks
self.lm_heads = nn.ModuleList(
[nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(config.num_codebooks)]
)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the decoder's embedding tables
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
# Return the list of LM heads
return self.lm_heads
def set_output_embeddings(self, new_embeddings):
self.lm_heads = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@add_start_docstrings_to_model_forward(MUSICGEN_DECODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# Signature of the decoder LM forward pass, with the usual optional inputs and output-control flags
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
Returns:
"""
# Decide whether to return a ModelOutput or a plain tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the base model
outputs = self.model(
input_ids,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states from the decoder
hidden_states = outputs[0]
# Apply each codebook's LM head and stack the logits along a new codebook dimension
lm_logits = torch.stack([head(hidden_states) for head in self.lm_heads], dim=1)
loss = None
# Training is not implemented for Musicgen, so labels are rejected
if labels is not None:
raise NotImplementedError("Training is not implemented for Musicgen.")
# Fold the codebook dimension into the batch dimension:
# (bsz, num_codebooks, seq_len, vocab_size) -> (bsz * num_codebooks, seq_len, vocab_size)
lm_logits = lm_logits.reshape(-1, *lm_logits.shape[2:])
# Plain tuple output when return_dict is disabled
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a CausalLMOutputWithCrossAttentions
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
def prepare_inputs_for_generation(
self,
input_ids,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=True,
delay_pattern_mask=None,
guidance_scale=None,
**kwargs,
):
# Build a delay pattern mask if none was provided
if delay_pattern_mask is None:
input_ids, delay_pattern_mask = self.build_delay_pattern_mask(
input_ids,
pad_token_id=self.generation_config.pad_token_id,
max_length=self.generation_config.max_length,
)
# Apply the delay pattern mask to the input token ids
input_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
# For classifier-free guidance (guidance_scale > 1) duplicate the decoder inputs across the batch
# dimension (the two halves are split again before sampling)
if guidance_scale is not None and guidance_scale > 1:
input_ids = input_ids.repeat((2, 1))
if attention_mask is not None:
attention_mask = attention_mask.repeat((2, 1))
# With cached past key/values, only the last token id needs to be passed
if past_key_values is not None:
input_ids = input_ids[:, -1:]
# Return the keyword arguments consumed by the generation loop
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"encoder_hidden_states": encoder_hidden_states,
"encoder_attention_mask": encoder_attention_mask,
"head_mask": head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def apply_delay_pattern_mask(input_ids, decoder_pad_token_mask):
"""Apply a delay pattern mask to the decoder input ids, only preserving predictions where
the mask is set to -1, and otherwise setting to the value detailed in the mask."""
# Sequence length of the input token ids
seq_len = input_ids.shape[-1]
# Crop the decoder pad token mask to the current sequence length
decoder_pad_token_mask = decoder_pad_token_mask[..., :seq_len]
# Keep the predictions where the mask is -1; elsewhere use the value given by the mask
input_ids = torch.where(decoder_pad_token_mask == -1, input_ids, decoder_pad_token_mask)
return input_ids
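A small numeric sketch of the mask semantics (values are illustrative): wherever the mask holds `-1` the model's own token is kept, everywhere else the token is overwritten with the mask's value (typically the pad token id, 2048 for MusicGen).

```
import torch

input_ids = torch.tensor([[10, 11, 12, 13]])
mask = torch.tensor([[2048, -1, -1, 2048, 2048]])  # longer masks are cropped to seq_len
out = MusicgenForCausalLM.apply_delay_pattern_mask(input_ids, mask)
print(out)  # tensor([[2048,   11,   12, 2048]])
```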
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
synced_gpus: Optional[bool] = None,
streamer: Optional["BaseStreamer"] = None,
**kwargs,
@add_start_docstrings(
"The composite MusicGen model with a text encoder, audio encoder and Musicgen decoder, "
"for music generation tasks with one or both of text and audio prompts.",
MUSICGEN_START_DOCSTRING,
)
class MusicgenForConditionalGeneration(PreTrainedModel):
# Configuration class for the composite model
config_class = MusicgenConfig
# Prefix for the base model weights
base_model_prefix = "encoder_decoder"
# Main input name used by generate()
main_input_name = "input_ids"
# Gradient checkpointing is supported
supports_gradient_checkpointing = True
def __init__(
self,
config: Optional[MusicgenConfig] = None,
text_encoder: Optional[PreTrainedModel] = None,
audio_encoder: Optional[PreTrainedModel] = None,
decoder: Optional[MusicgenForCausalLM] = None,
):
# Constructor: takes a MusicgenConfig, or alternatively the text encoder, audio encoder and decoder sub-models
def tie_weights(self):
# Tie the text encoder and decoder weights when the config requests it
if self.config.tie_encoder_decoder:
decoder_base_model_prefix = self.decoder.base_model_prefix
self._tie_encoder_decoder_weights(
self.text_encoder, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix
)
def get_audio_encoder(self):
return self.audio_encoder
def get_text_encoder(self):
return self.text_encoder
def get_encoder(self):
# The text encoder computes the encoder hidden states for generation
return self.get_text_encoder()
def get_decoder(self):
return self.decoder
def get_input_embeddings(self):
# Input embeddings are those of the text encoder
return self.text_encoder.get_input_embeddings()
def get_output_embeddings(self):
# Output embeddings are those of the decoder
return self.decoder.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
return self.decoder.set_output_embeddings(new_embeddings)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r"""
Example:
```
>>> from transformers import MusicgenForConditionalGeneration
>>> model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
```"""
# Fast initialization is not yet supported for composite models
if kwargs.get("_fast_init", False):
logger.warning(
"Fast initialization is currently not supported for MusicgenForConditionalGeneration. "
"Falling back to slow initialization..."
)
kwargs["_fast_init"] = False
return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
@classmethod
def from_sub_models_pretrained(
cls,
text_encoder_pretrained_model_name_or_path: str = None,
audio_encoder_pretrained_model_name_or_path: str = None,
decoder_pretrained_model_name_or_path: str = None,
*model_args,
**kwargs,
):
# Instantiates the composite model from pretrained text encoder, audio encoder and decoder checkpoints
@add_start_docstrings_to_model_forward(MUSICGEN_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,  # input token ids
attention_mask: Optional[torch.BoolTensor] = None,  # encoder attention mask
input_values: Optional[torch.FloatTensor] = None,  # raw audio input values
padding_mask: Optional[torch.BoolTensor] = None,  # padding mask for the audio input
decoder_input_ids: Optional[torch.LongTensor] = None,  # decoder input token ids
decoder_attention_mask: Optional[torch.BoolTensor] = None,  # decoder attention mask
encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,  # precomputed encoder outputs
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,  # cached key/value states
inputs_embeds: Optional[torch.FloatTensor] = None,  # encoder input embeddings
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,  # decoder input embeddings
labels: Optional[torch.LongTensor] = None,  # target labels
use_cache: Optional[bool] = None,  # whether to use the key/value cache
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput dict
**kwargs,  # any additional keyword arguments
):
pass  # placeholder; the forward body is not shown in this excerpt
# Prepares the generation-loop inputs for the composite model
def prepare_inputs_for_generation(
self,
decoder_input_ids,  # decoder input token ids (required)
past_key_values=None,  # cached key/value states
attention_mask=None,  # encoder attention mask
head_mask=None,  # encoder head mask
decoder_attention_mask=None,  # decoder attention mask
decoder_head_mask=None,  # decoder head mask
cross_attn_head_mask=None,  # cross-attention head mask
use_cache=None,  # whether to use the key/value cache
encoder_outputs=None,  # precomputed encoder outputs
decoder_delay_pattern_mask=None,  # delay pattern mask for the decoder
guidance_scale=None,  # classifier-free guidance scale
**kwargs,  # any additional keyword arguments
):
# Build a decoder delay pattern mask if none was provided
if decoder_delay_pattern_mask is None:
decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
decoder_input_ids,
self.generation_config.pad_token_id,
max_length=self.generation_config.max_length,
)
# Apply the delay pattern mask to the decoder input ids
decoder_input_ids = self.decoder.apply_delay_pattern_mask(decoder_input_ids, decoder_delay_pattern_mask)
# For classifier-free guidance (guidance_scale > 1) duplicate the decoder inputs across the batch
# dimension (the two halves are split again before sampling)
if guidance_scale is not None and guidance_scale > 1:
decoder_input_ids = decoder_input_ids.repeat((2, 1))
if decoder_attention_mask is not None:
decoder_attention_mask = decoder_attention_mask.repeat((2, 1))
# With cached past key/values, strip the prefix that has already been processed
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input id
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to the old behavior: keep only the final id
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
# Return the inputs and masks consumed by the generation loop
return {
"input_ids": None,  # input_ids are not needed once encoder_outputs are defined
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def _prepare_decoder_input_ids_for_generation(
self,
batch_size: int,
model_input_name: str,
model_kwargs: Dict[str, torch.Tensor],
decoder_start_token_id: int = None,
bos_token_id: int = None,
device: torch.device = None,
) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
"""Prepares `decoder_input_ids` for generation with encoder-decoder models"""
# 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
# we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
# Retrieve `decoder_input_ids` from `model_kwargs` and remove it from the dictionary
decoder_input_ids = model_kwargs.pop("decoder_input_ids")
elif "input_ids" in model_kwargs and model_input_name != "input_ids":
# If `input_ids` is found in `model_kwargs` and it's not the main input name, assign it to `decoder_input_ids`
decoder_input_ids = model_kwargs.pop("input_ids")
else:
# If neither `decoder_input_ids` nor `input_ids` are provided, initialize `decoder_input_ids` as None
decoder_input_ids = None
# 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that.
# Get the special token ID to start `decoder_input_ids` sequence
decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
if device is None:
device = self.device
# Create a tensor to initialize `decoder_input_ids` starting with `decoder_start_token_id`
decoder_input_ids_start = (
torch.ones((batch_size * self.decoder.num_codebooks, 1), dtype=torch.long, device=device)
* decoder_start_token_id
)
# If no `decoder_input_ids` provided by the user, use `decoder_input_ids_start`
if decoder_input_ids is None:
decoder_input_ids = decoder_input_ids_start
# If user-provided `decoder_input_ids` does not start with `decoder_start_token_id`, prepend it
elif (decoder_input_ids[..., 0] != decoder_start_token_id).all().item():
decoder_input_ids = torch.cat([decoder_input_ids_start, decoder_input_ids], dim=-1)
# Adjust `decoder_attention_mask` if provided along with `decoder_input_ids`
if "decoder_attention_mask" in model_kwargs:
decoder_attention_mask = model_kwargs["decoder_attention_mask"]
decoder_attention_mask = torch.cat(
(torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
dim=-1,
)
model_kwargs["decoder_attention_mask"] = decoder_attention_mask
return decoder_input_ids, model_kwargs
# Signature restored from context: this helper prepares the text-encoder kwargs for generation
def _prepare_text_encoder_kwargs_for_generation(
self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None, guidance_scale: Optional[float] = None
) -> Dict[str, Any]:
# 1. Get the text encoder
encoder = self.get_text_encoder()
# 2. Prepare the encoder kwargs from the model kwargs, dropping decoder-specific arguments
irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
encoder_kwargs = {
argument: value
for argument, value in model_kwargs.items()
if not any(argument.startswith(p) for p in irrelevant_prefix)
}
# Inspect the encoder's forward signature
encoder_signature = set(inspect.signature(encoder.forward).parameters)
encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
# If the encoder does not accept wildcard kwargs, filter out anything not in its signature
if not encoder_accepts_wildcard:
encoder_kwargs = {
argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
}
# 3. Make sure the encoder returns a `ModelOutput`
model_input_name = model_input_name if model_input_name is not None else self.text_encoder.main_input_name
encoder_kwargs["return_dict"] = True
encoder_kwargs[model_input_name] = inputs_tensor
# Run the encoder forward pass and keep the last hidden state
last_hidden_state = encoder(**encoder_kwargs).last_hidden_state
# For classifier-free guidance (guidance_scale > 1) append a "null" all-zeros input to the encoder hidden states
if guidance_scale is not None and guidance_scale > 1:
last_hidden_state = torch.concatenate([last_hidden_state, torch.zeros_like(last_hidden_state)], dim=0)
if "attention_mask" in model_kwargs:
model_kwargs["attention_mask"] = torch.concatenate(
[model_kwargs["attention_mask"], torch.zeros_like(model_kwargs["attention_mask"])], dim=0
)
# Store the encoder output as a BaseModelOutput on the model kwargs
model_kwargs["encoder_outputs"] = BaseModelOutput(last_hidden_state=last_hidden_state)
return model_kwargs
def _prepare_audio_encoder_kwargs_for_generation(
self, input_values, model_kwargs, model_input_name: Optional[str] = None
):
raise NotImplementedError("This method is not implemented yet.")
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
# Build the decoder input ids by shifting the labels one position to the right
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def resize_token_embeddings(self, *args, **kwargs):
# Resizing the embeddings through the composite model directly is not supported; use the sub-models
raise NotImplementedError(
"Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the"
" respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
" model.decoder.resize_token_embeddings(...))"
)
def _maybe_initialize_input_ids_for_generation(
self,
inputs: Optional[torch.Tensor] = None,
bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
) -> torch.LongTensor:
"""Initializes input ids for generation, if necessary."""
# If inputs were already provided, return them as-is
if inputs is not None:
return inputs
# Check whether `encoder_outputs` are present in `model_kwargs`
encoder_outputs = model_kwargs.get("encoder_outputs")
if encoder_outputs is not None:
# Make dummy input_ids filled with the value -100, as a sanity check ensuring they won't be used for encoding
shape = encoder_outputs[0].size()[:-1]
return torch.ones(shape, dtype=torch.long, device=self.device) * -100
# If no `input_ids` are provided, a `bos_token_id` is required
if bos_token_id is None:
raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
# If there is some tensor in `model_kwargs`, infer the batch size from it. This is helpful for
# soft-prompting and decoder-based multimodal implementations.
batch_size = 1
for value in model_kwargs.values():
if isinstance(value, torch.Tensor):
batch_size = value.shape[0]
break
# Create a (batch_size, 1) tensor filled with bos_token_id on self.device
return torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * bos_token_id
# Helper to build null inputs for unconditional generation, so the model can run without a feature extractor or tokenizer
def get_unconditional_inputs(self, num_samples=1):
"""
Helper function to get null inputs for unconditional generation, enabling the model to be used without the
feature extractor or tokenizer.
Args:
num_samples (int, *optional*):
Number of audio samples to unconditionally generate.
max_new_tokens (int, *optional*):
Number of tokens to generate for each sample. More tokens means longer audio samples, at the expense of
longer inference (since more audio tokens need to be generated per sample).
Example:
```
>>> from transformers import MusicgenForConditionalGeneration
>>> model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
>>>
>>> unconditional_inputs = model.get_unconditional_inputs(num_samples=1)
>>> audio_samples = model.generate(**unconditional_inputs, max_new_tokens=256)
```"""
# All-zeros tensor standing in for the text-encoder hidden states, shape (num_samples, 1, hidden_size)
last_hidden_state = torch.zeros(
(num_samples, 1, self.config.text_encoder.hidden_size), device=self.device, dtype=self.dtype
)
# All-zeros attention mask of shape (num_samples, 1): every position is masked out
attention_mask = torch.zeros((num_samples, 1), device=self.device, dtype=torch.long)
# Package the null inputs in a MusicgenUnconditionalInput
return MusicgenUnconditionalInput(
encoder_outputs=(last_hidden_state,),  # encoder output containing the null hidden states
attention_mask=attention_mask,  # all-zeros mask: attend to nothing
guidance_scale=1.0,  # guidance scale of 1.0, i.e. no classifier-free guidance
)
.\models\musicgen\processing_musicgen.py
"""
Text/audio processor class for MusicGen
"""
from typing import List, Optional
import numpy as np
from ...processing_utils import ProcessorMixin
from ...utils import to_numpy
class MusicgenProcessor(ProcessorMixin):
r"""
Constructs a MusicGen processor which wraps an EnCodec feature extractor and a T5 tokenizer into a single processor
class.
[`MusicgenProcessor`] offers all the functionalities of [`EncodecFeatureExtractor`] and [`T5Tokenizer`]. See
[`~MusicgenProcessor.__call__`] and [`~MusicgenProcessor.decode`] for more information.
Args:
feature_extractor (`EncodecFeatureExtractor`):
An instance of [`EncodecFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`T5Tokenizer`):
An instance of [`T5Tokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "EncodecFeatureExtractor"
tokenizer_class = ("T5Tokenizer", "T5TokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
def __call__(self, *args, **kwargs):
"""
Forwards the `audio` argument to EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`] and the `text`
argument to [`~T5Tokenizer.__call__`]. Please refer to the docstring of the above two methods for more
information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if text is not None:
inputs = self.tokenizer(text, **kwargs)
if audio is not None:
audio_inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if audio is None:
return inputs
elif text is None:
return audio_inputs
else:
inputs["input_values"] = audio_inputs["input_values"]
if "padding_mask" in audio_inputs:
inputs["padding_mask"] = audio_inputs["padding_mask"]
return inputs
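Typical text-only usage of the processor (a short sketch; the facebook/musicgen-small repository hosts a matching processor configuration): the call is routed to the T5 tokenizer, so the result carries `input_ids` and `attention_mask`.

```
from transformers import MusicgenProcessor

processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
inputs = processor(text=["80s pop track with bassy drums and synth"], padding=True, return_tensors="pt")
print(list(inputs.keys()))  # ['input_ids', 'attention_mask']
```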
def batch_decode(self, *args, **kwargs):
"""
This method is used to decode either batches of audio outputs from the MusicGen model, or batches of token ids
from the tokenizer. In the case of decoding token ids, this method forwards all its arguments to T5Tokenizer's
[`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information.
"""
audio_values = kwargs.pop("audio", None)
padding_mask = kwargs.pop("padding_mask", None)
if len(args) > 0:
audio_values = args[0]
args = args[1:]
if audio_values is not None:
return self._decode_audio(audio_values, padding_mask=padding_mask)
else:
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to T5Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def _decode_audio(self, audio_values, padding_mask: Optional[np.ndarray] = None) -> List[np.ndarray]:
"""
This method strips any padding from the audio values to return a list of numpy audio arrays.
"""
audio_values = to_numpy(audio_values)
bsz, channels, seq_len = audio_values.shape
if padding_mask is None:
return list(audio_values)
padding_mask = to_numpy(padding_mask)
difference = seq_len - padding_mask.shape[-1]
padding_value = 1 - self.feature_extractor.padding_value
padding_mask = np.pad(padding_mask, ((0, 0), (0, difference)), "constant", constant_values=padding_value)
audio_values = audio_values.tolist()
for i in range(bsz):
sliced_audio = np.asarray(audio_values[i])[
padding_mask[i][None, :] != self.feature_extractor.padding_value
]
audio_values[i] = sliced_audio.reshape(channels, -1)
return audio_values
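# A toy sketch of the masking trick above (assuming EnCodec's default padding_value of
# 0.0): positions where the mask differs from the padding value are kept, which strips
# the trailing padding.
import numpy as np

audio = np.arange(6, dtype=np.float32).reshape(1, 1, 6)  # (bsz=1, channels=1, seq_len=6)
mask = np.array([[1, 1, 1, 1, 0, 0]])                    # last two samples are padding
kept = np.asarray(audio[0])[mask[0][None, :] != 0]
print(kept.reshape(1, -1))                               # [[0. 1. 2. 3.]]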
.\models\musicgen\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_musicgen": [
"MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP",
"MusicgenConfig",
"MusicgenDecoderConfig",
],
"processing_musicgen": ["MusicgenProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_musicgen"] = [
"MUSICGEN_PRETRAINED_MODEL_ARCHIVE_LIST",
"MusicgenForConditionalGeneration",
"MusicgenForCausalLM",
"MusicgenModel",
"MusicgenPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_musicgen import (
MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP,
MusicgenConfig,
MusicgenDecoderConfig,
)
from .processing_musicgen import MusicgenProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_musicgen import (
MUSICGEN_PRETRAINED_MODEL_ARCHIVE_LIST,
MusicgenForCausalLM,
MusicgenForConditionalGeneration,
MusicgenModel,
MusicgenPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\musicgen_melody\configuration_musicgen_melody.py
""" Musicgen Melody model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import AutoConfig
logger = logging.get_logger(__name__)
MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/musicgen-melody": "https://huggingface.co/facebook/musicgen-melody/resolve/main/config.json",
}
class MusicgenMelodyDecoderConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`MusicgenMelodyDecoder`]. It is used to instantiate a
Musicgen Melody decoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Musicgen Melody
[facebook/musicgen-melody](https://huggingface.co/facebook/musicgen-melody) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "musicgen_melody_decoder"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=2048,
max_position_embeddings=2048,
num_hidden_layers=24,
ffn_dim=4096,
num_attention_heads=16,
layerdrop=0.0,
use_cache=True,
activation_function="gelu",
hidden_size=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
initializer_factor=0.02,
scale_embedding=False,
num_codebooks=4,
audio_channels=1,
pad_token_id=2048,
bos_token_id=2048,
eos_token_id=None,
tie_word_embeddings=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.ffn_dim = ffn_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.initializer_factor = initializer_factor
self.layerdrop = layerdrop
self.use_cache = use_cache
self.scale_embedding = scale_embedding
self.num_codebooks = num_codebooks
if audio_channels not in [1, 2]:
raise ValueError(f"Expected 1 (mono) or 2 (stereo) audio channels, got {audio_channels} channels.")
self.audio_channels = audio_channels
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
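# A minimal instantiation sketch (values are illustrative, mirroring the stereo checkpoints):
config = MusicgenMelodyDecoderConfig(hidden_size=1536, num_codebooks=8, audio_channels=2)
print(config.ffn_dim, config.vocab_size)  # 4096 2048 (defaults are retained)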
class MusicgenMelodyConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MusicgenMelodyModel`]. It is used to instantiate a
Musicgen Melody model according to the specified arguments, defining the text encoder, audio encoder and Musicgen Melody decoder
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the Musicgen Melody
[facebook/musicgen-melody](https://huggingface.co/facebook/musicgen-melody) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_chroma (`int`, *optional*, defaults to 12): Number of chroma bins to use.
chroma_length (`int`, *optional*, defaults to 235):
Maximum chroma duration if audio is used to condition the model. Corresponds to the maximum duration used during training.
kwargs (*optional*):
Dictionary of keyword arguments. Notably:
- **text_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the text encoder config.
- **audio_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
defines the audio encoder config.
- **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
the decoder config.
Example:
```
>>> from transformers import (
... MusicgenMelodyConfig,
... MusicgenMelodyDecoderConfig,
... T5Config,
... EncodecConfig,
... MusicgenMelodyForConditionalGeneration,
... )
>>> # Initializing text encoder, audio encoder, and decoder model configurations
>>> text_encoder_config = T5Config()
>>> audio_encoder_config = EncodecConfig()
>>> decoder_config = MusicgenMelodyDecoderConfig()
>>> configuration = MusicgenMelodyConfig.from_sub_models_config(
... text_encoder_config, audio_encoder_config, decoder_config
... )
>>> # Initializing a MusicgenMelodyForConditionalGeneration (with random weights) from the facebook/musicgen-melody style configuration
>>> model = MusicgenMelodyForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> config_text_encoder = model.config.text_encoder
>>> config_audio_encoder = model.config.audio_encoder
>>> config_decoder = model.config.decoder
>>> # Saving the model, including its configuration
>>> model.save_pretrained("musicgen_melody-model")
>>> # loading model and config from pretrained folder
>>> musicgen_melody_config = MusicgenMelodyConfig.from_pretrained("musicgen_melody-model")
>>> model = MusicgenMelodyForConditionalGeneration.from_pretrained("musicgen_melody-model", config=musicgen_melody_config)
```"""
# Model type identifier for this configuration
model_type = "musicgen_melody"
# Mark this configuration as a composition of several sub-model configurations
is_composition = True
def __init__(
self,
num_chroma=12,
chroma_length=235,
**kwargs,
):
super().__init__(**kwargs)
# Raise unless the text_encoder, audio_encoder and decoder configs were all provided
if "text_encoder" not in kwargs or "audio_encoder" not in kwargs or "decoder" not in kwargs:
raise ValueError("Config has to be initialized with text_encoder, audio_encoder and decoder config")
# Pop the text encoder config and its model type
text_encoder_config = kwargs.pop("text_encoder")
text_encoder_model_type = text_encoder_config.pop("model_type")
# Pop the audio encoder config and its model type
audio_encoder_config = kwargs.pop("audio_encoder")
audio_encoder_model_type = audio_encoder_config.pop("model_type")
# Pop the decoder config
decoder_config = kwargs.pop("decoder")
# Instantiate the sub-configs: AutoConfig resolves the encoder types, the decoder uses its dedicated class
self.text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder_config)
self.audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder_config)
self.decoder = MusicgenMelodyDecoderConfig(**decoder_config)
self.is_encoder_decoder = False
# Store the number of chroma bins and the maximum chroma length
self.num_chroma = num_chroma
self.chroma_length = chroma_length
@classmethod
def from_sub_models_config(
cls,
text_encoder_config: PretrainedConfig,
audio_encoder_config: PretrainedConfig,
decoder_config: MusicgenMelodyDecoderConfig,
**kwargs,
):
r"""
从文本编码器、音频编码器和解码器配置实例化一个 MusicgenMelodyConfig(或其派生类)。
Returns:
[`MusicgenMelodyConfig`]: 配置对象的一个实例
"""
# 使用给定的配置实例化当前类的对象
return cls(
text_encoder=text_encoder_config.to_dict(),
audio_encoder=audio_encoder_config.to_dict(),
decoder=decoder_config.to_dict(),
**kwargs,
)
@property
# This is a property because you may want to change the audio codec model dynamically
def sampling_rate(self):
# Return the sampling rate of the audio encoder
return self.audio_encoder.sampling_rate
.\models\musicgen_melody\convert_musicgen_melody_transformers.py
"""Convert Musicgen Melody checkpoints from the original repository."""
import argparse
from pathlib import Path
from typing import Dict, OrderedDict, Tuple
import torch
from audiocraft.models import MusicGen
from transformers import (
AutoTokenizer,
EncodecModel,
T5EncoderModel,
)
from transformers.models.musicgen_melody.configuration_musicgen_melody import MusicgenMelodyDecoderConfig
from transformers.models.musicgen_melody.feature_extraction_musicgen_melody import MusicgenMelodyFeatureExtractor
from transformers.models.musicgen_melody.modeling_musicgen_melody import (
MusicgenMelodyForCausalLM,
MusicgenMelodyForConditionalGeneration,
)
from transformers.models.musicgen_melody.processing_musicgen_melody import MusicgenMelodyProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
EXPECTED_MISSING_KEYS = ["model.decoder.embed_positions.weights"]
EXPECTED_ADDITIONAL_KEYS = ["condition_provider.conditioners.self_wav.chroma.spec.window"]
def rename_keys(name):
if "emb" in name:
name = name.replace("emb", "model.decoder.embed_tokens")
if "transformer" in name:
name = name.replace("transformer", "model.decoder")
if "cross_attention" in name:
name = name.replace("cross_attention", "encoder_attn")
if "linear1" in name:
name = name.replace("linear1", "fc1")
if "linear2" in name:
name = name.replace("linear2", "fc2")
if "norm1" in name:
name = name.replace("norm1", "self_attn_layer_norm")
if "norm_cross" in name:
name = name.replace("norm_cross", "encoder_attn_layer_norm")
if "norm2" in name:
name = name.replace("norm2", "final_layer_norm")
if "out_norm" in name:
name = name.replace("out_norm", "model.decoder.layer_norm")
if "linears" in name:
name = name.replace("linears", "lm_heads")
if "condition_provider.conditioners.description.output_proj" in name:
name = name.replace("condition_provider.conditioners.description.output_proj", "enc_to_dec_proj")
if "condition_provider.conditioners.self_wav.output_proj" in name:
name = name.replace("condition_provider.conditioners.self_wav.output_proj", "audio_enc_to_dec_proj")
return name
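# A quick sanity check of the renaming rules above (the key is illustrative):
print(rename_keys("transformer.layers.0.linear1.weight"))
# -> model.decoder.layers.0.fc1.weight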
def rename_state_dict(state_dict: OrderedDict, hidden_size: int) -> Tuple[Dict, Dict]:
"""Function that takes the fairseq MusicgenMelody state dict and renames it according to the HF
module names. It further partitions the state dict into the decoder (LM) state dict, and that for the
text encoder projection and for the audio encoder projection."""
keys = list(state_dict.keys())
enc_dec_proj_state_dict = {}
audio_enc_to_dec_proj_state_dict = {}
for key in keys:
val = state_dict.pop(key)
key = rename_keys(key)
if "in_proj_weight" in key:
state_dict[key.replace("in_proj_weight", "q_proj.weight")] = val[:hidden_size, :]
state_dict[key.replace("in_proj_weight", "k_proj.weight")] = val[hidden_size : 2 * hidden_size, :]
state_dict[key.replace("in_proj_weight", "v_proj.weight")] = val[-hidden_size:, :]
elif "audio_enc_to_dec_proj" in key:
audio_enc_to_dec_proj_state_dict[key[len("audio_enc_to_dec_proj.") :]] = val
elif "enc_to_dec_proj" in key:
enc_dec_proj_state_dict[key[len("enc_to_dec_proj.") :]] = val
else:
state_dict[key] = val
return state_dict, enc_dec_proj_state_dict, audio_enc_to_dec_proj_state_dict
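# A toy sketch of the fused-QKV split above: audiocraft stores `in_proj_weight` as a
# single (3 * hidden_size, hidden_size) matrix that is cut into equal q/k/v blocks.
import torch

hidden_size = 4
fused = torch.randn(3 * hidden_size, hidden_size)
q = fused[:hidden_size, :]
k = fused[hidden_size : 2 * hidden_size, :]
v = fused[-hidden_size:, :]
assert q.shape == k.shape == v.shape == (hidden_size, hidden_size)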
def decoder_config_from_checkpoint(checkpoint: str) -> MusicgenMelodyDecoderConfig:
if checkpoint == "facebook/musicgen-melody" or checkpoint == "facebook/musicgen-stereo-melody":
hidden_size = 1536
num_hidden_layers = 48
num_attention_heads = 24
elif checkpoint == "facebook/musicgen-melody-large" or checkpoint == "facebook/musicgen-stereo-melody-large":
hidden_size = 2048
num_hidden_layers = 48
num_attention_heads = 32
else:
raise ValueError(
"Checkpoint should be one of `['facebook/musicgen-melody', 'facebook/musicgen-melody-large']` for the mono checkpoints, "
"or `['facebook/musicgen-stereo-melody', 'facebook/musicgen-stereo-melody-large']` "
f"for the stereo checkpoints, got {checkpoint}."
)
if "stereo" in checkpoint:
audio_channels = 2
num_codebooks = 8
else:
audio_channels = 1
num_codebooks = 4
config = MusicgenMelodyDecoderConfig(
hidden_size=hidden_size,
ffn_dim=hidden_size * 4,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_codebooks=num_codebooks,
audio_channels=audio_channels,
)
return config
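# A quick usage sketch of the checkpoint-to-config mapping above:
config = decoder_config_from_checkpoint("facebook/musicgen-stereo-melody")
print(config.hidden_size, config.num_codebooks, config.audio_channels)  # 1536 8 2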
@torch.no_grad()
def convert_musicgen_melody_checkpoint(
checkpoint, pytorch_dump_folder=None, repo_id=None, device="cpu", test_same_output=False
):
fairseq_model = MusicGen.get_pretrained(checkpoint, device=device)  # use the `device` argument, not the global `args`
decoder_config = decoder_config_from_checkpoint(checkpoint)
decoder_state_dict = fairseq_model.lm.state_dict()
decoder_state_dict, enc_dec_proj_state_dict, audio_enc_to_dec_proj_state_dict = rename_state_dict(
decoder_state_dict, hidden_size=decoder_config.hidden_size
)
text_encoder = T5EncoderModel.from_pretrained("t5-base")
audio_encoder = EncodecModel.from_pretrained("facebook/encodec_32khz")
decoder = MusicgenMelodyForCausalLM(decoder_config).eval()
missing_keys, unexpected_keys = decoder.load_state_dict(decoder_state_dict, strict=False)
for key in missing_keys.copy():
if key.startswith(("text_encoder", "audio_encoder")) or key in EXPECTED_MISSING_KEYS:
missing_keys.remove(key)
for key in unexpected_keys.copy():
if key in EXPECTED_ADDITIONAL_KEYS:
unexpected_keys.remove(key)
if len(missing_keys) > 0:
raise ValueError(f"Missing key(s) in state_dict: {missing_keys}")
if len(unexpected_keys) > 0:
raise ValueError(f"Unexpected key(s) in state_dict: {unexpected_keys}")
model = MusicgenMelodyForConditionalGeneration(
text_encoder=text_encoder, audio_encoder=audio_encoder, decoder=decoder
).to(device)
model.enc_to_dec_proj.load_state_dict(enc_dec_proj_state_dict)
model.audio_enc_to_dec_proj.load_state_dict(audio_enc_to_dec_proj_state_dict)
input_ids = torch.arange(0, 2 * decoder_config.num_codebooks, dtype=torch.long).reshape(2, -1).to(device)
decoder_input_ids = input_ids.reshape(2 * decoder_config.num_codebooks, -1).to(device)
with torch.no_grad():
logits = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits
output_length = 1 + input_ids.shape[1] + model.config.chroma_length
if logits.shape != (2 * decoder_config.num_codebooks, output_length, 2048):
raise ValueError("Incorrect shape for logits")
tokenizer = AutoTokenizer.from_pretrained("t5-base")
feature_extractor = MusicgenMelodyFeatureExtractor()
processor = MusicgenMelodyProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
model.generation_config.decoder_start_token_id = 2048
model.generation_config.pad_token_id = 2048
model.generation_config.max_length = int(30 * audio_encoder.config.frame_rate)
model.generation_config.do_sample = True
model.generation_config.guidance_scale = 3.0
if test_same_output:
decoder_input_ids = torch.ones_like(decoder_input_ids).to(device) * model.generation_config.pad_token_id
with torch.no_grad():
decoder_input_ids = decoder_input_ids[: decoder_config.num_codebooks]
inputs = processor(text=["gen"], return_tensors="pt", padding=True).to(device)
logits = model(**inputs, decoder_input_ids=decoder_input_ids).logits
attributes, prompt_tokens = fairseq_model._prepare_tokens_and_attributes(["gen"], None)
original_logits = fairseq_model.lm.forward(
decoder_input_ids.reshape(1, decoder_config.num_codebooks, -1), attributes
)
torch.testing.assert_close(
original_logits.squeeze(2).reshape(decoder_config.num_codebooks, -1),
logits[:, -1],
rtol=1e-5,
atol=5e-5,
)
if pytorch_dump_folder is not None:
Path(pytorch_dump_folder).mkdir(exist_ok=True)
logger.info(f"Saving model {checkpoint} to {pytorch_dump_folder}")
model.save_pretrained(pytorch_dump_folder)
processor.save_pretrained(pytorch_dump_folder)
if repo_id:
logger.info(f"Pushing model {checkpoint} to {repo_id}")
model.push_to_hub(repo_id, create_pr=True)
processor.push_to_hub(repo_id, create_pr=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint",
default="facebook/musicgen-melody",
type=str,
help="Checkpoint size of the Musicgen Melody model you'd like to convert. Can be one of: "
"`['facebook/musicgen-melody', 'facebook/musicgen-melody-large']` for the mono checkpoints, or "
"`['facebook/musicgen-stereo-melody', 'facebook/musicgen-stereo-melody-large']` "
"for the stereo checkpoints.",
)
parser.add_argument(
"--pytorch_dump_folder",
default=None,
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub",
default="musicgen-melody",
type=str,
help="Where to upload the converted model on the 🤗 hub.",
)
parser.add_argument(
"--device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda."
)
parser.add_argument("--test_same_output", default=False, type=bool, help="If `True`, test if same output logits.")
args = parser.parse_args()
convert_musicgen_melody_checkpoint(
args.checkpoint, args.pytorch_dump_folder, args.push_to_hub, args.device, args.test_same_output
)
.\models\musicgen_melody\feature_extraction_musicgen_melody.py
"""
Feature extractor class for Musicgen Melody
"""
import copy
from typing import Any, Dict, List, Optional, Union
import numpy as np
from ...audio_utils import chroma_filter_bank
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import TensorType, is_torch_available, is_torchaudio_available, logging
if is_torch_available():
import torch
if is_torchaudio_available():
import torchaudio
logger = logging.get_logger(__name__)
class MusicgenMelodyFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a MusicgenMelody feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.
This class extracts chroma features from audio processed by [Demucs](https://github.com/adefossez/demucs/tree/main) or
directly from raw audio waveform.
"""
model_input_names = ["input_features"]
def __init__(
self,
feature_size=12,
sampling_rate=32000,
hop_length=4096,
chunk_length=30,
n_fft=16384,
num_chroma=12,
padding_value=0.0,
return_attention_mask=False,
stem_indices=[3, 2],
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
return_attention_mask=return_attention_mask,
**kwargs,
)
self.n_fft = n_fft
self.hop_length = hop_length
self.chunk_length = chunk_length
self.n_samples = chunk_length * sampling_rate
self.sampling_rate = sampling_rate
self.chroma_filters = torch.from_numpy(
chroma_filter_bank(sampling_rate=sampling_rate, num_frequency_bins=n_fft, tuning=0, num_chroma=num_chroma)
).float()
self.spectrogram = torchaudio.transforms.Spectrogram(
n_fft=n_fft, win_length=n_fft, hop_length=hop_length, power=2, center=True, pad=0, normalized=True
)
self.stem_indices = stem_indices
def _torch_extract_fbank_features(self, waveform: torch.Tensor) -> torch.Tensor:
"""
Compute the chroma spectrogram of the provided audio using the torchaudio spectrogram implementation and the librosa chroma features.
"""
wav_length = waveform.shape[-1]
if wav_length < self.n_fft:
pad = self.n_fft - wav_length
rest = 0 if pad % 2 == 0 else 1
waveform = torch.nn.functional.pad(waveform, (pad // 2, pad // 2 + rest), "constant", 0)
spec = self.spectrogram(waveform).squeeze(1)
raw_chroma = torch.einsum("cf, ...ft->...ct", self.chroma_filters, spec)
norm_chroma = torch.nn.functional.normalize(raw_chroma, p=float("inf"), dim=-2, eps=1e-6)
norm_chroma = norm_chroma.transpose(1, 2)
idx = norm_chroma.argmax(-1, keepdim=True)
norm_chroma[:] = 0
norm_chroma.scatter_(dim=-1, index=idx, value=1)
return norm_chroma
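# A toy sketch of the argmax quantization at the end of this method: each frame is
# reduced to a one-hot chroma vector.
import torch

chroma = torch.tensor([[[0.2, 0.9, 0.1], [0.5, 0.1, 0.4]]])  # (batch, time, num_chroma)
idx = chroma.argmax(-1, keepdim=True)
chroma[:] = 0
chroma.scatter_(dim=-1, index=idx, value=1)
print(chroma)  # tensor([[[0., 1., 0.], [1., 0., 0.]]])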
def _extract_stem_indices(self, audio, sampling_rate=None):
"""
Extracts stems from the output of the [Demucs](https://github.com/adefossez/demucs/tree/main) audio separation model,
then converts to mono-channel and resample to the feature extractor sampling rate.
Args:
audio (`torch.Tensor` of shape `(batch_size, num_stems, channel_size, audio_length)`):
The output of the Demucs model to be processed.
sampling_rate (`int`, *optional*):
Demucs sampling rate. If not specified, defaults to `44000`.
"""
sampling_rate = 44000 if sampling_rate is None else sampling_rate
wav = audio[:, torch.tensor(self.stem_indices)]
wav = wav.sum(1)
wav = wav.mean(dim=1, keepdim=True)
if sampling_rate != self.sampling_rate:
wav = torchaudio.functional.resample(
wav, sampling_rate, self.sampling_rate, rolloff=0.945, lowpass_filter_width=24
)
wav = wav.squeeze(1)
return wav
def __call__(
self,
audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
truncation: bool = True,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = None,
padding: Optional[str] = True,
max_length: Optional[int] = None,
sampling_rate: Optional[int] = None,
**kwargs,
):
"""
调用函数,用于处理音频数据并返回处理后的结果。
Parameters:
audio (Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]):
输入的音频数据,可以是 numpy 数组,列表或嵌套列表形式。
truncation (bool, optional):
是否对音频进行截断,默认为 True。
pad_to_multiple_of (Optional[int], optional):
可选参数,对音频进行填充的倍数。
return_tensors (Optional[Union[str, TensorType]], optional):
可选参数,指定返回的数据类型,如字符串或张量类型。
return_attention_mask (Optional[bool], optional):
可选参数,是否返回注意力掩码。
padding (Optional[str], optional):
可选参数,是否进行填充,默认为 True。
max_length (Optional[int], optional):
可选参数,最大长度限制。
sampling_rate (Optional[int], optional):
可选参数,采样率。
**kwargs:
其他关键字参数。
Returns:
返回处理后的音频数据或特征。
"""
pass
def to_dict(self) -> Dict[str, Any]:
"""
将当前实例序列化为 Python 字典。
Returns:
`Dict[str, Any]`: 包含所有配置实例属性的字典。
"""
output = copy.deepcopy(self.__dict__)
output["feature_extractor_type"] = self.__class__.__name__
if "mel_filters" in output:
del output["mel_filters"]
if "window" in output:
del output["window"]
if "chroma_filters" in output:
del output["chroma_filters"]
if "spectrogram" in output:
del output["spectrogram"]
return output
.\models\musicgen_melody\modeling_musicgen_melody.py
""" PyTorch Musicgen Melody model."""
import copy
import inspect
import math
import random
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...generation.configuration_utils import GenerationConfig
from ...generation.logits_process import ClassifierFreeGuidanceLogitsProcessor, LogitsProcessorList
from ...generation.stopping_criteria import StoppingCriteriaList
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPast,
ModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ..auto.configuration_auto import AutoConfig
from ..auto.modeling_auto import AutoModel, AutoModelForTextEncoding
from .configuration_musicgen_melody import MusicgenMelodyConfig, MusicgenMelodyDecoderConfig
if TYPE_CHECKING:
from ...generation.streamers import BaseStreamer
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "MusicgenMelodyConfig"
_CHECKPOINT_FOR_DOC = "facebook/musicgen-melody"
MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/musicgen-melody",
]
@dataclass
class MusicgenMelodyOutputWithPast(ModelOutput):
"""
Base class for Musicgen Melody autoregressive outputs.
"""
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
语言建模损失(在提供 `labels` 时返回)。
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
语言建模头的预测分数(SoftMax 之前的每个词汇标记的分数)。
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, 当传递 `use_cache=True` 或 `config.use_cache=True` 时返回):
长度为 `config.n_layers` 的 `tuple(torch.FloatTensor)` 的元组,每个元组包含 2 个张量,形状为
`(batch_size, num_heads, sequence_length, embed_size_per_head)`。
包含预计算的隐藏状态(在自注意力块中的键和值),可用于加速顺序解码。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当传递 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
`torch.FloatTensor` 的元组(如果模型有嵌入层则包含嵌入层的输出 + 每层的输出),形状为 `(batch_size, sequence_length, hidden_size)`。
每层模型的隐藏状态以及可选的初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, 当传递 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
`torch.FloatTensor` 的元组(每个层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
自注意力头中注意力 softmax 后的注意力权重,用于计算自注意力头中加权平均值。
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
条件隐藏状态序列,表示文本编码器输出和音频编码器输出的投影连接。
作为条件信号使用。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_hidden_states: Optional[torch.FloatTensor] = None
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
if decoder_start_token_id is None:
raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
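# A toy check of the shift above (2048 is the MusicGen pad/decoder-start token id used
# later in this file; the example values are illustrative):
import torch

labels = torch.tensor([[5, 6, 7]])
print(shift_tokens_right(labels, pad_token_id=2048, decoder_start_token_id=2048))
# tensor([[2048,    5,    6]])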
class MusicgenMelodySinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int):
super().__init__()
self.embedding_dim = embedding_dim
self.make_weights(num_positions, embedding_dim)
def make_weights(self, num_embeddings: int, embedding_dim: int):
emb_weights = self.get_embedding(num_embeddings, embedding_dim)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int):
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.cos(emb), torch.sin(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
return emb.to(torch.get_default_dtype())
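# A quick shape check of the sinusoidal table built above (a usage sketch; `get_embedding`
# is a staticmethod, so it can be called without instantiating the module):
weights = MusicgenMelodySinusoidalPositionalEmbedding.get_embedding(num_embeddings=10, embedding_dim=8)
print(weights.shape)  # torch.Size([10, 8])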
@torch.no_grad()
def forward(self, inputs_embeds: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len, _ = inputs_embeds.size()
position_ids = (torch.arange(seq_len) + past_key_values_length).to(inputs_embeds.device)
if seq_len > self.weights.size(0):
self.make_weights(seq_len + self.offset, self.embedding_dim)
return self.weights.index_select(0, position_ids.view(-1)).detach()
class MusicgenMelodyAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[MusicgenMelodyConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
"""重塑张量形状以适应多头注意力的输入要求"""
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
"""
执行注意力机制的前向传播
Args:
hidden_states: 输入的隐藏状态张量
key_value_states: 可选的键值状态张量(用于encoder-decoder注意力)
past_key_value: 可选的过去的键值对(用于加速Transformer解码器的计算)
attention_mask: 可选的注意力掩码张量
layer_head_mask: 可选的层级头掩码张量(用于控制每个头的选择性)
output_attentions: 是否输出注意力权重
Returns:
tuple:
- attention_output: 经过注意力机制后的输出张量
- attention_weights: 注意力权重(如果output_attentions为True时)
"""
pass
class MusicgenMelodyDecoderLayer(nn.Module):
def __init__(self, config: MusicgenMelodyDecoderConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = MusicgenMelodyAttention(
embed_dim=self.embed_dim,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
bias=False,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=False)
self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=False)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size `(attention_heads,)`.
past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class MusicgenMelodyPreTrainedModel(PreTrainedModel):
"""
用于处理权重初始化、下载和加载预训练模型的抽象类。
Attributes:
config_class: 与该模型相关的配置类 MusicgenMelodyDecoderConfig
base_model_prefix: 模型的基础名称前缀为 "model"
supports_gradient_checkpointing: 支持梯度检查点
_no_split_modules: 不需要拆分的模块列表,包括 "MusicgenMelodyDecoderLayer" 和 "MusicgenMelodyAttention"
"""
def _init_weights(self, module):
"""
初始化给定模块的权重。
Args:
module: 要初始化权重的模块
Notes:
根据模块类型不同,使用配置的初始化因子初始化权重和偏置。
"""
std = self.config.initializer_factor
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
MUSICGEN_MELODY_START_DOCSTRING = r"""
The Musicgen Melody model was proposed in [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet et al. It is a decoder-only transformer for conditional music generation.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its models (such as downloading or saving a model, resizing the input embeddings, pruning heads etc.).
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and behavior.
Parameters:
config ([`MusicgenMelodyConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MUSICGEN_MELODY_INPUTS_DOCSTRING = r"""
"""
MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING = r"""
"""
class MusicgenMelodyDecoder(MusicgenMelodyPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MusicgenMelodyDecoderLayer`].
"""
def __init__(self, config: MusicgenMelodyDecoderConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.layerdrop
self.max_target_positions = config.max_position_embeddings
self.d_model = config.hidden_size
self.num_codebooks = config.num_codebooks
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
embed_dim = config.vocab_size + 1
self.embed_tokens = nn.ModuleList(
[nn.Embedding(embed_dim, config.hidden_size) for _ in range(config.num_codebooks)]
)
self.embed_positions = MusicgenMelodySinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.hidden_size,
)
self.layers = nn.ModuleList([MusicgenMelodyDecoderLayer(config) for _ in range(config.num_hidden_layers)])
self.layer_norm = nn.LayerNorm(config.hidden_size)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass  # body elided in this walkthrough
@add_start_docstrings(
"The bare MusicgenMelody decoder model outputting raw hidden-states without any specific head on top.",
MUSICGEN_MELODY_START_DOCSTRING,
)
class MusicgenMelodyModel(MusicgenMelodyPreTrainedModel):
def __init__(self, config: MusicgenMelodyDecoderConfig):
super().__init__(config)
self.decoder = MusicgenMelodyDecoder(config)
self.post_init()
def get_input_embeddings(self):
return self.decoder.embed_tokens
def set_input_embeddings(self, value):
self.decoder.embed_tokens = value
def get_decoder(self):
return self.decoder
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
decoder_outputs = self.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return decoder_outputs
return BaseModelOutputWithPast(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
hidden_states=decoder_outputs.hidden_states,
attentions=decoder_outputs.attentions,
)
@add_start_docstrings(
"The Musicgen Melody decoder model with a language modelling head on top.",
MUSICGEN_MELODY_START_DOCSTRING,
)
class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel):
def __init__(self, config: MusicgenMelodyDecoderConfig):
super().__init__(config)
self.model = MusicgenMelodyModel(config)
self.num_codebooks = config.num_codebooks
self.lm_heads = nn.ModuleList(
[nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(config.num_codebooks)]
)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_heads
def set_output_embeddings(self, new_embeddings):
self.lm_heads = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@add_start_docstrings_to_model_forward(MUSICGEN_MELODY_DECODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MusicgenMelodyOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
) -> Union[Tuple, MusicgenMelodyOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
Returns:
Tuple or MusicgenMelodyOutputWithPast: Depending on `return_dict`, returns either a tuple or an instance
of `MusicgenMelodyOutputWithPast`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
lm_logits = torch.stack([head(hidden_states) for head in self.lm_heads], dim=1)
loss = None
if labels is not None:
raise NotImplementedError("Training is not implemented for MusicgenMelody.")
lm_logits = lm_logits.reshape(-1, *lm_logits.shape[2:])
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return MusicgenMelodyOutputWithPast(
loss=loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
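# A toy sketch of the per-codebook head stacking in `forward` above: K linear heads yield
# (bsz, K, seq_len, vocab) logits, after which the batch and codebook axes are merged.
import torch

bsz, seq_len, hidden, vocab, num_codebooks = 2, 3, 4, 5, 2
hidden_states = torch.randn(bsz, seq_len, hidden)
heads = [torch.nn.Linear(hidden, vocab, bias=False) for _ in range(num_codebooks)]
lm_logits = torch.stack([head(hidden_states) for head in heads], dim=1)  # (2, 2, 3, 5)
print(lm_logits.reshape(-1, *lm_logits.shape[2:]).shape)  # torch.Size([4, 3, 5])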
def prepare_inputs_for_generation(
self,
input_ids,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
past_key_values=None,
use_cache=True,
delay_pattern_mask=None,
guidance_scale=None,
**kwargs,
):
"""
Prepare inputs for the generation process, tailored for Music generation tasks.
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The input token IDs.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding tokens.
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Hidden states from the encoder.
encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid attending to encoder padding tokens.
head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask for the attention heads.
past_key_values (tuple of `torch.Tensor` of shape `(batch_size, num_heads, past_sequence_length, embed_size_per_head)`):
Cached key and value states for fast decoding.
use_cache (bool, *optional*):
Whether to use the caching mechanism for fast decoding.
delay_pattern_mask (`torch.Tensor`, *optional*):
Mask encoding the per-codebook delay pattern applied to the decoder input ids during generation.
guidance_scale (float, *optional*):
Scaling factor for guidance during generation.
Returns:
dict: Dictionary containing prepared inputs for the generation process.
"""
if delay_pattern_mask is None:
input_ids, delay_pattern_mask = self.build_delay_pattern_mask(
input_ids,
pad_token_id=self.generation_config.pad_token_id,
max_length=self.generation_config.max_length,
)
input_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
if guidance_scale is not None and guidance_scale > 1:
input_ids = input_ids.repeat((2, 1))
if attention_mask is not None:
attention_mask = attention_mask.repeat((2, 1))
if encoder_hidden_states is not None:
encoder_hidden_states = torch.concatenate(
[encoder_hidden_states, torch.zeros_like(encoder_hidden_states)], dim=0
)
if encoder_attention_mask is not None:
encoder_attention_mask = torch.concatenate(
[encoder_attention_mask, torch.zeros_like(encoder_attention_mask)], dim=0
)
if past_key_values is not None:
input_ids = input_ids[:, -1:]
encoder_hidden_states = None
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"encoder_hidden_states": encoder_hidden_states,
"encoder_attention_mask": encoder_attention_mask,
"head_mask": head_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def apply_delay_pattern_mask(input_ids, decoder_pad_token_mask):
"""Apply a delay pattern mask to the decoder input ids, only preserving predictions where
the mask is set to -1, and otherwise setting to the value detailed in the mask."""
seq_len = input_ids.shape[-1]
decoder_pad_token_mask = decoder_pad_token_mask[..., :seq_len]
input_ids = torch.where(decoder_pad_token_mask == -1, input_ids, decoder_pad_token_mask)
return input_ids
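# A toy check of the mask semantics above: -1 entries keep the model's token, any other
# value overwrites it (typically with the pad token id). A usage sketch:
import torch

input_ids = torch.tensor([[10, 11, 12]])
mask = torch.tensor([[-1, 2048, -1]])
print(MusicgenMelodyForCausalLM.apply_delay_pattern_mask(input_ids, mask))
# tensor([[  10, 2048,   12]])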
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
synced_gpus: Optional[bool] = None,
streamer: Optional["BaseStreamer"] = None,
**kwargs,
):
pass  # body elided in this walkthrough
@add_start_docstrings(
"The composite Musicgen Melody model with a text and audio conditional models, a MusicgenMelody decoder and an audio encoder, "
"for music generation tasks with one or both of text and audio prompts.",
MUSICGEN_MELODY_START_DOCSTRING,
"""
text_encoder (`Optional[PreTrainedModel]`, *optional*): Text encoder.
audio_encoder (`Optional[PreTrainedModel]`, *optional*): Audio code decoder.
decoder (`Optional[MusicgenMelodyForCausalLM]`, *optional*): MusicGen Melody decoder used to generate audio codes.
"""
)
class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
config_class = MusicgenMelodyConfig
main_input_name = "input_ids"
supports_gradient_checkpointing = True
def __init__(
self,
config: MusicgenMelodyConfig = None,
text_encoder: Optional[PreTrainedModel] = None,
audio_encoder: Optional[PreTrainedModel] = None,
decoder: Optional[MusicgenMelodyForCausalLM] = None,
):
if config is None and None in (text_encoder, audio_encoder, decoder):
raise ValueError(
"Either a configuration has to be provided, or all three of text encoder, audio encoder and Musicgen Melody decoder."
)
if config is None:
config = MusicgenMelodyConfig.from_sub_models_config(
text_encoder.config, audio_encoder.config, decoder.config
)
else:
if not isinstance(config, self.config_class):
raise ValueError(f"Config: {config} has to be of type {self.config_class}")
super().__init__(config)
if text_encoder is None:
text_encoder = AutoModelForTextEncoding.from_config(config.text_encoder)
if audio_encoder is None:
audio_encoder = AutoModel.from_config(config.audio_encoder)
if decoder is None:
decoder = MusicgenMelodyForCausalLM(config.decoder)
self.text_encoder = text_encoder
self.audio_encoder = audio_encoder
self.decoder = decoder
self.text_encoder.config = self.config.text_encoder
self.audio_encoder.config = self.config.audio_encoder
self.decoder.config = self.config.decoder
if self.text_encoder.get_output_embeddings() is not None:
raise ValueError(
f"The encoder {self.text_encoder} should not have a LM Head. Please use a model without and LM Head"
)
if self.text_encoder.config.hidden_size != self.decoder.config.hidden_size:
self.enc_to_dec_proj = nn.Linear(self.text_encoder.config.hidden_size, self.decoder.config.hidden_size)
if self.config.num_chroma != self.decoder.config.hidden_size:
self.audio_enc_to_dec_proj = nn.Linear(self.config.num_chroma, self.decoder.config.hidden_size)
self.post_init()
def _init_weights(self, module):
std = self.decoder.config.initializer_factor
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
def tie_weights(self):
if self.config.tie_encoder_decoder:
decoder_base_model_prefix = self.decoder.base_model_prefix
self._tie_encoder_decoder_weights(
self.text_encoder, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix
)
def get_text_encoder(self):
return self.text_encoder
def get_encoder(self):
return self.get_text_encoder()
def get_decoder(self):
return self.decoder
def get_input_embeddings(self):
return self.text_encoder.get_input_embeddings()
def get_output_embeddings(self):
return self.decoder.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
return self.decoder.set_output_embeddings(new_embeddings)
@classmethod
def from_sub_models_pretrained(
cls,
text_encoder_pretrained_model_name_or_path: str = None,
audio_encoder_pretrained_model_name_or_path: str = None,
decoder_pretrained_model_name_or_path: str = None,
*model_args,
**kwargs,
):
pass  # body elided in this walkthrough
@add_start_docstrings_to_model_forward(MUSICGEN_MELODY_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MusicgenMelodyOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.BoolTensor] = None,
input_features: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
pass  # body elided in this walkthrough
def prepare_inputs_for_generation(
self,
decoder_input_ids,
encoder_hidden_states=None,
past_key_values=None,
attention_mask=None,
decoder_attention_mask=None,
decoder_head_mask=None,
use_cache=None,
decoder_delay_pattern_mask=None,
guidance_scale=None,
**kwargs,
):
if decoder_delay_pattern_mask is None:
decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
decoder_input_ids,
self.generation_config.pad_token_id,
max_length=self.generation_config.max_length,
)
decoder_input_ids = self.decoder.apply_delay_pattern_mask(decoder_input_ids, decoder_delay_pattern_mask)
if guidance_scale is not None and guidance_scale > 1:
decoder_input_ids = decoder_input_ids.repeat((2, 1))
if decoder_attention_mask is not None:
decoder_attention_mask = decoder_attention_mask.repeat((2, 1))
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
encoder_hidden_states = None
return {
"input_ids": None,
"encoder_hidden_states": encoder_hidden_states,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"decoder_head_mask": decoder_head_mask,
"use_cache": use_cache,
}
def _prepare_decoder_input_ids_for_generation(
self,
batch_size: int,
model_input_name: str,
model_kwargs: Dict[str, torch.Tensor],
decoder_start_token_id: int = None,
bos_token_id: int = None,
device: torch.device = None,
) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
"""为使用编码器-解码器模型生成准备 `decoder_input_ids`"""
if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
decoder_input_ids = model_kwargs.pop("decoder_input_ids")
elif "input_ids" in model_kwargs and model_input_name != "input_ids":
decoder_input_ids = model_kwargs.pop("input_ids")
else:
decoder_input_ids = None
decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
if device is None:
device = self.device
decoder_input_ids_start = (
torch.ones((batch_size * self.decoder.num_codebooks, 1), dtype=torch.long, device=device)
* decoder_start_token_id
)
if decoder_input_ids is None:
decoder_input_ids = decoder_input_ids_start
elif (decoder_input_ids[..., 0] != decoder_start_token_id).all().item():
decoder_input_ids = torch.cat([decoder_input_ids_start, decoder_input_ids], dim=-1)
if "decoder_attention_mask" in model_kwargs:
decoder_attention_mask = model_kwargs["decoder_attention_mask"]
decoder_attention_mask = torch.cat(
(torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
dim=-1,
)
model_kwargs["decoder_attention_mask"] = decoder_attention_mask
return decoder_input_ids, model_kwargs
def _prepare_encoder_hidden_states_kwargs_for_generation(
self,
inputs_tensor: torch.Tensor,
model_kwargs,
model_input_name: Optional[str] = None,
guidance_scale: Optional[float] = None,
):
"""为生成准备编码器隐藏状态的参数"""
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
"""根据标签准备解码器的输入ids"""
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def resize_token_embeddings(self, *args, **kwargs):
"""调整标记嵌入大小的方法,通过 EncoderDecoderModel 直接不支持。请使用包装对象的相应方法(model.encoder.resize_token_embeddings(...) 或 model.decoder.resize_token_embeddings(...))"""
raise NotImplementedError(
"Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the"
" respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
" model.decoder.resize_token_embeddings(...))"
)
def _maybe_initialize_input_ids_for_generation(
self,
inputs: Optional[torch.Tensor] = None,
bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
) -> torch.LongTensor:
"""Initializes input ids for generation, if necessary."""
if inputs is not None:
return inputs
if bos_token_id is None:
raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
batch_size = 1
for value in model_kwargs.values():
if isinstance(value, torch.Tensor):
batch_size = value.shape[0]
break
return torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * bos_token_id
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
synced_gpus: Optional[bool] = None,
streamer: Optional["BaseStreamer"] = None,
**kwargs,
):
"""Generates sequences using the model."""
def _update_model_kwargs_for_generation(
self,
outputs: ModelOutput,
model_kwargs: Dict[str, Any],
is_encoder_decoder: bool = False,
standardize_cache_format: bool = False,
model_inputs: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""Updates model keyword arguments for generation."""
model_kwargs["past_key_values"] = self._extract_past_from_model_output(
outputs, standardize_cache_format=standardize_cache_format
)
if getattr(outputs, "state", None) is not None:
model_kwargs["state"] = outputs.state
if "token_type_ids" in model_kwargs:
token_type_ids = model_kwargs["token_type_ids"]
model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
if "decoder_attention_mask" in model_kwargs:
decoder_attention_mask = model_kwargs["decoder_attention_mask"]
model_kwargs["decoder_attention_mask"] = torch.cat(
[decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))],
dim=-1,
)
return model_kwargs
.\models\musicgen_melody\processing_musicgen_melody.py
"""
Text/audio processor class for MusicGen Melody
"""
from typing import List, Optional
import numpy as np
from ...processing_utils import ProcessorMixin
from ...utils import to_numpy
class MusicgenMelodyProcessor(ProcessorMixin):
r"""
Constructs a MusicGen Melody processor which wraps a MusicGen Melody feature extractor - for raw audio waveform processing - and a T5 tokenizer into a single processor
class.
[`MusicgenMelodyProcessor`] offers all the functionalities of [`MusicgenMelodyFeatureExtractor`] and [`T5Tokenizer`]. See
[`~MusicgenMelodyProcessor.__call__`] and [`~MusicgenMelodyProcessor.decode`] for more information.
Args:
feature_extractor (`MusicgenMelodyFeatureExtractor`):
An instance of [`MusicgenMelodyFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`T5Tokenizer`):
An instance of [`T5Tokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "MusicgenMelodyFeatureExtractor"
tokenizer_class = ("T5Tokenizer", "T5TokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
"""
Retrieves decoder prompt IDs from the tokenizer.
Args:
task (str, optional): Task identifier. Defaults to None.
language (str, optional): Language identifier. Defaults to None.
no_timestamps (bool, optional): Flag indicating whether to exclude timestamps. Defaults to True.
Returns:
List[int]: List of decoder prompt IDs based on the provided task, language, and timestamps preferences.
"""
return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
def __call__(self, audio=None, text=None, **kwargs):
"""
Main method to prepare one or several sequence(s) and audio(s) for the model. If `audio` is not `None`, the `audio` and `kwargs` arguments are forwarded to MusicgenMelodyFeatureExtractor's [`~MusicgenMelodyFeatureExtractor.__call__`] to pre-process the audio. If `text` is not `None`, the `text` and `kwargs` arguments are forwarded to PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`]. Please refer to the docstrings of those two methods for more information.
Args:
    audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
        The audio or batch of audios to be prepared. Each audio can be a NumPy array or a PyTorch tensor. In the NumPy array/PyTorch tensor case, each audio should be a mono or stereo signal of shape (T,), where T is the sample length of the audio.
    text (`str`, `List[str]`, `List[List[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as a list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    kwargs (*optional*):
        Remaining dictionary of keyword arguments that will be passed to the feature extractor and/or the tokenizer.
Returns:
    [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
    - **input_ids** -- List of token ids to be fed to the model. Returned when `text` is not `None`.
    - **input_features** -- Audio input features to be fed to the model. Returned when `audio` is not `None`.
    - **attention_mask** -- List of token indices specifying which tokens the model should attend to when `text` is not `None`. When only `audio` is specified, returns the timestamps attention mask.
"""
sampling_rate = kwargs.pop("sampling_rate", None)
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if text is not None:
inputs = self.tokenizer(text, **kwargs)
if audio is not None:
audio_inputs = self.feature_extractor(audio, sampling_rate=sampling_rate, **kwargs)
if text is None:
return audio_inputs
elif audio is None:
return inputs
else:
inputs["input_features"] = audio_inputs["input_features"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
Decodes either batches of audio outputs from the MusicGen Melody model, or batches of token ids from the
tokenizer, in which case this method forwards all its arguments to T5Tokenizer's [`~PreTrainedTokenizer.batch_decode`].
"""
audio_values = kwargs.pop("audio", None)
attention_mask = kwargs.pop("attention_mask", None)
if len(args) > 0:
audio_values = args[0]
args = args[1:]
if audio_values is not None:
return self._decode_audio(audio_values, attention_mask=attention_mask)
else:
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to T5Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def _decode_audio(self, audio_values, attention_mask: Optional[np.ndarray] = None) -> List[np.ndarray]:
"""
This method strips any padding from the audio values to return a list of numpy audio arrays.
"""
audio_values = to_numpy(audio_values)
bsz, channels, seq_len = audio_values.shape
if attention_mask is None:
return list(audio_values)
attention_mask = to_numpy(attention_mask)
difference = seq_len - attention_mask.shape[-1]
padding_value = 1 - self.feature_extractor.padding_value
attention_mask = np.pad(attention_mask, ((0, 0), (0, difference)), "constant", constant_values=padding_value)
audio_values = audio_values.tolist()
for i in range(bsz):
sliced_audio = np.asarray(audio_values[i])[
attention_mask[i][None, :] != self.feature_extractor.padding_value
]
audio_values[i] = sliced_audio.reshape(channels, -1)
return audio_values
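A toy check of the stripping logic above, assuming a feature extractor whose `padding_value` is 0 (an assumption for illustration): the mask is extended with the *non*-padding value so that newly generated samples are kept, while positions that were padding in the prompt are dropped.
```
import numpy as np

audio = np.arange(6, dtype=np.float32).reshape(1, 1, 6)  # (bsz, channels, seq_len)
mask = np.array([[1, 1, 1, 0]])                          # last prompt position was padding
padding_value = 0                                        # assumed feature_extractor.padding_value
# extend the mask to seq_len with the non-padding value (1 - padding_value)
mask = np.pad(mask, ((0, 0), (0, 6 - mask.shape[-1])), "constant", constant_values=1 - padding_value)
kept = audio[0][mask[0][None, :] != padding_value].reshape(1, -1)
print(kept)  # [[0. 1. 2. 4. 5.]] -- the sample at the padded position (index 3) is removed
```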
def get_unconditional_inputs(self, num_samples=1, return_tensors="pt"):
"""
Helper function to get null inputs for unconditional generation, enabling the model to be used without the
feature extractor or tokenizer.
Args:
num_samples (int, *optional*):
Number of audio samples to unconditionally generate.
Example:
```
>>> from transformers import MusicgenMelodyForConditionalGeneration, MusicgenMelodyProcessor
>>> model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody")
>>> # get the unconditional (or 'null') inputs for the model
>>> processor = MusicgenMelodyProcessor.from_pretrained("facebook/musicgen-melody")
>>> unconditional_inputs = processor.get_unconditional_inputs(num_samples=1)
>>> audio_samples = model.generate(**unconditional_inputs, max_new_tokens=256)
```"""
inputs = self.tokenizer([""] * num_samples, return_tensors=return_tensors, return_attention_mask=True)
inputs["attention_mask"][:] = 0
return inputs
.\models\musicgen_melody\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_torchaudio_available,
)
_import_structure = {
"configuration_musicgen_melody": [
"MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP",
"MusicgenMelodyConfig",
"MusicgenMelodyDecoderConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_musicgen_melody"] = [
"MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST",
"MusicgenMelodyForConditionalGeneration",
"MusicgenMelodyForCausalLM",
"MusicgenMelodyModel",
"MusicgenMelodyPreTrainedModel",
]
try:
if not is_torchaudio_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_musicgen_melody"] = ["MusicgenMelodyFeatureExtractor"]
_import_structure["processing_musicgen_melody"] = ["MusicgenMelodyProcessor"]
if TYPE_CHECKING:
from .configuration_musicgen_melody import (
MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP,
MusicgenMelodyConfig,
MusicgenMelodyDecoderConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_musicgen_melody import (
MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST,
MusicgenMelodyForCausalLM,
MusicgenMelodyForConditionalGeneration,
MusicgenMelodyModel,
MusicgenMelodyPreTrainedModel,
)
try:
if not is_torchaudio_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_musicgen_melody import MusicgenMelodyFeatureExtractor
from .processing_musicgen_melody import MusicgenMelodyProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mvp\configuration_mvp.py
""" MVP model configuration"""
import warnings
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MVP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"RUCAIBox/mvp": "https://huggingface.co/RUCAIBox/mvp/resolve/main/config.json",
}
class MvpConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MvpModel`]. It is used to instantiate an MVP model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the MVP [RUCAIBox/mvp](https://huggingface.co/RUCAIBox/mvp)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import MvpConfig, MvpModel
>>> # Initializing a MVP RUCAIBox/mvp style configuration
>>> configuration = MvpConfig()
>>> # Initializing a model (with random weights) from the RUCAIBox/mvp style configuration
>>> model = MvpModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "mvp"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=50267,
max_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=12,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
activation_function="gelu",
d_model=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
classifier_dropout=0.0,
scale_embedding=False,
use_cache=True,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
is_encoder_decoder=True,
decoder_start_token_id=2,
forced_eos_token_id=2,
use_prompt=False,
prompt_length=100,
prompt_mid_dim=800,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
self.use_prompt = use_prompt
self.prompt_length = prompt_length
self.prompt_mid_dim = prompt_mid_dim
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
self.forced_bos_token_id = self.bos_token_id
warnings.warn(
f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
"The config can simply be saved and uploaded again to be fixed."
)
.\models\mvp\modeling_mvp.py
""" PyTorch MVP model."""
import copy
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqQuestionAnsweringModelOutput,
Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_mvp import MvpConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "RUCAIBox/mvp"
_CONFIG_FOR_DOC = "MvpConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
MVP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"RUCAIBox/mvp",
"RUCAIBox/mvp-data-to-text",
"RUCAIBox/mvp-open-dialog",
"RUCAIBox/mvp-question-answering",
"RUCAIBox/mvp-question-generation",
"RUCAIBox/mvp-story",
"RUCAIBox/mvp-summarization",
"RUCAIBox/mvp-task-dialog",
"RUCAIBox/mtl-data-to-text",
"RUCAIBox/mtl-multi-task",
"RUCAIBox/mtl-open-dialog",
"RUCAIBox/mtl-question-answering",
"RUCAIBox/mtl-question-generation",
"RUCAIBox/mtl-story",
"RUCAIBox/mtl-summarization",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
Args:
input_ids (torch.Tensor): Tensor of input ids.
pad_token_id (int): The id of the padding token in the model's configuration.
decoder_start_token_id (int): The id of the decoder's start token.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
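A quick worked example of the shift: labels `[A, B, C]` become decoder inputs `[<s>, A, B]`, and any `-100` ignore-index that moves into the inputs is rewritten to the pad id.
```
import torch

labels = torch.tensor([[10, -100, 11]])
shifted = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
print(shifted)  # tensor([[ 2, 10,  1]]) -- decoder start prepended, -100 replaced by pad
```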
class MvpLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
"""`input_ids' shape is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids.shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
).expand(bsz, -1)
return super().forward(positions + self.offset)
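The `offset = 2` follows BART-style learned positions: two extra rows are reserved at the front of the table, so position 0 actually reads row 2. A small shape check:
```
import torch

emb = MvpLearnedPositionalEmbedding(num_embeddings=8, embedding_dim=4)
print(emb.weight.shape)  # torch.Size([10, 4]) -- 8 positions + 2 reserved offset rows
out = emb(torch.zeros(1, 3, dtype=torch.long))  # only the (bsz, seq_len) shape of the ids is used
print(out.shape)  # torch.Size([1, 3, 4]) -- embeddings for positions 0..2 (rows 2..4)
```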
class MvpAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
attn_prompt: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
pass
class MvpEncoderLayer(nn.Module):
def __init__(self, config: MvpConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = MvpAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: torch.FloatTensor,
layer_head_mask: torch.FloatTensor,
self_attn_prompt: torch.FloatTensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
`(2, encoder_attention_heads, pro_len, head_dim)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
attn_prompt=self_attn_prompt,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
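The final clamp is a common fp16 safeguard: activations that have saturated to the float16 ceiling are pulled just inside the representable range so subsequent layers do not overflow to `inf`. A toy illustration:
```
import torch

x = torch.tensor([65504.0, -65504.0, 1.0], dtype=torch.float16)  # 65504 is the float16 max
clamp_value = torch.finfo(torch.float16).max - 1000
clamped = torch.clamp(x, min=-clamp_value, max=clamp_value)
print(clamped.abs().max() < 65504)  # tensor(True) -- safely inside the fp16 range
```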
class MvpDecoderLayer(nn.Module):
def __init__(self, config: MvpConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = MvpAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = MvpAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
self_attn_prompt: Optional[torch.Tensor] = None,
cross_attn_prompt: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class MvpClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(
self,
input_dim: int,
inner_dim: int,
num_classes: int,
pooler_dropout: float,
):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class MvpPrompt(nn.Module):
"""Layer-wise prompt for encoder or decoder."""
def __init__(self, config, num_layers, num_heads):
super().__init__()
self.prompt_length = config.prompt_length
self.num_layers = num_layers
self.num_heads = num_heads
self.head_dim = config.d_model // num_heads
self.dropout = nn.Dropout(p=config.dropout)
self.prompt_embedding = nn.Embedding(config.prompt_length, config.d_model)
self.prompt_trans = nn.Sequential(
nn.Linear(config.d_model, config.prompt_mid_dim),
nn.GELU(),
nn.Linear(config.prompt_mid_dim, num_layers * 2 * config.d_model),
)
def forward(self, prompt_ids: torch.Tensor) -> Tuple[torch.Tensor]:
prompt = self.prompt_trans(self.prompt_embedding(prompt_ids))
prompt = prompt.view(self.prompt_length, self.num_layers * 2, self.num_heads, self.head_dim)
prompt = self.dropout(prompt)
prompt = prompt.permute([1, 2, 0, 3]).split(2)
return prompt
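A shape walk-through of the prompt module with hypothetical toy config values: the forward returns one `(key, value)` stack of shape `(2, num_heads, prompt_length, head_dim)` per layer.
```
import torch

class ToyConfig:  # hypothetical stand-in exposing only the fields MvpPrompt reads
    prompt_length, d_model, prompt_mid_dim, dropout = 4, 8, 16, 0.0

prompt = MvpPrompt(ToyConfig(), num_layers=3, num_heads=2)
out = prompt(torch.arange(4))  # prompt ids 0..prompt_length-1
print(len(out), out[0].shape)  # 3 torch.Size([2, 2, 4, 4]) -- (kv, heads, prompt_len, head_dim)
```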
class MvpPreTrainedModel(PreTrainedModel):
config_class = MvpConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
@property
def dummy_inputs(self):
pad_token = self.config.pad_token_id
input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
dummy_inputs = {
"attention_mask": input_ids.ne(pad_token),
"input_ids": input_ids,
}
return dummy_inputs
MVP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MvpConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MVP_INPUTS_DOCSTRING = r"""
Placeholder for inputs documentation.
"""
MVP_CONDITIONAL_GENERATION_EXAMPLE = r"""
Example of summarization:
Fine-tuning a model
```
>>> import torch
>>> from transformers import AutoTokenizer, MvpForConditionalGeneration
>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")
>>> inputs = tokenizer(
... "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
... return_tensors="pt",
... )
>>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]
>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```
Inference after the model fine-tuned
```
>>> with torch.no_grad():
... generated_ids = model.generate(**inputs)
>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
```
"""
MVP_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example of single-label classification:
Fine-tuning a model on `num_labels` classes
```
>>> import torch
>>> from transformers import AutoTokenizer, MvpForSequenceClassification
>>> num_labels = 2
>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)
>>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor(1)
>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```
Inference after the model fine-tuned
```
>>> with torch.no_grad():
...     logits = model(**inputs).logits
>>> predicted_class_id = logits.argmax()
```
"""
MVP_QUESTION_ANSWERING_SAMPLE = r"""
Example:
Fine-tuning a model for extractive question answering; the model also supports generative question answering
using [`MvpForConditionalGeneration`]
```
>>> import torch
>>> from transformers import AutoTokenizer, MvpForQuestionAnswering
>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")
>>> inputs = tokenizer(
... "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
... return_tensors="pt",
... )
>>> target_start_index = torch.tensor([18])
>>> target_end_index = torch.tensor([19])
>>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
>>> loss.backward()
```
Inference after the model fine-tuned
```
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> answer_start_index = outputs.start_logits.argmax()
>>> answer_end_index = outputs.end_logits.argmax()
>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
>>> predict_answer = tokenizer.decode(predict_answer_tokens)
```
"""
class MvpEncoder(MvpPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`MvpEncoderLayer`].
Args:
config: MvpConfig
embed_tokens (nn.Embedding): output embedding
use_prompt (bool): whether to use prompt
"""
def __init__(
self, config: MvpConfig, embed_tokens: Optional[nn.Embedding] = None, use_prompt: Optional[bool] = False
):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
self.embed_positions = MvpLearnedPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
)
self.layers = nn.ModuleList([MvpEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.use_prompt = use_prompt
if use_prompt:
self.prompt_length = config.prompt_length
self.self_attn_prompt = MvpPrompt(
config,
config.encoder_layers,
config.encoder_attention_heads,
)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
class MvpDecoder(MvpPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`].
Args:
    config: MvpConfig
    embed_tokens (nn.Embedding): output embedding
    use_prompt (bool): whether to use prompt
"""
def __init__(
self, config: MvpConfig, embed_tokens: Optional[nn.Embedding] = None, use_prompt: Optional[bool] = False
):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = MvpLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
self.layers = nn.ModuleList([MvpDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.use_prompt = use_prompt
if use_prompt:
self.prompt_length = config.prompt_length
self.self_attn_prompt = MvpPrompt(
config,
config.decoder_layers,
config.decoder_attention_heads,
)
self.cross_attn_prompt = MvpPrompt(
config,
config.decoder_layers,
config.decoder_attention_heads,
)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
@add_start_docstrings(
"The bare MVP Model outputting raw hidden-states without any specific head on top.",
MVP_START_DOCSTRING,
)
class MvpModel(MvpPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: MvpConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.use_prompt = config.use_prompt
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = MvpEncoder(config, self.shared, config.use_prompt)
self.decoder = MvpDecoder(config, self.shared, config.use_prompt)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def set_lightweight_tuning(self):
assert self.use_prompt, "If you want to use lightweight tuning, make sure that `use_prompt=True`."
self.requires_grad_(False)
self.encoder.self_attn_prompt.requires_grad_(True)
self.decoder.self_attn_prompt.requires_grad_(True)
self.decoder.cross_attn_prompt.requires_grad_(True)
@add_start_docstrings_to_model_forward(MVP_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqModelOutput]:
...
@add_start_docstrings(
"The MVP Model with a language modeling head. Can be used for various text generation tasks.", MVP_START_DOCSTRING
)
class MvpForConditionalGeneration(MvpPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: MvpConfig):
super().__init__(config)
self.model = MvpModel(config)
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
self._resize_final_logits_bias(new_num_tokens)
return new_embeddings
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
self.register_buffer("final_logits_bias", new_bias)
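The bias buffer tracks the vocabulary size: shrinking slices it, growing pads with zeros so that new tokens start with no logit offset. A minimal sketch of both branches:
```
import torch

bias = torch.tensor([[0.5, -0.5, 0.1]])       # pretend final_logits_bias for a vocab of 3
print(bias[:, :2])                            # shrink 3 -> 2: tensor([[ 0.5000, -0.5000]])
extra = torch.zeros((1, 5 - bias.shape[-1]))  # grow 3 -> 5: pad with zeros
print(torch.cat([bias, extra], dim=1))        # tensor([[ 0.5000, -0.5000,  0.1000,  0.0000,  0.0000]])
```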
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_lightweight_tuning(self):
self.model.set_lightweight_tuning()
self.lm_head.requires_grad_(False)
@add_start_docstrings_to_model_forward(MVP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(MVP_CONDITIONAL_GENERATION_EXAMPLE)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
A [`Seq2SeqLMOutput`] if `return_dict=True`, otherwise a plain tuple with the language modeling outputs.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if use_cache:
logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
use_cache = False
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
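A toy check of the beam-search reordering: with `beam_idx = [1, 0]`, each cached key/value tensor has its batch dimension permuted so that beam 0 now carries what beam 1 had cached.
```
import torch

layer_past = (torch.tensor([[0.0], [1.0]]), torch.tensor([[10.0], [11.0]]))  # (key, value) for 2 beams
beam_idx = torch.tensor([1, 0])
reordered = tuple(p.index_select(0, beam_idx.to(p.device)) for p in layer_past)
print(reordered[0])  # tensor([[1.], [0.]]) -- beams swapped along the batch dimension
```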
@add_start_docstrings(
"""
Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
""",
MVP_START_DOCSTRING,
)
class MvpForSequenceClassification(MvpPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: MvpConfig, **kwargs):
super().__init__(config, **kwargs)
self.model = MvpModel(config)
self.classification_head = MvpClassificationHead(
config.d_model,
config.d_model,
config.num_labels,
config.classifier_dropout,
)
self.post_init()
def set_lightweight_tuning(self):
self.model.set_lightweight_tuning()
self.classification_head.requires_grad_(False)
@add_start_docstrings_to_model_forward(MVP_INPUTS_DOCSTRING)
@add_end_docstrings(MVP_SEQUENCE_CLASSIFICATION_SAMPLE)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
...
@add_start_docstrings(
"""
MVP Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer
on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MVP_START_DOCSTRING,
)
class MvpForQuestionAnswering(MvpPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.model = MvpModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
def set_lightweight_tuning(self):
self.model.set_lightweight_tuning()
self.qa_outputs.requires_grad_(False)
@add_start_docstrings_to_model_forward(MVP_INPUTS_DOCSTRING)
@add_end_docstrings(MVP_QUESTION_ANSWERING_SAMPLE)
def forward(
self,
input_ids: torch.Tensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
class MvpDecoderWrapper(MvpPreTrainedModel):
"""
This wrapper class is a helper class that allows pretrained checkpoints to be loaded correctly when the causal language model is used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = MvpDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
class MvpForCausalLM(MvpPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = MvpDecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
def set_lightweight_tuning(self):
self.model.set_lightweight_tuning()
self.lm_head.requires_grad_(False)
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
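A small numeric check of the prefix trimming above: with 3 positions already cached and 4 ids accumulated, only the single new token is fed on this step.
```
import torch

input_ids = torch.tensor([[5, 6, 7, 8]])
past_length = 3
remove_prefix_length = past_length if input_ids.shape[1] > past_length else input_ids.shape[1] - 1
print(input_ids[:, remove_prefix_length:])  # tensor([[8]]) -- only the newest token is processed
```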
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past