Transformers Source Code Analysis (104)
.\models\siglip\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_siglip": [
"SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
"SiglipConfig",
"SiglipTextConfig",
"SiglipVisionConfig",
],
"processing_siglip": ["SiglipProcessor"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_siglip"] = ["SiglipTokenizer"]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_siglip"] = ["SiglipImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_siglip"] = [
"SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
"SiglipModel",
"SiglipPreTrainedModel",
"SiglipTextModel",
"SiglipVisionModel",
"SiglipForImageClassification",
]
if TYPE_CHECKING:
from .configuration_siglip import (
SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
SiglipConfig,
SiglipTextConfig,
SiglipVisionConfig,
)
from .processing_siglip import SiglipProcessor
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_siglip import SiglipTokenizer
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_siglip import SiglipImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_siglip import (
SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
SiglipForImageClassification,
SiglipModel,
SiglipPreTrainedModel,
SiglipTextModel,
SiglipVisionModel,
)
else:
import sys
Imports the sys module, which provides access to the Python interpreter's runtime environment and variables.
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
Registers the current module as a lazily loaded module: the module object identified by __name__ is replaced with a custom _LazyModule instance, which receives the module name, its file path (via globals()["__file__"]), the import structure (_import_structure), and the module spec (module_spec=__spec__) as arguments.
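To see what the lazy registration buys in practice, here is a minimal sketch (assuming a transformers installation that ships SigLIP): importing the package only loads the `_LazyModule` stub, and heavy submodules such as `configuration_siglip` are imported the first time one of their attributes is accessed.

```python
# Minimal sketch of the lazy-loading behavior from the user's side.
# Assumes transformers with SigLIP support is installed.
import transformers.models.siglip as siglip_pkg

# Only the _LazyModule stub has been loaded so far; configuration_siglip,
# modeling_siglip, etc. are not yet imported.
SiglipConfig = siglip_pkg.SiglipConfig  # attribute access triggers the real submodule import

config = SiglipConfig()         # default SigLIP configuration
print(type(config).__name__)    # "SiglipConfig"
```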
.\models\speecht5\configuration_speecht5.py
import functools
import operator
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SPEECHT5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/speecht5_asr": "https://huggingface.co/microsoft/speecht5_asr/resolve/main/config.json",
"microsoft/speecht5_tts": "https://huggingface.co/microsoft/speecht5_tts/resolve/main/config.json",
"microsoft/speecht5_vc": "https://huggingface.co/microsoft/speecht5_vc/resolve/main/config.json",
}
SPEECHT5_PRETRAINED_HIFIGAN_CONFIG_ARCHIVE_MAP = {
"microsoft/speecht5_hifigan": "https://huggingface.co/microsoft/speecht5_hifigan/resolve/main/config.json",
}
class SpeechT5Config(PretrainedConfig):
model_type = "speecht5"
attribute_map = {"num_attention_heads": "encoder_attention_heads", "num_hidden_layers": "encoder_layers"}
def __init__(
self,
vocab_size=81,
hidden_size=768,
encoder_layers=12,
encoder_attention_heads=12,
encoder_ffn_dim=3072,
encoder_layerdrop=0.1,
decoder_layers=6,
decoder_ffn_dim=3072,
decoder_attention_heads=12,
decoder_layerdrop=0.1,
hidden_act="gelu",
positional_dropout=0.1,
hidden_dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
scale_embedding=False,
feat_extract_norm="group",
feat_proj_dropout=0.0,
feat_extract_activation="gelu",
conv_dim=(512, 512, 512, 512, 512, 512, 512),
conv_stride=(5, 2, 2, 2, 2, 2, 2),
conv_kernel=(10, 3, 3, 3, 3, 2, 2),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
decoder_start_token_id=2,
num_mel_bins=80,
speech_decoder_prenet_layers=2,
speech_decoder_prenet_units=256,
speech_decoder_prenet_dropout=0.5,
speaker_embedding_dim=512,
speech_decoder_postnet_layers=5,
speech_decoder_postnet_units=256,
speech_decoder_postnet_kernel=5,
speech_decoder_postnet_dropout=0.5,
reduction_factor=2,
max_speech_positions=4000,
max_text_positions=450,
encoder_max_relative_position=160,
use_guided_attention_loss=True,
guided_attention_loss_num_heads=2,
guided_attention_loss_sigma=0.4,
guided_attention_loss_scale=10.0,
use_cache=True,
is_encoder_decoder=True,
**kwargs,
):
@property
def inputs_to_logits_ratio(self):
return functools.reduce(operator.mul, self.conv_stride, 1)
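`inputs_to_logits_ratio` is simply the product of the feature-encoder strides. With the default `conv_stride=(5, 2, 2, 2, 2, 2, 2)` shown above, that product is 320, i.e. one encoder frame per 320 waveform samples (20 ms at 16 kHz). A stand-alone check:

```python
import functools
import operator

conv_stride = (5, 2, 2, 2, 2, 2, 2)  # default strides from SpeechT5Config
ratio = functools.reduce(operator.mul, conv_stride, 1)
print(ratio)                 # 320
print(ratio / 16000 * 1000)  # 20.0 ms of audio per encoder frame at 16 kHz
```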
class SpeechT5HifiGanConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SpeechT5HifiGanModel`]. It is used to instantiate
a SpeechT5 HiFi-GAN vocoder model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the SpeechT5
[microsoft/speecht5_hifigan](https://huggingface.co/microsoft/speecht5_hifigan) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_in_dim (`int`, *optional*, defaults to 80):
The number of frequency bins in the input log-mel spectrogram.
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the output audio will be generated, expressed in hertz (Hz).
upsample_initial_channel (`int`, *optional*, defaults to 512):
The number of input channels into the upsampling network.
upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[4, 4, 4, 4]`):
A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
length of *upsample_rates* defines the number of convolutional layers and has to match the length of
*upsample_kernel_sizes*.
upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 8, 8]`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
*upsample_rates*.
resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
fusion (MRF) module.
resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
multi-receptive field fusion (MRF) module.
initializer_range (`float`, *optional*, defaults to 0.01):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
leaky_relu_slope (`float`, *optional*, defaults to 0.1):
The angle of the negative slope used by the leaky ReLU activation.
normalize_before (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.
Example: see the instantiation sketch after this class definition.
"""
# The model type is "hifigan"
model_type = "hifigan"
# Initialize the configuration with the vocoder hyperparameters
def __init__(
self,
model_in_dim=80,  # input dimension (number of mel bins), defaults to 80
sampling_rate=16000,  # sampling rate, defaults to 16000
upsample_initial_channel=512,  # initial channel count of the upsampling network, defaults to 512
upsample_rates=[4, 4, 4, 4],  # upsampling strides, default [4, 4, 4, 4]
upsample_kernel_sizes=[8, 8, 8, 8],  # upsampling kernel sizes, default [8, 8, 8, 8]
resblock_kernel_sizes=[3, 7, 11],  # ResBlock kernel sizes, default [3, 7, 11]
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],  # ResBlock dilation rates, default [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
initializer_range=0.01,  # initializer standard deviation, defaults to 0.01
leaky_relu_slope=0.1,  # negative slope of the leaky ReLU, defaults to 0.1
normalize_before=True,  # whether to normalize the spectrogram before vocoding, defaults to True
**kwargs,  # remaining keyword arguments
):
self.model_in_dim = model_in_dim  # store the model input dimension
self.sampling_rate = sampling_rate  # store the sampling rate
self.upsample_initial_channel = upsample_initial_channel  # store the initial upsampling channel count
self.upsample_rates = upsample_rates  # store the upsampling strides
self.upsample_kernel_sizes = upsample_kernel_sizes  # store the upsampling kernel sizes
self.resblock_kernel_sizes = resblock_kernel_sizes  # store the ResBlock kernel sizes
self.resblock_dilation_sizes = resblock_dilation_sizes  # store the ResBlock dilation rates
self.initializer_range = initializer_range  # store the initializer range
self.leaky_relu_slope = leaky_relu_slope  # store the leaky ReLU slope
self.normalize_before = normalize_before  # store the pre-vocoding normalization flag
super().__init__(**kwargs)  # call the parent initializer with the remaining keyword arguments
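As a quick illustration of the defaults documented above, the configuration can be instantiated and overridden like any other `PretrainedConfig`; this is a minimal sketch assuming transformers is installed (the override values are arbitrary):

```python
from transformers import SpeechT5HifiGanConfig

config = SpeechT5HifiGanConfig()                  # defaults matching microsoft/speecht5_hifigan
print(config.model_in_dim, config.sampling_rate)  # 80 16000

# Example override: a hypothetical vocoder with 128 mel bins and a different upsampling schedule.
custom = SpeechT5HifiGanConfig(model_in_dim=128, upsample_rates=[8, 8, 2, 2], upsample_kernel_sizes=[16, 16, 4, 4])
print(custom.upsample_rates)                      # [8, 8, 2, 2]
```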
.\models\speecht5\convert_hifigan.py
import argparse
import numpy as np
import torch
from transformers import SpeechT5HifiGan, SpeechT5HifiGanConfig, logging
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.speecht5")
def load_weights(checkpoint, hf_model, config):
hf_model.apply_weight_norm()
hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"]
hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"]
hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"]
for i in range(len(config.upsample_rates)):
hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"]
hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"]
hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"]
for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)):
for j in range(len(config.resblock_dilation_sizes)):
hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"]
hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"]
hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"]
hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"]
hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"]
hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"]
hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"]
hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"]
hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"]
hf_model.remove_weight_norm()
@torch.no_grad()
def convert_hifigan_checkpoint(
checkpoint_path,
stats_path,
pytorch_dump_folder_path,
config_path=None,
repo_id=None,
):
if config_path is not None:
config = SpeechT5HifiGanConfig.from_pretrained(config_path)
else:
config = SpeechT5HifiGanConfig()
model = SpeechT5HifiGan(config)
orig_checkpoint = torch.load(checkpoint_path)
load_weights(orig_checkpoint["model"]["generator"], model, config)
stats = np.load(stats_path)
mean = stats[0].reshape(-1)
scale = stats[1].reshape(-1)
model.mean = torch.from_numpy(mean).float()
model.scale = torch.from_numpy(scale).float()
model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--stats_path", required=True, default=None, type=str, help="Path to stats.npy file")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_hifigan_checkpoint(
args.checkpoint_path,
args.stats_path,
args.pytorch_dump_folder_path,
args.config_path,
args.push_to_hub
)
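The script is meant to be run from the command line, but the entry point can also be called directly. The sketch below is illustrative only: the paths are placeholders, and the module name `convert_hifigan` assumes the script is importable from the working directory.

```python
# Hypothetical direct invocation of the conversion entry point; the paths are placeholders.
from convert_hifigan import convert_hifigan_checkpoint

convert_hifigan_checkpoint(
    checkpoint_path="original_hifigan/checkpoint.pt",  # fairseq checkpoint containing checkpoint["model"]["generator"]
    stats_path="original_hifigan/stats.npy",           # row 0 holds the mean, row 1 the scale of the log-mel features
    pytorch_dump_folder_path="./speecht5_hifigan_hf",  # where save_pretrained() writes the converted model
    config_path=None,                                  # None -> use the default SpeechT5HifiGanConfig
    repo_id=None,                                      # set to "user/repo" to push the result to the Hub
)
```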
.\models\speecht5\convert_speecht5_original_pytorch_checkpoint_to_pytorch.py
import argparse
import torch
from transformers import (
SpeechT5Config,
SpeechT5FeatureExtractor,
SpeechT5ForSpeechToSpeech,
SpeechT5ForSpeechToText,
SpeechT5ForTextToSpeech,
SpeechT5Processor,
SpeechT5Tokenizer,
logging,
)
from transformers.tokenization_utils import AddedToken
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.speecht5")
MAPPING_SPEECH_ENCODER_PRENET = {
"speech_encoder_prenet.layer_norm": "speecht5.encoder.prenet.feature_projection.layer_norm",
"speech_encoder_prenet.post_extract_proj": "speecht5.encoder.prenet.feature_projection.projection",
"speech_encoder_prenet.pos_conv.0": "speecht5.encoder.prenet.pos_conv_embed.conv",
"speech_encoder_prenet.mask_emb": "speecht5.encoder.prenet.masked_spec_embed",
}
MAPPING_TEXT_ENCODER_PRENET = {
"text_encoder_prenet.encoder_prenet.0": "speecht5.encoder.prenet.embed_tokens",
"text_encoder_prenet.encoder_prenet.1.alpha": "speecht5.encoder.prenet.encode_positions.alpha",
}
MAPPING_SPEECH_DECODER_PRENET = {
"speech_decoder_prenet.decoder_prenet.0.0.prenet.0.0": "speecht5.decoder.prenet.layers.0",
"speech_decoder_prenet.decoder_prenet.0.0.prenet.1.0": "speecht5.decoder.prenet.layers.1",
"speech_decoder_prenet.decoder_prenet.0.1": "speecht5.decoder.prenet.final_layer",
"speech_decoder_prenet.decoder_prenet.1.alpha": "speecht5.decoder.prenet.encode_positions.alpha",
"speech_decoder_prenet.spkembs_layer.0": "speecht5.decoder.prenet.speaker_embeds_layer",
}
MAPPING_SPEECH_DECODER_POSTNET = {
"speech_decoder_postnet.feat_out": "speech_decoder_postnet.feat_out",
"speech_decoder_postnet.prob_out": "speech_decoder_postnet.prob_out",
"speech_decoder_postnet.postnet.postnet.0.0": "speech_decoder_postnet.layers.0.conv",
"speech_decoder_postnet.postnet.postnet.0.1": "speech_decoder_postnet.layers.0.batch_norm",
"speech_decoder_postnet.postnet.postnet.1.0": "speech_decoder_postnet.layers.1.conv",
"speech_decoder_postnet.postnet.postnet.1.1": "speech_decoder_postnet.layers.1.batch_norm",
"speech_decoder_postnet.postnet.postnet.2.0": "speech_decoder_postnet.layers.2.conv",
"speech_decoder_postnet.postnet.postnet.2.1": "speech_decoder_postnet.layers.2.batch_norm",
"speech_decoder_postnet.postnet.postnet.3.0": "speech_decoder_postnet.layers.3.conv",
"speech_decoder_postnet.postnet.postnet.3.1": "speech_decoder_postnet.layers.3.batch_norm",
"speech_decoder_postnet.postnet.postnet.4.0": "speech_decoder_postnet.layers.4.conv",
"speech_decoder_postnet.postnet.postnet.4.1": "speech_decoder_postnet.layers.4.batch_norm",
}
MAPPING_TEXT_DECODER_PRENET = {
"text_decoder_prenet.embed_tokens": "speecht5.decoder.prenet.embed_tokens",
}
MAPPING_TEXT_DECODER_POSTNET = {
"text_decoder_postnet.output_projection": "text_decoder_postnet.lm_head",
}
MAPPING_ENCODER = {
"encoder.layers.*.self_attn.k_proj": "speecht5.encoder.wrapped_encoder.layers.*.attention.k_proj",
"encoder.layers.*.self_attn.v_proj": "speecht5.encoder.wrapped_encoder.layers.*.attention.v_proj",
"encoder.layers.*.self_attn.q_proj": "speecht5.encoder.wrapped_encoder.layers.*.attention.q_proj",
"encoder.layers.*.self_attn.out_proj": "speecht5.encoder.wrapped_encoder.layers.*.attention.out_proj",
"encoder.layers.*.self_attn_layer_norm": "speecht5.encoder.wrapped_encoder.layers.*.layer_norm",
"encoder.layers.*.fc1": "speecht5.encoder.wrapped_encoder.layers.*.feed_forward.intermediate_dense",
"encoder.layers.*.fc2": "speecht5.encoder.wrapped_encoder.layers.*.feed_forward.output_dense",
"encoder.layers.*.final_layer_norm": "speecht5.encoder.wrapped_encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "speecht5.encoder.wrapped_encoder.layer_norm",
"encoder.pos_emb.pe_k": "speecht5.encoder.wrapped_encoder.embed_positions.pe_k",
}
MAPPING_DECODER = {
"decoder.layers.*.self_attn.k_proj": "speecht5.decoder.wrapped_decoder.layers.*.self_attn.k_proj",
"decoder.layers.*.self_attn.v_proj": "speecht5.decoder.wrapped_decoder.layers.*.self_attn.v_proj",
"decoder.layers.*.self_attn.q_proj": "speecht5.decoder.wrapped_decoder.layers.*.self_attn.q_proj",
"decoder.layers.*.self_attn.out_proj": "speecht5.decoder.wrapped_decoder.layers.*.self_attn.out_proj",
"decoder.layers.*.self_attn_layer_norm": "speecht5.decoder.wrapped_decoder.layers.*.self_attn_layer_norm",
"decoder.layers.*.encoder_attn.k_proj": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn.k_proj",
"decoder.layers.*.encoder_attn.v_proj": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn.v_proj",
"decoder.layers.*.encoder_attn.q_proj": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn.q_proj",
"decoder.layers.*.encoder_attn.out_proj": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn.out_proj",
"decoder.layers.*.encoder_attn_layer_norm": "speecht5.decoder.wrapped_decoder.layers.*.encoder_attn_layer_norm",
"decoder.layers.*.fc1": "speecht5.decoder.wrapped_decoder.layers.*.feed_forward.intermediate_dense",
"decoder.layers.*.fc2": "speecht5.decoder.wrapped_decoder.layers.*.feed_forward.output_dense",
"decoder.layers.*.final_layer_norm": "speecht5.decoder.wrapped_decoder.layers.*.final_layer_norm",
}
MAPPING_S2T = {
**MAPPING_SPEECH_ENCODER_PRENET,
**MAPPING_ENCODER,
**MAPPING_DECODER,
**MAPPING_TEXT_DECODER_PRENET,
**MAPPING_TEXT_DECODER_POSTNET,
}
MAPPING_T2S = {
**MAPPING_TEXT_ENCODER_PRENET,
**MAPPING_ENCODER,
**MAPPING_DECODER,
**MAPPING_SPEECH_DECODER_PRENET,
**MAPPING_SPEECH_DECODER_POSTNET,
}
MAPPING_S2S = {
**MAPPING_SPEECH_ENCODER_PRENET,
**MAPPING_ENCODER,
**MAPPING_DECODER,
**MAPPING_SPEECH_DECODER_PRENET,
**MAPPING_SPEECH_DECODER_POSTNET,
}
TOP_LEVEL_KEYS = []
IGNORE_KEYS = [
"encoder.version",
"encoder.layers.*.norm_k.weight",
"encoder.layers.*.norm_k.bias",
"decoder.version",
"decoder.layers.*.norm_k.weight",
"decoder.layers.*.norm_k.bias",
"decoder.pos_emb.pe_k",
"speech_encoder_prenet.embed_positions._float_tensor",
"text_decoder_prenet.embed_positions._float_tensor",
]
IGNORE_KEYS_S2T = IGNORE_KEYS + [
"encoder.proj",
"text_encoder_prenet.*",
"speech_decoder_prenet.*",
"speech_decoder_postnet.*",
]
IGNORE_KEYS_T2S = IGNORE_KEYS + [
"encoder.proj",
"speech_encoder_prenet.*",
"text_decoder_prenet.*",
"text_decoder_postnet.*",
]
IGNORE_KEYS_S2S = IGNORE_KEYS + [
"encoder.proj",
"text_encoder_prenet.*",
"text_decoder_prenet.*",
"text_decoder_postnet.*",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
if hf_shape != value.shape:
raise ValueError(
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
elif weight_type == "running_mean":
hf_pointer.running_mean.data = value
elif weight_type == "running_var":
hf_pointer.running_var.data = value
elif weight_type == "num_batches_tracked":
hf_pointer.num_batches_tracked.data = value
else:
hf_pointer.data = value
logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.")
def should_ignore(name, ignore_keys):
for key in ignore_keys:
if key.endswith(".*"):
if name.startswith(key[:-1]):
return True
elif ".*." in key:
prefix, suffix = key.split(".*.")
if prefix in name and suffix in name:
return True
elif key in name:
return True
return False
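The helper supports three kinds of patterns: a plain substring, a trailing `.*` prefix match, and a `.*.` infix match. A small check using the function defined above (the parameter names are illustrative):

```python
ignore = ["encoder.version", "decoder.layers.*.norm_k.weight", "speech_decoder_prenet.*"]

print(should_ignore("encoder.version", ignore))                        # True  (plain substring)
print(should_ignore("decoder.layers.3.norm_k.weight", ignore))         # True  (".*." infix match)
print(should_ignore("speech_decoder_prenet.layers.0.weight", ignore))  # True  (trailing ".*" prefix match)
print(should_ignore("encoder.layers.0.fc1.weight", ignore))            # False (no pattern matches)
```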
def recursively_load_weights(fairseq_dict, hf_model, task):
unused_weights = []
if task == "s2t":
feature_encoder = hf_model.speecht5.encoder.prenet.feature_encoder
MAPPING = MAPPING_S2T
IGNORE_KEYS = IGNORE_KEYS_S2T
elif task == "t2s":
feature_encoder = None
MAPPING = MAPPING_T2S
IGNORE_KEYS = IGNORE_KEYS_T2S
elif task == "s2s":
feature_encoder = hf_model.speecht5.encoder.prenet.feature_encoder
MAPPING = MAPPING_S2S
IGNORE_KEYS = IGNORE_KEYS_S2S
else:
raise ValueError(f"Unsupported task: {task}")
for name, value in fairseq_dict.items():
if should_ignore(name, IGNORE_KEYS):
logger.info(f"{name} was ignored")
continue
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_encoder,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
if "*" in key:
prefix, suffix = key.split(".*.")
if prefix in name and suffix in name:
key = suffix
if key in name:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
weight_type = "weight"
elif "running_mean" in name:
weight_type = "running_mean"
elif "running_var" in name:
weight_type = "running_var"
elif "num_batches_tracked" in name:
weight_type = "num_batches_tracked"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
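For the wildcard mappings, the layer index is recovered purely by string surgery on the fairseq name: everything before the matched suffix ends with `layers.<idx>.`, so splitting on dots and taking the second-to-last piece yields the index. A worked example (the tensor name is illustrative):

```python
name = "encoder.layers.3.self_attn.k_proj.weight"                         # illustrative fairseq parameter name
key = "self_attn.k_proj"                                                  # suffix kept after splitting the wildcard key
mapped_key = "speecht5.encoder.wrapped_encoder.layers.*.attention.k_proj"

layer_index = name.split(key)[0].split(".")[-2]  # "encoder.layers.3." -> ["encoder", "layers", "3", ""] -> "3"
print(layer_index)                               # 3
print(mapped_key.replace("*", layer_index))      # speecht5.encoder.wrapped_encoder.layers.3.attention.k_proj
```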
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape:
raise ValueError(
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape:
raise ValueError(
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape:
raise ValueError(
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape:
raise ValueError(
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
@torch.no_grad()
def convert_speecht5_checkpoint(
task,
checkpoint_path,
pytorch_dump_folder_path,
config_path=None,
vocab_path=None,
repo_id=None,
):
"""
Copy/paste/tweak the model's weights into the transformers design.
"""
if config_path is not None:
config = SpeechT5Config.from_pretrained(config_path)
else:
config = SpeechT5Config()
if task == "s2t":
config.max_length = config.max_text_positions
model = SpeechT5ForSpeechToText(config)
elif task == "t2s":
config.max_speech_positions = 1876
config.max_text_positions = 600
config.max_length = config.max_speech_positions
model = SpeechT5ForTextToSpeech(config)
elif task == "s2s":
config.max_speech_positions = 1876
config.max_length = config.max_speech_positions
model = SpeechT5ForSpeechToSpeech(config)
else:
raise ValueError(f"Unknown task name: {task}")
if vocab_path:
tokenizer = SpeechT5Tokenizer(vocab_path, model_max_length=config.max_text_positions)
mask_token = AddedToken("<mask>", lstrip=True, rstrip=False)
tokenizer.mask_token = mask_token
tokenizer.add_special_tokens({"mask_token": mask_token})
tokenizer.add_tokens(["<ctc_blank>"])
feature_extractor = SpeechT5FeatureExtractor()
processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor.save_pretrained(pytorch_dump_folder_path)
fairseq_checkpoint = torch.load(checkpoint_path)
recursively_load_weights(fairseq_checkpoint["model"], model, task)
model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
processor.push_to_hub(repo_id)
model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--task",
default="s2t",
type=str,
help="Type of the SpeechT5 model you'd like to convert. Should be one of 's2t', 't2s', 's2s'.",
)
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--vocab_path", default=None, type=str, help="Path to SentencePiece model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_speecht5_checkpoint(
args.task,
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.config_path,
args.vocab_path,
args.push_to_hub,
)
.\models\speecht5\feature_extraction_speecht5.py
"""SpeechT5 的特征提取器类。"""
import warnings
from typing import Any, Dict, List, Optional, Union
import numpy as np
from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging
logger = logging.get_logger(__name__)
class SpeechT5FeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a SpeechT5 feature extractor.
This class can pre-process a raw speech signal by (optionally) zero-mean unit-variance normalizing it for use by the SpeechT5 speech encoder prenet.
It can also extract log-mel filter-bank features from raw speech for use by the SpeechT5 speech decoder prenet.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`], which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
"""
"""
Args:
feature_size (`int`, *optional*, defaults to 1):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values.
do_normalize (`bool`, *optional*, defaults to `False`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
improve the performance for some models.
num_mel_bins (`int`, *optional*, defaults to 80):
The number of mel-frequency bins in the extracted spectrogram features.
hop_length (`int`, *optional*, defaults to 16):
Number of ms between windows. Otherwise referred to as "shift" in many papers.
win_length (`int`, *optional*, defaults to 64):
Number of ms per window.
win_function (`str`, *optional*, defaults to `"hann_window"`):
Name for the window function used for windowing, must be accessible via `torch.{win_function}`
frame_signal_scale (`float`, *optional*, defaults to 1.0):
Constant multiplied in creating the frames before applying DFT. This argument is deprecated.
fmin (`float`, *optional*, defaults to 80):
Minimum mel frequency in Hz.
fmax (`float`, *optional*, defaults to 7600):
Maximum mel frequency in Hz.
mel_floor (`float`, *optional*, defaults to 1e-10):
Minimum value of mel frequency banks.
reduction_factor (`int`, *optional*, defaults to 2):
Spectrogram length reduction factor. This argument is deprecated.
return_attention_mask (`bool`, *optional*, defaults to `True`):
Whether or not [`~SpeechT5FeatureExtractor.__call__`] should return `attention_mask`.
"""
model_input_names = ["input_values", "attention_mask"]
def __init__(
self,
feature_size: int = 1,
sampling_rate: int = 16000,
padding_value: float = 0.0,
do_normalize: bool = False,
num_mel_bins: int = 80,
hop_length: int = 16,
win_length: int = 64,
win_function: str = "hann_window",
frame_signal_scale: float = 1.0,
fmin: float = 80,
fmax: float = 7600,
mel_floor: float = 1e-10,
reduction_factor: int = 2,
return_attention_mask: bool = True,
**kwargs,
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.do_normalize = do_normalize
self.return_attention_mask = return_attention_mask
self.num_mel_bins = num_mel_bins
self.hop_length = hop_length
self.win_length = win_length
self.win_function = win_function
self.frame_signal_scale = frame_signal_scale
self.fmin = fmin
self.fmax = fmax
self.mel_floor = mel_floor
self.reduction_factor = reduction_factor
self.sample_size = win_length * sampling_rate // 1000
self.sample_stride = hop_length * sampling_rate // 1000
self.n_fft = optimal_fft_length(self.sample_size)
self.n_freqs = (self.n_fft // 2) + 1
self.window = window_function(window_length=self.sample_size, name=self.win_function, periodic=True)
self.mel_filters = mel_filter_bank(
num_frequency_bins=self.n_freqs,
num_mel_filters=self.num_mel_bins,
min_frequency=self.fmin,
max_frequency=self.fmax,
sampling_rate=self.sampling_rate,
norm="slaney",
mel_scale="slaney",
)
if frame_signal_scale != 1.0:
warnings.warn(
"The argument `frame_signal_scale` is deprecated and will be removed in version 4.30.0 of Transformers",
FutureWarning,
)
if reduction_factor != 2.0:
warnings.warn(
"The argument `reduction_factor` is deprecated and will be removed in version 4.30.0 of Transformers",
FutureWarning,
)
@staticmethod
def zero_mean_unit_var_norm(
input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
) -> List[np.ndarray]:
"""
Every array in the list is normalized to have zero mean and unit variance
"""
if attention_mask is not None:
attention_mask = np.array(attention_mask, np.int32)
normed_input_values = []
for vector, length in zip(input_values, attention_mask.sum(-1)):
normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
if length < normed_slice.shape[0]:
normed_slice[length:] = padding_value
normed_input_values.append(normed_slice)
else:
normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
return normed_input_values
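A tiny numeric check of the masked normalization: the mean and variance are computed only over the unpadded prefix, and the padded tail is reset to `padding_value`. The values below are arbitrary.

```python
import numpy as np

vector = np.array([1.0, 2.0, 3.0, 0.0, 0.0])  # last two entries are padding
length = 3                                    # taken from attention_mask.sum(-1)

normed = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
normed[length:] = 0.0                         # padding_value
print(normed.round(3))                        # [-1.225  0.     1.225  0.     0.   ]
```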
def _extract_mel_features(
self,
one_waveform: np.ndarray,
) -> np.ndarray:
"""
Extracts log-mel filterbank features for one waveform array (unbatched).
"""
log_mel_spec = spectrogram(
one_waveform,
window=self.window,
frame_length=self.sample_size,
hop_length=self.sample_stride,
fft_length=self.n_fft,
mel_filters=self.mel_filters,
mel_floor=self.mel_floor,
log_mel="log10",
)
return log_mel_spec.T
def __call__(
self,
audio: Optional[Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]] = None,
audio_target: Optional[Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]] = None,
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
**kwargs,
):
"""
Process audio input according to specified parameters.
"""
def _process_audio(
self,
speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
is_target: bool = False,
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
"""
Internal method to process audio data.
"""
def to_dict(self) -> Dict[str, Any]:
"""
Convert the object's properties to a dictionary representation.
"""
output = super().to_dict()
names = ["window", "mel_filters", "sample_size", "sample_stride", "n_fft", "n_freqs"]
for name in names:
if name in output:
del output[name]
return output
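A minimal usage sketch of the feature extractor on one second of dummy audio (assuming transformers and torch are installed): with `audio=...` the extractor returns the (optionally normalized) raw waveform as `input_values` for the speech encoder prenet, whereas `audio_target=...` would instead go through `_extract_mel_features`.

```python
import numpy as np
from transformers import SpeechT5FeatureExtractor

extractor = SpeechT5FeatureExtractor()                 # defaults: 16 kHz, 80 mel bins, 16 ms hop, 64 ms window
waveform = np.random.randn(16000).astype(np.float32)   # 1 s of dummy audio, not a real recording

inputs = extractor(audio=waveform, sampling_rate=16000, return_tensors="pt")
print(inputs["input_values"].shape)                    # torch.Size([1, 16000]): raw waveform for the encoder prenet
```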
.\models\speecht5\modeling_speecht5.py
""" PyTorch SpeechT5 model."""
import math
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqSpectrogramOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "SpeechT5Config"
SPEECHT5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/speecht5_asr",
"microsoft/speecht5_tts",
"microsoft/speecht5_vc",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
def shift_spectrograms_right(input_values: torch.Tensor, reduction_factor: int = 1):
"""
Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
"""
if reduction_factor > 1:
input_values = input_values[:, reduction_factor - 1 :: reduction_factor]
shifted_input_values = input_values.new_zeros(input_values.shape)
shifted_input_values[:, 1:] = input_values[:, :-1].clone()
shifted_input_values.masked_fill_(shifted_input_values == -100.0, 0.0)
return shifted_input_values
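Two small worked examples of the shifting helpers defined above. `shift_tokens_right` builds decoder input ids from labels by prepending the decoder start token, and `shift_spectrograms_right` additionally keeps only every `reduction_factor`-th frame. The values are arbitrary.

```python
import torch

# Text labels -> decoder inputs: prepend decoder_start_token_id, replace any -100 with the pad token.
labels = torch.tensor([[5, 6, 7, -100]])
print(shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2))
# tensor([[2, 5, 6, 7]])

# Spectrogram targets -> decoder inputs: keep frames 1, 3, ... (reduction_factor=2), then shift right.
frames = torch.arange(1.0, 9.0).view(1, 4, 2)  # 4 frames with 2 mel bins each
print(shift_spectrograms_right(frames, reduction_factor=2))
# tensor([[[0., 0.],
#          [3., 4.]]])
```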
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
batch_size, sequence_length = shape
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
num_masked_span = max(num_masked_span, min_masks)
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths:
num_masked_span = compute_num_masked_span(input_length)
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
if len(spec_aug_mask_idx) == 0:
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
return spec_aug_mask
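An illustrative call to the mask computation defined above (it is stochastic, so the exact positions vary between runs; the seed is only there to make the sketch repeatable):

```python
import numpy as np

np.random.seed(0)  # just to make this sketch repeatable
mask = _compute_mask_indices(shape=(2, 20), mask_prob=0.3, mask_length=4, min_masks=1)
print(mask.shape)         # (2, 20) boolean array, True where time steps are masked
print(mask.sum(axis=-1))  # number of masked time steps per batch row
```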
class SpeechT5NoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
class SpeechT5LayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = hidden_states.transpose(-2, -1)
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states.transpose(-2, -1)
hidden_states = self.activation(hidden_states)
return hidden_states
class SpeechT5GroupNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
self.activation = ACT2FN[config.feat_extract_activation]
self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.layer_norm(hidden_states)
hidden_states = self.activation(hidden_states)
return hidden_states
class SpeechT5SinusoidalPositionalEmbedding(nn.Module):
"""本模块生成任意长度的正弦余弦位置嵌入。"""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build the sinusoidal embedding weight matrix, following the standard sin/cos positional-encoding formula, to serve as a fixed set of basis vectors.
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len = input_ids.size()
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
max_pos = self.padding_idx + 1 + seq_len
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
def create_position_ids_from_input_ids(
self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx + 1; padding symbols are ignored. This is a modified version of fairseq's `utils.make_positions`.
Args:
x: torch.Tensor input tensor
Returns:
torch.Tensor tensor containing the position numbers
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
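Inlining the body of `create_position_ids_from_input_ids` with `past_key_values_length = 0` and `padding_idx = 1` (the SpeechT5 pad token id) makes the numbering scheme visible: real tokens get consecutive positions starting at `padding_idx + 1`, while padding positions stay at `padding_idx`.

```python
import torch

input_ids = torch.tensor([[0, 5, 8, 1, 1]])  # two trailing pad tokens (pad_token_id = 1)
padding_idx = 1

mask = input_ids.ne(padding_idx).int()                            # tensor([[1, 1, 1, 0, 0]])
positions = torch.cumsum(mask, dim=1).type_as(mask) * mask + padding_idx
print(positions)                                                  # tensor([[2, 3, 4, 1, 1]])
```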
class SpeechT5PositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
padding=config.num_conv_pos_embeddings // 2,
groups=config.num_conv_pos_embedding_groups,
)
weight_norm = nn.utils.weight_norm
if hasattr(nn.utils.parametrizations, "weight_norm"):
weight_norm = nn.utils.parametrizations.weight_norm
if is_deepspeed_zero3_enabled():
import deepspeed
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
self.conv = weight_norm(self.conv, name="weight", dim=2)
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
self.conv = weight_norm(self.conv, name="weight", dim=2)
self.padding = SpeechT5SamePadLayer(config.num_conv_pos_embeddings)
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.conv(hidden_states)
hidden_states = self.padding(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class SpeechT5ScaledPositionalEncoding(nn.Module):
"""
Scaled positional encoding, see §3.2 in https://arxiv.org/abs/1809.08895
"""
def __init__(self, dropout, dim, max_len=5000):
pe = torch.zeros(max_len, dim)
position = torch.arange(0, max_len).unsqueeze(1)
div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / dim)))
pe[:, 0::2] = torch.sin(position.float() * div_term)
pe[:, 1::2] = torch.cos(position.float() * div_term)
pe = pe.unsqueeze(0)
super().__init__()
self.register_buffer("pe", pe, persistent=False)
self.dropout = nn.Dropout(p=dropout)
self.dim = dim
self.alpha = torch.nn.Parameter(torch.tensor(1.0))
def forward(self, emb):
emb = emb + self.alpha * self.pe[:, : emb.size(1)]
emb = self.dropout(emb)
return emb
class SpeechT5RelativePositionalEncoding(torch.nn.Module):
def __init__(self, dim, max_length=1000):
super().__init__()
self.dim = dim
self.max_length = max_length
self.pe_k = torch.nn.Embedding(2 * max_length, dim)
def forward(self, hidden_states):
seq_len = hidden_states.shape[1]
pos_seq = torch.arange(0, seq_len).long().to(hidden_states.device)
pos_seq = pos_seq[:, None] - pos_seq[None, :]
pos_seq[pos_seq < -self.max_length] = -self.max_length
pos_seq[pos_seq >= self.max_length] = self.max_length - 1
pos_seq = pos_seq + self.max_length
return self.pe_k(pos_seq)
class SpeechT5SamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, :-self.num_pad_remove]
return hidden_states
class SpeechT5FeatureEncoder(nn.Module):
"""从原始音频波形构建特征"""
def __init__(self, config):
super().__init__()
if config.feat_extract_norm == "group":
conv_layers = [SpeechT5GroupNormConvLayer(config, layer_id=0)] + [
SpeechT5NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
conv_layers = [
SpeechT5LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
]
else:
raise ValueError(
f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
)
self.conv_layers = nn.ModuleList(conv_layers)
self.gradient_checkpointing = False
self._requires_grad = True
def _freeze_parameters(self):
for param in self.parameters():
param.requires_grad = False
self._requires_grad = False
def forward(self, input_values):
hidden_states = input_values[:, None]
if self._requires_grad and self.training:
hidden_states.requires_grad = True
for conv_layer in self.conv_layers:
if self._requires_grad and self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__,
hidden_states,
)
else:
hidden_states = conv_layer(hidden_states)
return hidden_states
class SpeechT5FeatureProjection(nn.Module):
def __init__(self, config):
super().__init__()
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
self.dropout = nn.Dropout(config.feat_proj_dropout)
def forward(self, hidden_states):
norm_hidden_states = self.layer_norm(hidden_states)
hidden_states = self.projection(norm_hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states, norm_hidden_states
class SpeechT5SpeechEncoderPrenet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.feature_encoder = SpeechT5FeatureEncoder(config)
self.feature_projection = SpeechT5FeatureProjection(config)
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
self.pos_conv_embed = SpeechT5PositionalConvEmbedding(config)
self.pos_sinusoidal_embed = SpeechT5SinusoidalPositionalEmbedding(
config.max_speech_positions + config.pad_token_id + 1,
config.hidden_size,
config.pad_token_id,
)
def freeze_feature_encoder(self):
self.feature_encoder._freeze_parameters()
def forward(
self,
input_values: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
):
extract_features = self.feature_encoder(input_values)
extract_features = extract_features.transpose(1, 2)
if attention_mask is not None:
attention_mask = self._get_feature_vector_attention_mask(
extract_features.shape[1],
attention_mask,
)
hidden_states, extract_features = self.feature_projection(extract_features)
hidden_states = self._mask_hidden_states(
hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
)
positional_conv_embedding = self.pos_conv_embed(hidden_states)
hidden_states = hidden_states + positional_conv_embedding
if attention_mask is not None:
padding_mask = attention_mask.ne(1).long()
else:
padding_mask = torch.zeros(hidden_states.shape[:2], dtype=torch.long, device=hidden_states.device)
positional_sinusoidal_embeddings = self.pos_sinusoidal_embed(padding_mask)
hidden_states = hidden_states + positional_sinusoidal_embeddings
return hidden_states, attention_mask
def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
batch_size = attention_mask.shape[0]
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
return attention_mask
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the convolutional layers
"""
def _conv_out_length(input_length, kernel_size, stride):
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
return input_lengths
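With the default `conv_kernel=(10, 3, 3, 3, 3, 2, 2)` and `conv_stride=(5, 2, 2, 2, 2, 2, 2)`, the formula above maps one second of 16 kHz audio to 49 feature-encoder frames; the arithmetic can be checked with plain integers:

```python
conv_kernel = (10, 3, 3, 3, 3, 2, 2)  # default SpeechT5 feature-encoder kernels
conv_stride = (5, 2, 2, 2, 2, 2, 2)   # default strides

length = 16000                        # one second of 16 kHz audio
for kernel_size, stride in zip(conv_kernel, conv_stride):
    length = (length - kernel_size) // stride + 1
print(length)                         # 49 output frames
```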
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
class SpeechT5SpeechDecoderPrenet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layers = nn.ModuleList(
[
nn.Linear(
config.num_mel_bins if i == 0 else config.speech_decoder_prenet_units,
config.speech_decoder_prenet_units,
)
for i in range(config.speech_decoder_prenet_layers)
]
)
self.final_layer = nn.Linear(config.speech_decoder_prenet_units, config.hidden_size)
self.encode_positions = SpeechT5ScaledPositionalEncoding(
config.positional_dropout,
config.hidden_size,
config.max_speech_positions,
)
self.speaker_embeds_layer = nn.Linear(config.speaker_embedding_dim + config.hidden_size, config.hidden_size)
def _consistent_dropout(self, inputs_embeds, p):
mask = torch.bernoulli(inputs_embeds[0], p=p)
all_masks = mask.unsqueeze(0).repeat(inputs_embeds.size(0), 1, 1)
return torch.where(all_masks == 1, inputs_embeds, 0) * 1 / (1 - p)
def forward(
self,
input_values: torch.Tensor,
speaker_embeddings: Optional[torch.Tensor] = None,
):
inputs_embeds = input_values
for layer in self.layers:
inputs_embeds = nn.functional.relu(layer(inputs_embeds))
inputs_embeds = self._consistent_dropout(inputs_embeds, self.config.speech_decoder_prenet_dropout)
inputs_embeds = self.final_layer(inputs_embeds)
inputs_embeds = self.encode_positions(inputs_embeds)
if speaker_embeddings is not None:
speaker_embeddings = nn.functional.normalize(speaker_embeddings)
speaker_embeddings = speaker_embeddings.unsqueeze(1).expand(-1, inputs_embeds.size(1), -1)
inputs_embeds = torch.cat([inputs_embeds, speaker_embeddings], dim=-1)
inputs_embeds = nn.functional.relu(self.speaker_embeds_layer(inputs_embeds))
return inputs_embeds
class SpeechT5BatchNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
if layer_id == 0:
in_conv_dim = config.num_mel_bins
else:
in_conv_dim = config.speech_decoder_postnet_units
if layer_id == config.speech_decoder_postnet_layers - 1:
out_conv_dim = config.num_mel_bins
else:
out_conv_dim = config.speech_decoder_postnet_units
self.conv = nn.Conv1d(
in_conv_dim,
out_conv_dim,
kernel_size=config.speech_decoder_postnet_kernel,
stride=1,
padding=(config.speech_decoder_postnet_kernel - 1) // 2,
bias=False,
)
self.batch_norm = nn.BatchNorm1d(out_conv_dim)
if layer_id < config.speech_decoder_postnet_layers - 1:
self.activation = nn.Tanh()
else:
self.activation = None
self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.batch_norm(hidden_states)
if self.activation is not None:
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class SpeechT5SpeechDecoderPostnet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
self.prob_out = nn.Linear(config.hidden_size, config.reduction_factor)
self.layers = nn.ModuleList(
[SpeechT5BatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
)
def forward(self, hidden_states: torch.Tensor):
outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
outputs_after_postnet = self.postnet(outputs_before_postnet)
logits = self.prob_out(hidden_states).view(hidden_states.size(0), -1)
return outputs_before_postnet, outputs_after_postnet, logits
def postnet(self, hidden_states: torch.Tensor):
layer_output = hidden_states.transpose(1, 2)
for layer in self.layers:
layer_output = layer(layer_output)
return hidden_states + layer_output.transpose(1, 2)
class SpeechT5TextEncoderPrenet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.encode_positions = SpeechT5ScaledPositionalEncoding(
config.positional_dropout,
config.hidden_size,
config.max_text_positions,
)
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(self, input_ids: torch.Tensor):
inputs_embeds = self.embed_tokens(input_ids)
inputs_embeds = self.encode_positions(inputs_embeds)
return inputs_embeds
class SpeechT5TextDecoderPrenet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.dropout = nn.Dropout(config.positional_dropout)
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.embed_positions = SpeechT5SinusoidalPositionalEmbedding(
config.max_text_positions + config.pad_token_id + 1,
config.hidden_size,
config.pad_token_id,
)
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
):
if input_ids is not None:
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
else:
raise ValueError("You have to specify `decoder_input_ids`")
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
positions = self.embed_positions(input_ids, past_key_values_length)
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
inputs_embeds += positions
inputs_embeds = self.dropout(inputs_embeds)
return inputs_embeds, attention_mask
class SpeechT5TextDecoderPostnet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
def forward(self, hidden_states: torch.Tensor):
return self.lm_head(hidden_states)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
class SpeechT5Attention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
https://aclanthology.org/N18-2074.pdf)
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
position_bias: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
raise NotImplementedError
class SpeechT5FeedForward(nn.Module):
def __init__(self, config, intermediate_size):
super().__init__()
self.intermediate_dropout = nn.Dropout(config.activation_dropout)
self.intermediate_dense = nn.Linear(config.hidden_size, intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.output_dense = nn.Linear(intermediate_size, config.hidden_size)
self.output_dropout = nn.Dropout(config.hidden_dropout)
def forward(self, hidden_states):
hidden_states = self.intermediate_dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.output_dense(hidden_states)
hidden_states = self.output_dropout(hidden_states)
return hidden_states
class SpeechT5EncoderLayer(nn.Module):
def __init__(self, config: SpeechT5Config):
super().__init__()
self.attention = SpeechT5Attention(
embed_dim=config.hidden_size,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
self.dropout = nn.Dropout(config.hidden_dropout)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.feed_forward = SpeechT5FeedForward(config, config.encoder_ffn_dim)
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
position_bias: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
"""
Args:
hidden_states (`torch.FloatTensor`):
Hidden states input to the layer, of shape `(batch, seq_len, hidden_size)`
attention_mask (`torch.FloatTensor`):
Attention mask of shape `(batch, 1, tgt_len, src_len)`, where padding elements are indicated by very large negative values
layer_head_mask (`torch.FloatTensor`):
Mask for the attention heads of a given layer, of shape `(config.encoder_attention_heads,)`
position_bias (`torch.FloatTensor`):
Relative position embeddings of shape `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
output_attentions (`bool`, *optional*):
Whether or not to return the attention tensors of all attention layers. See `attentions` under returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.attention(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
position_bias=position_bias,
output_attentions=output_attentions,
)
hidden_states = self.dropout(hidden_states)
hidden_states = residual + hidden_states
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states + self.feed_forward(hidden_states)
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class SpeechT5DecoderLayer(nn.Module):
def __init__(self, config: SpeechT5Config):
super().__init__()
self.self_attn = SpeechT5Attention(
embed_dim=config.hidden_size,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = nn.Dropout(config.hidden_dropout)
self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.encoder_attn = SpeechT5Attention(
config.hidden_size,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.feed_forward = SpeechT5FeedForward(config, config.decoder_ffn_dim)
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
class SpeechT5PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SpeechT5Config
base_model_prefix = "speecht5"
main_input_name = "input_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, SpeechT5PositionalConvEmbedding):
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, SpeechT5FeatureProjection):
k = math.sqrt(1 / module.projection.in_features)
nn.init.uniform_(module.projection.weight, a=-k, b=k)
nn.init.uniform_(module.projection.bias, a=-k, b=k)
elif isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-k, b=k)
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
class SpeechT5Encoder(SpeechT5PreTrainedModel):
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
"""
Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
hidden features.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.prenet = SpeechT5SpeechEncoderPrenet(config)
self.wrapped_encoder = SpeechT5Encoder(config)
self.post_init()
def forward(
self,
input_values: torch.FloatTensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
hidden_states, attention_mask = self.prenet(input_values, attention_mask)
outputs = self.wrapped_encoder(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
"""
Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.prenet = SpeechT5TextEncoderPrenet(config)
self.wrapped_encoder = SpeechT5Encoder(config)
self.post_init()
def get_input_embeddings(self):
return self.prenet.get_input_embeddings()
def set_input_embeddings(self, value):
self.prenet.set_input_embeddings(value)
def forward(
self,
input_values: torch.FloatTensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
hidden_states = self.prenet(input_values)
outputs = self.wrapped_encoder(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
[`SpeechT5Model`].
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.wrapped_encoder = SpeechT5Encoder(config)
self.post_init()
def forward(
self,
input_values: torch.FloatTensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
return self.wrapped_encoder(
hidden_states=input_values,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
class SpeechT5Decoder(SpeechT5PreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`].
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.layerdrop = config.decoder_layerdrop
self.layers = nn.ModuleList([SpeechT5DecoderLayer(config) for _ in range(config.decoder_layers)])
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
hidden_states: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
"""
Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
features.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.prenet = SpeechT5SpeechDecoderPrenet(config)
self.wrapped_decoder = SpeechT5Decoder(config)
self.post_init()
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
speaker_embeddings: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
decoder_hidden_states = self.prenet(input_values, speaker_embeddings)
outputs = self.wrapped_decoder(
hidden_states=decoder_hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
"""
Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.prenet = SpeechT5TextDecoderPrenet(config)
self.wrapped_decoder = SpeechT5Decoder(config)
self.post_init()
def get_input_embeddings(self):
return self.prenet.get_input_embeddings()
def set_input_embeddings(self, value):
self.prenet.set_input_embeddings(value)
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)
outputs = self.wrapped_decoder(
hidden_states=decoder_hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
[`SpeechT5Model`].
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
self.wrapped_decoder = SpeechT5Decoder(config)
self.post_init()
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
outputs = self.wrapped_decoder(
hidden_states=input_values,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
return outputs
class SpeechT5GuidedMultiheadAttentionLoss(nn.Module):
"""
Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
Networks with Guided Attention](https://arxiv.org/abs/1710.08969), adapted for multi-head attention.
"""
def __init__(self, config: SpeechT5Config):
super().__init__()
self.sigma = config.guided_attention_loss_sigma
self.scale = config.guided_attention_loss_scale
def forward(
self, attentions: torch.FloatTensor, input_masks: torch.BoolTensor, output_masks: torch.BoolTensor
) -> torch.Tensor:
"""
Compute the attention loss.
Args:
attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
Batch of multi-head attention weights
input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
Input attention mask as booleans.
output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
Target attention mask as booleans.
Returns:
`torch.Tensor` with the loss value
"""
guided_attn_masks = self._make_guided_attention_masks(input_masks, output_masks, attentions.device)
masks = output_masks.unsqueeze(-1) & input_masks.unsqueeze(-2)
masks = masks.to(attentions.device).unsqueeze(1)
losses = guided_attn_masks * attentions
loss = torch.mean(losses.masked_select(masks))
return self.scale * loss
def _make_guided_attention_masks(self, input_masks, output_masks, device):
input_lengths = input_masks.sum(-1)
output_lengths = output_masks.sum(-1)
guided_attn_masks = torch.zeros((len(input_masks), output_masks.shape[1], input_masks.shape[1]), device=device)
for idx, (ilen, olen) in enumerate(zip(input_lengths, output_lengths)):
guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma, device)
return guided_attn_masks.unsqueeze(1)
@staticmethod
def _make_guided_attention_mask(input_length, output_length, sigma, device):
grid_y, grid_x = torch.meshgrid(
torch.arange(input_length, device=device),
torch.arange(output_length, device=device),
indexing="xy",
)
grid_x = grid_x.float() / output_length
grid_y = grid_y.float() / input_length
return 1.0 - torch.exp(-((grid_y - grid_x) ** 2) / (2 * (sigma**2)))
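The guided-attention penalty keeps attention mass close to the (length-scaled) diagonal: at output step y and input step x the weight is 1 - exp(-((x/T - y/N)^2) / (2 * sigma^2)), which is exactly 0 on the diagonal when the two lengths match. A tiny numeric sketch with illustrative values (assuming the class above is importable):
import torch
from transformers.models.speecht5.modeling_speecht5 import SpeechT5GuidedMultiheadAttentionLoss
mask = SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_mask(
input_length=3, output_length=3, sigma=0.4, device=torch.device("cpu")
)
print(mask.shape)       # torch.Size([3, 3]): (output_length, input_length)
print(mask.diagonal())  # zeros: no penalty when attention follows the diagonal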
class SpeechT5SpectrogramLoss(nn.Module):
def __init__(self, config: SpeechT5Config):
super().__init__()
self.use_guided_attention_loss = config.use_guided_attention_loss
self.guided_attention_loss_num_heads = config.guided_attention_loss_num_heads
self.reduction_factor = config.reduction_factor
self.l1_criterion = L1Loss()
self.bce_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(5.0))
if self.use_guided_attention_loss:
self.attn_criterion = SpeechT5GuidedMultiheadAttentionLoss(config)
def forward(
self,
attention_mask: torch.LongTensor,
outputs_before_postnet: torch.FloatTensor,
outputs_after_postnet: torch.FloatTensor,
logits: torch.FloatTensor,
labels: torch.FloatTensor,
cross_attentions: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
padding_mask = labels != -100.0
labels = labels.masked_select(padding_mask)
outputs_before_postnet = outputs_before_postnet.masked_select(padding_mask)
outputs_after_postnet = outputs_after_postnet.masked_select(padding_mask)
l1_loss = self.l1_criterion(outputs_after_postnet, labels) + self.l1_criterion(outputs_before_postnet, labels)
masks = padding_mask[:, :, 0]
stop_labels = torch.cat([~masks * 1.0, torch.ones(masks.size(0), 1).to(masks.device)], dim=1)
stop_labels = stop_labels[:, 1:].masked_select(masks)
logits = logits.masked_select(masks)
bce_loss = self.bce_criterion(logits, stop_labels)
loss = l1_loss + bce_loss
if self.use_guided_attention_loss:
attn = torch.cat([x[:, : self.guided_attention_loss_num_heads] for x in cross_attentions], dim=1)
input_masks = attention_mask == 1
output_masks = padding_mask[:, :, 0]
if self.reduction_factor > 1:
output_masks = output_masks[:, self.reduction_factor - 1 :: self.reduction_factor]
attn_loss = self.attn_criterion(attn, input_masks, output_masks)
loss += attn_loss
return loss
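The stop-token targets are built by shifting the valid-frame mask one step: every valid frame gets target 0 except the last valid frame of each sequence, which gets 1 (and the BCE loss up-weights that positive frame via `pos_weight=5.0`). A small hedged illustration with made-up values:
import torch
masks = torch.tensor([[True, True, True, False, False]])          # 3 valid frames out of 5
stop = torch.cat([~masks * 1.0, torch.ones(1, 1)], dim=1)[:, 1:]  # shift the "is padding" flags left by one
print(stop)                        # tensor([[0., 0., 1., 1., 1.]])
print(stop.masked_select(masks))   # tensor([0., 0., 1.]): BCE target per valid frame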
SPEECHT5_BASE_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SpeechT5Config`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`):
The Transformer encoder module that applies the appropriate speech or text encoder prenet. If `None`,
[`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states.
decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`):
The Transformer decoder module that applies the appropriate speech or text decoder prenet. If `None`,
[`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden
states.
"""
SPEECHT5_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SpeechT5Config`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SPEECHT5_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.",
SPEECHT5_BASE_START_DOCSTRING,
)
class SpeechT5Model(SpeechT5PreTrainedModel):
def __init__(
self,
config: SpeechT5Config,
encoder: Optional[nn.Module] = None,
decoder: Optional[nn.Module] = None,
):
super().__init__(config)
self.config = config
self.encoder = SpeechT5EncoderWithoutPrenet(config) if encoder is None else encoder
self.decoder = SpeechT5DecoderWithoutPrenet(config) if decoder is None else decoder
self.post_init()
def get_input_embeddings(self):
if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
return self.encoder.get_input_embeddings()
if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
return self.decoder.get_input_embeddings()
return None
def set_input_embeddings(self, value):
if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
self.encoder.set_input_embeddings(value)
if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
self.decoder.set_input_embeddings(value)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
if isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
self.encoder.prenet.freeze_feature_encoder()
@add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_values: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
speaker_embeddings: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""SpeechT5 Model with a speech encoder and a text decoder.""",
SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel):
_tied_weights_keys = ["text_decoder_postnet.lm_head.weight"]
def __init__(self, config: SpeechT5Config):
super().__init__(config)
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
" vocabulary size of the language model head. Please instantiate the model as follows:"
" `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
" your model's configuration."
)
speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
text_decoder = SpeechT5DecoderWithTextPrenet(config)
self.speecht5 = SpeechT5Model(config, speech_encoder, text_decoder)
self.text_decoder_postnet = SpeechT5TextDecoderPostnet(config)
self.post_init()
def get_encoder(self):
return self.speecht5.get_encoder()
def get_decoder(self):
return self.speecht5.get_decoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.get_encoder().prenet.freeze_feature_encoder()
def get_output_embeddings(self):
return self.text_decoder_postnet.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
self.text_decoder_postnet.set_output_embeddings(new_embeddings)
@add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
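During beam search `_reorder_cache` realigns every cached key/value tensor with the surviving beams by index-selecting along the beam axis. A toy sketch with made-up tensors (assuming the class above is importable):
import torch
from transformers.models.speecht5.modeling_speecht5 import SpeechT5ForSpeechToText
layer_past = (torch.arange(6.0).view(3, 1, 2, 1),)  # one cached tensor: (beams=3, heads=1, seq=2, head_dim=1)
beam_idx = torch.tensor([2, 0, 1])                   # beam 0 continues from old beam 2, and so on
reordered = SpeechT5ForSpeechToText._reorder_cache((layer_past,), beam_idx)
print(reordered[0][0][:, 0, :, 0])  # rows now follow the new beam order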
# Module-level helper implementing the autoregressive spectrogram-generation loop used by `generate_speech`.
def _generate_speech(model, input_values, speaker_embeddings=None, attention_mask=None, threshold=0.5, minlenratio=0.0, maxlenratio=20.0, vocoder=None, output_cross_attentions=False, return_output_lengths=False):
if speaker_embeddings is None:
raise ValueError(
"""`speaker_embeddings` must be specified. For example, you can use speaker embeddings by following
the code snippet provided in this link:
https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
"""
)
if attention_mask is None:
encoder_attention_mask = 1 - (input_values == model.config.pad_token_id).int()
else:
encoder_attention_mask = attention_mask
bsz = input_values.size(0)
encoder_out = model.speecht5.encoder(
input_values=input_values,
attention_mask=encoder_attention_mask,
return_dict=True,
)
encoder_last_hidden_state = encoder_out.last_hidden_state
if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet):
encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask(
encoder_out[0].shape[1], encoder_attention_mask
)
maxlen = int(encoder_last_hidden_state.size(1) * maxlenratio / model.config.reduction_factor)
minlen = int(encoder_last_hidden_state.size(1) * minlenratio / model.config.reduction_factor)
output_sequence = encoder_last_hidden_state.new_zeros(bsz, 1, model.config.num_mel_bins)
spectrogram = []
cross_attentions = []
past_key_values = None
idx = 0
result_spectrogram = {}
while True:
idx += 1
decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings)
decoder_out = model.speecht5.decoder.wrapped_decoder(
hidden_states=decoder_hidden_states[:, -1:],
attention_mask=None,
encoder_hidden_states=encoder_last_hidden_state,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=True,
output_attentions=output_cross_attentions,
return_dict=True,
)
if output_cross_attentions:
cross_attentions.append(torch.cat(decoder_out.cross_attentions, dim=0))
last_decoder_output = decoder_out.last_hidden_state.squeeze(1)
past_key_values = decoder_out.past_key_values
spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output)
spectrum = spectrum.view(bsz, model.config.reduction_factor, model.config.num_mel_bins)
spectrogram.append(spectrum)
new_spectrogram = spectrum[:, -1, :].view(bsz, 1, model.config.num_mel_bins)
output_sequence = torch.cat((output_sequence, new_spectrogram), dim=1)
prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output))
if idx < minlen:
continue
else:
if idx < maxlen:
meet_thresholds = torch.sum(prob, dim=-1) >= threshold
meet_indexes = torch.where(meet_thresholds)[0].tolist()
else:
meet_indexes = range(len(prob))
meet_indexes = [i for i in meet_indexes if i not in result_spectrogram]
if len(meet_indexes) > 0:
spectrograms = torch.stack(spectrogram)
spectrograms = spectrograms.transpose(0, 1).flatten(1, 2)
spectrograms = model.speech_decoder_postnet.postnet(spectrograms)
for meet_index in meet_indexes:
result_spectrogram[meet_index] = spectrograms[meet_index]
if len(result_spectrogram) >= bsz:
break
spectrograms = [result_spectrogram[i] for i in range(len(result_spectrogram))]
if not return_output_lengths:
spectrogram = spectrograms[0] if bsz == 1 else torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
if vocoder is not None:
outputs = vocoder(spectrogram)
else:
outputs = spectrogram
if output_cross_attentions:
cross_attentions = torch.cat(cross_attentions, dim=2)
if bsz > 1:
cross_attentions = cross_attentions.view(
bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
)
outputs = (outputs, cross_attentions)
else:
spectrogram_lengths = []
for i in range(bsz):
spectrogram_lengths.append(spectrograms[i].size(0))
if vocoder is None:
spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
outputs = (spectrograms, spectrogram_lengths)
else:
waveforms = vocoder(spectrograms)
waveform_lengths = [int(waveforms.size(1) / max(spectrogram_lengths)) * i for i in spectrogram_lengths]
outputs = (waveforms, waveform_lengths)
if output_cross_attentions:
cross_attentions = torch.cat(cross_attentions, dim=2)
cross_attentions = cross_attentions.view(
bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
)
outputs = (*outputs, cross_attentions)
return outputs
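For context, here is a hedged end-to-end usage sketch of the public API that wraps this generation loop. The checkpoint names and the 512-dimensional x-vector shape are assumptions based on the released SpeechT5 TTS checkpoints, and the random speaker embedding is only a placeholder (real x-vectors can be taken from a dataset such as cmu-arctic-xvectors):
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
inputs = processor(text="Hello, my dog is cute.", return_tensors="pt")
speaker_embeddings = torch.randn(1, 512)  # placeholder x-vector
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
print(speech.shape)  # 1-D waveform tensor at 16 kHz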
@add_start_docstrings(
"""SpeechT5 Model with a text encoder and a speech decoder.""",
SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
main_input_name = "input_ids"
def __init__(self, config: SpeechT5Config):
super().__init__(config)
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
" vocabulary size of the language model head. Please instantiate the model as follows:"
" `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
" your model's configuration."
)
text_encoder = SpeechT5EncoderWithTextPrenet(config)
speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
self.speecht5 = SpeechT5Model(config, text_encoder, speech_decoder)
self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)
self.post_init()
def get_encoder(self):
return self.speecht5.get_encoder()
def get_decoder(self):
return self.speecht5.get_decoder()
@add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqSpectrogramOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_values: Optional[torch.FloatTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
speaker_embeddings: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
stop_labels: Optional[torch.Tensor] = None,
):
pass
@torch.no_grad()
def generate(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
speaker_embeddings: Optional[torch.FloatTensor] = None,
threshold: float = 0.5,
minlenratio: float = 0.0,
maxlenratio: float = 20.0,
vocoder: Optional[nn.Module] = None,
output_cross_attentions: bool = False,
return_output_lengths: bool = False,
**kwargs,
):
pass
def generate_speech(
self,
input_ids: torch.LongTensor,
speaker_embeddings: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
threshold: float = 0.5,
minlenratio: float = 0.0,
maxlenratio: float = 20.0,
vocoder: Optional[nn.Module] = None,
output_cross_attentions: bool = False,
return_output_lengths: bool = False,
@add_start_docstrings(
"""SpeechT5 Model with a speech encoder and a speech decoder.""",
SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
"""
SpeechT5ForSpeechToSpeech is a specialized model for speech-to-speech tasks, incorporating
both an encoder and a decoder for processing speech data.
"""
def __init__(self, config: SpeechT5Config):
super().__init__(config)
speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
self.speecht5 = SpeechT5Model(config, speech_encoder, speech_decoder)
self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)
self.post_init()
def get_encoder(self):
"""
Returns the speech encoder from the SpeechT5 model.
"""
return self.speecht5.get_encoder()
def get_decoder(self):
"""
Returns the speech decoder from the SpeechT5 model.
"""
return self.speecht5.get_decoder()
def freeze_feature_encoder(self):
"""
Calling this function will freeze the gradient computation for the feature encoder,
preventing its parameters from being updated during training.
"""
self.get_encoder().prenet.freeze_feature_encoder()
@add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqSpectrogramOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_values: Optional[torch.FloatTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
decoder_head_mask: Optional[torch.FloatTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
speaker_embeddings: Optional[torch.FloatTensor] = None,
labels: Optional[torch.FloatTensor] = None,
stop_labels: Optional[torch.Tensor] = None,
):
"""
Forward pass of the SpeechT5 model for speech-to-speech conversion tasks.
"""
@torch.no_grad()
def generate_speech(
self,
input_values: torch.FloatTensor,
speaker_embeddings: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
threshold: float = 0.5,
minlenratio: float = 0.0,
maxlenratio: float = 20.0,
vocoder: Optional[nn.Module] = None,
output_cross_attentions: bool = False,
return_output_lengths: bool = False,
):
"""
Generates speech output based on input values, optionally using speaker embeddings.
"""
This code defines the SpeechT5ForSpeechToSpeech class for speech-to-speech conversion tasks. It inherits from SpeechT5PreTrainedModel and combines a speech encoder with prenet, a speech decoder with prenet, and the speech decoder postnet, exposing a forward pass and a `generate_speech` method.
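A usage sketch for voice conversion, parallel to the text-to-speech example earlier. The checkpoint names are assumptions based on the released SpeechT5 voice-conversion model, `waveform` is assumed to be a pre-loaded 16 kHz mono float array, and the random target-speaker x-vector is only a placeholder:
import torch
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
inputs = processor(audio=waveform, sampling_rate=16000, return_tensors="pt")
speaker_embeddings = torch.randn(1, 512)  # placeholder target-speaker x-vector
converted = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)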
class HifiGanResidualBlock(nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
super().__init__()
self.leaky_relu_slope = leaky_relu_slope
self.convs1 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=dilation[i],
padding=self.get_padding(kernel_size, dilation[i]),
)
for i in range(len(dilation))
]
)
self.convs2 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=self.get_padding(kernel_size, 1),
)
for _ in range(len(dilation))
]
)
def get_padding(self, kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
def apply_weight_norm(self):
for layer in self.convs1:
nn.utils.weight_norm(layer)
for layer in self.convs2:
nn.utils.weight_norm(layer)
def remove_weight_norm(self):
for layer in self.convs1:
nn.utils.remove_weight_norm(layer)
for layer in self.convs2:
nn.utils.remove_weight_norm(layer)
def forward(self, hidden_states):
for conv1, conv2 in zip(self.convs1, self.convs2):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv1(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv2(hidden_states)
hidden_states = hidden_states + residual
return hidden_states
@add_start_docstrings(
"""HiFi-GAN vocoder.""",
HIFIGAN_START_DOCSTRING,
)
class SpeechT5HifiGan(PreTrainedModel):
config_class = SpeechT5HifiGanConfig
main_input_name = "spectrogram"
def __init__(self, config: SpeechT5HifiGanConfig):
super().__init__(config)
self.num_kernels = len(config.resblock_kernel_sizes)
self.num_upsamples = len(config.upsample_rates)
self.conv_pre = nn.Conv1d(
config.model_in_dim,
config.upsample_initial_channel,
kernel_size=7,
stride=1,
padding=3,
)
self.upsampler = nn.ModuleList()
for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
self.upsampler.append(
nn.ConvTranspose1d(
config.upsample_initial_channel // (2**i),
config.upsample_initial_channel // (2 ** (i + 1)),
kernel_size=kernel_size,
stride=upsample_rate,
padding=(kernel_size - upsample_rate) // 2,
)
)
self.resblocks = nn.ModuleList()
for i in range(len(self.upsampler)):
channels = config.upsample_initial_channel // (2 ** (i + 1))
for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
self.register_buffer("mean", torch.zeros(config.model_in_dim))
self.register_buffer("scale", torch.ones(config.model_in_dim))
self.post_init()
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
def apply_weight_norm(self):
nn.utils.weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.weight_norm(layer)
for layer in self.resblocks:
layer.apply_weight_norm()
nn.utils.weight_norm(self.conv_post)
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.remove_weight_norm(layer)
for layer in self.resblocks:
layer.remove_weight_norm()
nn.utils.remove_weight_norm(self.conv_post)
def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
r"""
Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
waveform.
Args:
spectrogram (`torch.FloatTensor`):
Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.
Returns:
`torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
"""
if self.config.normalize_before:
spectrogram = (spectrogram - self.mean) / self.scale
is_batched = spectrogram.dim() == 3
if not is_batched:
spectrogram = spectrogram.unsqueeze(0)
hidden_states = spectrogram.transpose(2, 1)
hidden_states = self.conv_pre(hidden_states)
for i in range(self.num_upsamples):
hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
hidden_states = self.upsampler[i](hidden_states)
res_state = self.resblocks[i * self.num_kernels](hidden_states)
for j in range(1, self.num_kernels):
res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
hidden_states = res_state / self.num_kernels
hidden_states = nn.functional.leaky_relu(hidden_states)
hidden_states = self.conv_post(hidden_states)
hidden_states = torch.tanh(hidden_states)
if not is_batched:
waveform = hidden_states.squeeze(0).transpose(1, 0).view(-1)
else:
waveform = hidden_states.squeeze(1)
return waveform
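A minimal shape sketch of the vocoder contract described in the docstring above, assuming the default `SpeechT5HifiGanConfig` (80 mel bins, 4x4x4x4 upsampling). With randomly initialized weights the output is just noise, but the shapes show how frames map to samples:
import torch
from transformers import SpeechT5HifiGan, SpeechT5HifiGanConfig
config = SpeechT5HifiGanConfig()
vocoder = SpeechT5HifiGan(config)
spectrogram = torch.randn(100, config.model_in_dim)  # un-batched: (num_frames, 80)
waveform = vocoder(spectrogram)
print(waveform.shape)  # torch.Size([25600]): 100 frames * 256x total upsampling with the assumed defaults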
.\models\speecht5\number_normalizer.py
"""Number Normalizer class for SpeechT5."""
import re
class EnglishNumberNormalizer:
def __init__(self):
self.ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
self.teens = [
"",
"eleven",
"twelve",
"thirteen",
"fourteen",
"fifteen",
"sixteen",
"seventeen",
"eighteen",
"nineteen",
]
self.tens = ["", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
self.thousands = [
"",
"thousand",
"million",
"billion",
"trillion",
"quadrillion",
"quintillion",
"sextillion",
"septillion",
"octillion",
"nonillion",
"decillion",
]
self.currency_symbols = {
"$": " dollars",
"€": " euros",
"£": " pounds",
"¢": " cents",
"¥": " japanese yen",
"﷼": " saudi riyal",
"₹": " indian rupees",
"₽": " russian rubles",
"฿": " thai baht",
"₺": " turkish liras",
"₴": " ukrainian hryvnia",
"₣": " swiss francs",
"₡": " costa rican colon",
"₱": " philippine peso",
"₪": " israeli shekels",
"₮": " mongolian tögrög",
"₩": " south korean won",
"₦": " nigerian naira",
"₫": " vietnamese Đồng",
}
def spell_number(self, num):
if num == 0:
return "zero"
parts = []
for i in range(0, len(self.thousands)):
if num % 1000 != 0:
part = ""
hundreds = num % 1000 // 100
tens_units = num % 100
if hundreds > 0:
part += self.ones[hundreds] + " hundred"
if tens_units > 0:
part += " and "
if tens_units >= 11 and tens_units <= 19:
part += self.teens[tens_units - 10]
else:
tens_digit = self.tens[tens_units // 10]
ones_digit = self.ones[tens_units % 10]
if tens_digit:
part += tens_digit
if ones_digit:
if tens_digit:
part += " "
part += ones_digit
parts.append(part)
num //= 1000
return " ".join(reversed(parts))
def convert(self, number):
"""
Converts an individual number passed in string form to spelt-out form
"""
if "." in number:
integer_part, decimal_part = number.split(".")
else:
integer_part, decimal_part = number, "00"
currency_symbol = ""
for symbol, name in self.currency_symbols.items():
if integer_part.startswith(symbol):
currency_symbol = name
integer_part = integer_part[len(symbol) :]
break
if integer_part.startswith("-"):
if integer_part[1:].startswith(symbol):
currency_symbol = name
integer_part = "-" + integer_part[len(symbol) + 1 :]
break
minus_prefix = ""
if integer_part.startswith("-"):
minus_prefix = "minus "
integer_part = integer_part[1:]
elif integer_part.startswith("minus"):
minus_prefix = "minus "
integer_part = integer_part[len("minus") :]
percent_suffix = ""
if "%" in integer_part or "%" in decimal_part:
percent_suffix = " percent"
integer_part = integer_part.replace("%", "")
decimal_part = decimal_part.replace("%", "")
integer_part = integer_part.zfill(3 * ((len(integer_part) - 1) // 3 + 1))
parts = []
for i in range(0, len(integer_part), 3):
chunk = int(integer_part[i : i + 3])
if chunk > 0:
part = self.spell_number(chunk)
unit = self.thousands[len(integer_part[i:]) // 3 - 1]
if unit:
part += " " + unit
parts.append(part)
spelled_integer = " ".join(parts)
if decimal_part == "00":
return (
f"{minus_prefix}{spelled_integer}{percent_suffix}{currency_symbol}"
if minus_prefix or currency_symbol
else f"{spelled_integer}{percent_suffix}"
)
else:
spelled_decimal = " ".join([self.spell_number(int(digit)) for digit in decimal_part])
return (
f"{minus_prefix}{spelled_integer} point {spelled_decimal}{percent_suffix}{currency_symbol}"
if minus_prefix or currency_symbol
else f"{minus_prefix}{spelled_integer} point {spelled_decimal}{percent_suffix}"
)
def __call__(self, text):
"""
Convert numbers / number-like quantities in a string to their spelt-out counterparts
"""
pattern = r"(?<!\w)(-?\$?\€?\£?\¢?\¥?\₹?\₽?\฿?\₺?\₴?\₣?\₡?\₱?\₪?\₮?\₩?\₦?\₫?\﷼?\d+(?:\.\d{1,2})?%?)(?!\w)"
text = re.sub(r"(\d+,\d+)", lambda match: match.group(1).replace(",", ""), text)
converted_text = re.sub(pattern, lambda match: self.convert(match.group(1)), text)
converted_text = re.sub(" +", " ", converted_text)
return converted_text
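A short illustration of the normalizer in use; the expected outputs, shown as comments, follow from the rules above (assuming this class is importable from `transformers.models.speecht5.number_normalizer`):
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
normalizer = EnglishNumberNormalizer()
print(normalizer("I paid $20 for 3 books"))  # "I paid twenty dollars for three books"
print(normalizer("-2.5%"))                   # "minus two point five percent"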
.\models\speecht5\processing_speecht5.py
from ...processing_utils import ProcessorMixin
class SpeechT5Processor(ProcessorMixin):
"""
Constructs a SpeechT5 processor which wraps a feature extractor and a tokenizer into a single processor.
[`SpeechT5Processor`] offers all the functionalities of [`SpeechT5FeatureExtractor`] and [`SpeechT5Tokenizer`]. See
the docstrings of [`~SpeechT5Processor.__call__`] and [`~SpeechT5Processor.decode`] for more information.
Args:
feature_extractor (`SpeechT5FeatureExtractor`):
An instance of [`SpeechT5FeatureExtractor`]. The feature extractor is a required input.
tokenizer (`SpeechT5Tokenizer`):
An instance of [`SpeechT5Tokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "SpeechT5FeatureExtractor"
tokenizer_class = "SpeechT5Tokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def __call__(self, *args, **kwargs):
"""
Processes audio and text input, as well as audio and text targets.
You can process audio by using the argument `audio`, or process audio targets by using the argument
`audio_target`. This forwards the arguments to SpeechT5FeatureExtractor's
[`~SpeechT5FeatureExtractor.__call__`].
You can process text by using the argument `text`, or process text labels by using the argument `text_target`.
This forwards the arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.__call__`].
Valid input combinations are:
- `text` only
- `audio` only
- `text_target` only
- `audio_target` only
- `text` and `audio_target`
- `audio` and `audio_target`
- `text` and `text_target`
- `audio` and `text_target`
Please refer to the docstring of the above two methods for more information.
"""
audio = kwargs.pop("audio", None)
text = kwargs.pop("text", None)
text_target = kwargs.pop("text_target", None)
audio_target = kwargs.pop("audio_target", None)
sampling_rate = kwargs.pop("sampling_rate", None)
if audio is not None and text is not None:
raise ValueError(
"Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?"
)
if audio_target is not None and text_target is not None:
raise ValueError(
"Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?"
)
if audio is None and audio_target is None and text is None and text_target is None:
raise ValueError(
"You need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process."
)
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
elif text is not None:
inputs = self.tokenizer(text, **kwargs)
else:
inputs = None
if audio_target is not None:
targets = self.feature_extractor(audio_target=audio_target, *args, sampling_rate=sampling_rate, **kwargs)
labels = targets["input_values"]
elif text_target is not None:
targets = self.tokenizer(text_target, **kwargs)
labels = targets["input_ids"]
else:
targets = None
if inputs is None:
return targets
if targets is not None:
inputs["labels"] = labels
decoder_attention_mask = targets.get("attention_mask")
if decoder_attention_mask is not None:
inputs["decoder_attention_mask"] = decoder_attention_mask
return inputs
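A hedged usage sketch of the processor for an ASR-style batch; the checkpoint name and the pre-loaded 16 kHz `waveform` and transcript are assumptions used only for illustration:
from transformers import SpeechT5Processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
batch = processor(audio=waveform, text_target="the transcript", sampling_rate=16000, return_tensors="pt")
print(batch["input_values"].shape)  # padded waveform features
print(batch["labels"].shape)        # tokenized transcript ids used as labels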
def pad(self, *args, **kwargs):
"""
Collates the audio and text inputs, as well as their targets, into a padded batch.
Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded
by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`].
Valid input combinations are:
- `input_ids` only
- `input_values` only
- `labels` only, either log-mel spectrograms or text tokens
- `input_ids` and log-mel spectrogram `labels`
- `input_values` and text `labels`
Please refer to the docstring of the above two methods for more information.
"""
input_values = kwargs.pop("input_values", None)
input_ids = kwargs.pop("input_ids", None)
labels = kwargs.pop("labels", None)
if input_values is not None and input_ids is not None:
raise ValueError("Cannot process both `input_values` and `input_ids` inputs.")
if input_values is None and input_ids is None and labels is None:
raise ValueError(
"You need to specify either an `input_values`, `input_ids`, or `labels` input to be padded."
)
if input_values is not None:
inputs = self.feature_extractor.pad(input_values, *args, **kwargs)
elif input_ids is not None:
inputs = self.tokenizer.pad(input_ids, **kwargs)
else:
inputs = None
if labels is not None:
if "input_ids" in labels or (isinstance(labels, list) and "input_ids" in labels[0]):
targets = self.tokenizer.pad(labels, **kwargs)
labels = targets["input_ids"]
else:
feature_size_hack = self.feature_extractor.feature_size
self.feature_extractor.feature_size = self.feature_extractor.num_mel_bins
targets = self.feature_extractor.pad(labels, *args, **kwargs)
self.feature_extractor.feature_size = feature_size_hack
labels = targets["input_values"]
else:
targets = None
if inputs is None:
return targets
if targets is not None:
inputs["labels"] = labels
decoder_attention_mask = targets.get("attention_mask")
if decoder_attention_mask is not None:
inputs["decoder_attention_mask"] = decoder_attention_mask
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
.\models\speecht5\tokenization_speecht5.py
"""Tokenization class for SpeechT5."""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from .number_normalizer import EnglishNumberNormalizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spm_char.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/speecht5_asr": "https://huggingface.co/microsoft/speecht5_asr/resolve/main/spm_char.model",
"microsoft/speecht5_tts": "https://huggingface.co/microsoft/speecht5_tts/resolve/main/spm_char.model",
"microsoft/speecht5_vc": "https://huggingface.co/microsoft/speecht5_vc/resolve/main/spm_char.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/speecht5_asr": 1024,
"microsoft/speecht5_tts": 1024,
"microsoft/speecht5_vc": 1024,
}
class SpeechT5Tokenizer(PreTrainedTokenizer):
"""
Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the [SentencePiece](https://github.com/google/sentencepiece) file (generally with a *.spm* extension) that contains the vocabulary needed to instantiate the tokenizer.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning-of-sequence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end-of-sequence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary is converted to this token.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
normalize (`bool`, *optional*, defaults to `False`):
Whether to convert numeric quantities in the text to their spelt-out English counterparts.
sp_model_kwargs (`dict`, *optional*):
Arguments passed to the `SentencePieceProcessor.__init__()` method, which can be used to set SentencePiece options such as subword regularization (`enable_sampling`), the `nbest_size` parameter, etc.
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameter for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: Samples from the nbest_size results.
- `nbest_size < 0`: Assumes nbest_size is infinite and samples from all hypotheses (lattice) using the forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and the dropout probability of merge operations for BPE-dropout.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor used for every conversion (string, tokens and IDs).
"""
# Class-level constants and mappings
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
normalize=False,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
# Instantiates a new SentencePiece-based tokenizer
) -> None:
# Store the constructor arguments and load the SentencePiece model
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs  # default to an empty dict when not provided
self.vocab_file = vocab_file  # path to the vocabulary file
self.normalize = normalize  # whether to normalize the input text
self._normalizer = None  # the normalizer is created lazily
# Initialize the SentencePieceProcessor with the given arguments
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)  # load the vocabulary file
# Call the parent class constructor with the relevant arguments
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
normalize=normalize,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
# Prepare the text before tokenization
normalize = kwargs.pop("normalize", self.normalize)  # fall back to the instance setting when `normalize` is not passed
if is_split_into_words:
text = " " + text  # prepend a space when the text is already split into words
if normalize:
text = self.normalizer(text)  # spell out numbers when normalization is requested
return (text, kwargs)  # return the processed text and the remaining kwargs
@property
def vocab_size(self):
# The vocabulary size is the number of pieces in the SentencePiece model
return self.sp_model.get_piece_size()
@property
def normalizer(self):
# Return the normalizer, lazily creating an English number normalizer on first use
if self._normalizer is None:
self._normalizer = EnglishNumberNormalizer()
return self._normalizer
@normalizer.setter
def normalizer(self, value):
# Set the normalizer object
self._normalizer = value
def get_vocab(self):
# Build the vocabulary dict mapping each token (string) to its ID
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)  # also include the added special tokens
return vocab
def __getstate__(self):
# Get the object state for serialization
state = self.__dict__.copy()
state["sp_model"] = None  # drop the SentencePieceProcessor, which is not directly picklable
return state
def __setstate__(self, d):
# Restore the object state when deserializing
self.__dict__ = d
# For backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
# Re-create the SentencePieceProcessor and reload the vocabulary file
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
# Tokenize the text with SentencePiece and return the pieces as a list of strings
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# Convert a token (string) to its ID
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
# Convert an ID back to its token (string)
token = self.sp_model.IdToPiece(index)
return token
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) into a single string."""
# Buffer holding the current run of non-special sub-tokens
current_sub_tokens = []
# Output string being built
out_string = ""
# Tracks whether the previous token was a special token
prev_is_special = False
# Walk through the token sequence
for token in tokens:
# Check whether the current token is a special token
if token in self.all_special_tokens:
# Add a separating space if the previous token was not a special token
if not prev_is_special:
out_string += " "
# Decode the buffered sub-tokens and append the special token verbatim
out_string += self.sp_model.decode(current_sub_tokens) + token
# Remember that the current token was a special token
prev_is_special = True
# Reset the sub-token buffer
current_sub_tokens = []
else:
# Accumulate the current token in the sub-token buffer
current_sub_tokens.append(token)
# Remember that the current token was not a special token
prev_is_special = False
# Decode whatever is left in the buffer and append it
out_string += self.sp_model.decode(current_sub_tokens)
# Return the output string without leading/trailing whitespace
return out_string.strip()
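Illustrative sketch of the decoding loop above (the pieces are whatever the tokenizer produced; the exact output depends on the SentencePiece model): non-special pieces are decoded by `sp_model.decode`, while special tokens such as `<pad>` are passed through verbatim with a separating space.
```
pieces = tokenizer.tokenize("hello") + ["<pad>"] + tokenizer.tokenize("world")
print(tokenizer.convert_tokens_to_string(pieces))
```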
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
"""Build model inputs from a sequence by appending eos_token_id."""
# With a single sequence, simply append eos_token_id
if token_ids_1 is None:
return token_ids_0 + [self.eos_token_id]
# With a pair of sequences, concatenate them and append eos_token_id
return token_ids_0 + token_ids_1 + [self.eos_token_id]
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
# If the input already contains special tokens, defer to the parent implementation
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# The appended eos token is marked with 1
suffix_ones = [1]
# Single sequence: zeros for every input token plus the suffix
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + suffix_ones
# Sequence pair: zeros for both sequences plus the suffix
return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
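A small sketch with made-up token IDs showing the two methods above: only the appended `eos` counts as a special token.
```
print(tokenizer.build_inputs_with_special_tokens([10, 11, 12]))  # [10, 11, 12, eos_token_id]
print(tokenizer.get_special_tokens_mask([10, 11, 12]))           # [0, 0, 0, 1]
```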
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# If the save directory does not exist, log an error and bail out
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Copy the current vocabulary file to the output path if it exists and differs from the target
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# Otherwise, if the original vocabulary file is gone, write the serialized SentencePiece model instead
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
# Return the output vocabulary file path as a tuple
return (out_vocab_file,)
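Usage sketch (the target directory is arbitrary and must already exist, otherwise the method only logs an error); `save_vocabulary` is normally invoked indirectly via `save_pretrained`.
```
import os

os.makedirs("./speecht5_tok", exist_ok=True)
print(tokenizer.save_vocabulary("./speecht5_tok"))  # tuple containing the written vocabulary file path
```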
.\models\speecht5\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_torch_available,
)
_import_structure = {
"configuration_speecht5": [
"SPEECHT5_PRETRAINED_CONFIG_ARCHIVE_MAP",
"SPEECHT5_PRETRAINED_HIFIGAN_CONFIG_ARCHIVE_MAP",
"SpeechT5Config",
"SpeechT5HifiGanConfig",
],
"feature_extraction_speecht5": ["SpeechT5FeatureExtractor"],
"processing_speecht5": ["SpeechT5Processor"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_speecht5"] = ["SpeechT5Tokenizer"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_speecht5"] = [
"SPEECHT5_PRETRAINED_MODEL_ARCHIVE_LIST",
"SpeechT5ForSpeechToText",
"SpeechT5ForSpeechToSpeech",
"SpeechT5ForTextToSpeech",
"SpeechT5Model",
"SpeechT5PreTrainedModel",
"SpeechT5HifiGan",
]
if TYPE_CHECKING:
from .configuration_speecht5 import (
SPEECHT5_PRETRAINED_CONFIG_ARCHIVE_MAP,
SPEECHT5_PRETRAINED_HIFIGAN_CONFIG_ARCHIVE_MAP,
SpeechT5Config,
SpeechT5HifiGanConfig,
)
from .feature_extraction_speecht5 import SpeechT5FeatureExtractor
from .processing_speecht5 import SpeechT5Processor
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_speecht5 import SpeechT5Tokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_speecht5 import (
SPEECHT5_PRETRAINED_MODEL_ARCHIVE_LIST,
SpeechT5ForSpeechToSpeech,
SpeechT5ForSpeechToText,
SpeechT5ForTextToSpeech,
SpeechT5HifiGan,
SpeechT5Model,
SpeechT5PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\speech_encoder_decoder\configuration_speech_encoder_decoder.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import AutoConfig
logger = logging.get_logger(__name__)
class SpeechEncoderDecoderConfig(PretrainedConfig):
r"""
[`SpeechEncoderDecoderConfig`] is the configuration class to store the configuration of a [`SpeechEncoderDecoderModel`].
It is used to instantiate an Encoder-Decoder model according to the specified arguments, defining the encoder and decoder configs.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Args:
kwargs (*optional*):
Dictionary of keyword arguments. Notably:
- **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines the encoder config.
- **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines the decoder config.
Examples:
```
>>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel
>>> # Initializing a Wav2Vec2 & BERT style configuration
>>> config_encoder = Wav2Vec2Config()
>>> config_decoder = BertConfig()
>>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> # Initializing a Wav2Vec2Bert model from Wav2Vec2 & google-bert/bert-base-uncased style configurations
>>> model = SpeechEncoderDecoderModel(config=config)
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
>>> # Set the decoder config to causal lm
>>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True
>>> # Saving the model, including its configuration
>>> model.save_pretrained("my-model")
>>> # Loading the model and config from the pretrained folder
>>> encoder_decoder_config = SpeechEncoderDecoderConfig.from_pretrained("my-model")
>>> model = SpeechEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
```"""
model_type = "speech-encoder-decoder"
is_composition = True
def __init__(self, **kwargs):
super().__init__(**kwargs)
if "encoder" not in kwargs or "decoder" not in kwargs:
raise ValueError(
f"A configuraton of type {self.model_type} cannot be instantiated because not both `encoder` and"
f" `decoder` sub-configurations are passed, but only {kwargs}"
)
encoder_config = kwargs.pop("encoder")
encoder_model_type = encoder_config.pop("model_type")
decoder_config = kwargs.pop("decoder")
decoder_model_type = decoder_config.pop("model_type")
self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
self.is_encoder_decoder = True
@classmethod
def from_encoder_decoder_configs(
cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
) -> PretrainedConfig:
"""
Instantiate a [`SpeechEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder configuration and a decoder configuration.
Returns:
[`SpeechEncoderDecoderConfig`]: An instance of a configuration object
"""
logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
decoder_config.is_decoder = True
decoder_config.add_cross_attention = True
return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
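A minimal sketch of the constructor contract described above: the plain constructor requires both sub-configurations (as dicts), while `from_encoder_decoder_configs` additionally flips the decoder flags.
```
from transformers import BertConfig, SpeechEncoderDecoderConfig, Wav2Vec2Config

config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(Wav2Vec2Config(), BertConfig())
print(config.decoder.is_decoder, config.decoder.add_cross_attention)  # True True

# Omitting either sub-configuration raises the ValueError shown in __init__:
# SpeechEncoderDecoderConfig()  # ValueError: A configuration of type speech-encoder-decoder ...
```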
.\models\speech_encoder_decoder\convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py
import argparse
import fairseq
import torch
from torch import nn
from transformers import (
MBart50Tokenizer,
MBartConfig,
MBartForCausalLM,
SpeechEncoderDecoderConfig,
SpeechEncoderDecoderModel,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
Wav2Vec2Model,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"quantizer.weight_proj": "quantizer.weight_proj",
"quantizer.vars": "quantizer.codevectors",
"project_q": "project_q",
"final_proj": "project_hid",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
TOP_LEVEL_KEYS = [
"lm_head",
"quantizer.weight_proj",
"quantizer.codevectors",
"project_q",
"project_hid",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights_wav2vec2(fairseq_model, hf_model):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.feature_extractor
adapter = hf_model.adapter
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
elif any(x in name for x in ["adaptor", "w2v_encoder.proj.", "w2v_proj_ln."]):
load_adapter(name, value, adapter, unused_weights)
is_used = True
else:
for key, mapped_key in MAPPING.items():
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
weight_type = "weight"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def load_adapter(full_name, value, adapter, unused_weights):
name = full_name.split("adaptor.")[-1]
items = name.split(".")
if items[1].isdigit():
layer_id = int(items[1])
else:
layer_id = None
if "adaptor" not in full_name:
if "proj_ln" in full_name:
if "bias" in name:
assert (
value.shape == adapter.proj_layer_norm.bias.data.shape
), f"{full_name} has size {value.shape}, but {adapter.proj_layer_norm.bias.data.shape} was found."
adapter.proj_layer_norm.bias.data = value
logger.info(f"Adapter proj layer norm bias was initialized from {full_name}.")
if "weight" in name:
assert (
value.shape == adapter.proj_layer_norm.weight.data.shape
), f"{full_name} has size {value.shape}, but {adapter.proj_layer_norm.weight.data.shape} was found."
adapter.proj_layer_norm.weight.data = value
else:
if "bias" in name:
assert (
value.shape == adapter.proj.bias.data.shape
), f"{full_name} has size {value.shape}, but {adapter.proj.bias.data.shape} was found."
adapter.proj.bias.data = value
logger.info(f"Adapter proj layer bias was initialized from {full_name}.")
if "weight" in name:
assert (
value.shape == adapter.proj.weight.data.shape
), f"{full_name} has size {value.shape}, but {adapter.proj.weight.data.shape} was found."
adapter.proj.weight.data = value
elif isinstance(layer_id, int):
if "bias" in name:
assert (
value.shape == adapter.layers[layer_id].conv.bias.data.shape
), f"{full_name} has size {value.shape}, but {adapter.layers[layer_id].conv.bias.data.shape} was found."
adapter.layers[layer_id].conv.bias.data = value
logger.info(f"Adapter layer {layer_id} bias was initialized from {full_name}.")
elif "weight" in name:
assert (
value.shape == adapter.layers[layer_id].conv.weight.data.shape
), f"{full_name} has size {value.shape}, but {adapter.layers[layer_id].conv.weight.data.shape} was found."
adapter.layers[layer_id].conv.weight.data = value
logger.info(f"Adapter layer {layer_id} bias was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
@torch.no_grad()
def convert_wav2vec2_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
dict_path,
config_yaml_path,
encoder_config_path,
decoder_config_path,
add_adapter,
adapter_kernel_size,
adapter_stride,
decoder_start_token_id,
encoder_output_dim,
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
encoder_config = Wav2Vec2Config.from_pretrained(
encoder_config_path,
add_adapter=True,
adapter_stride=adapter_stride,
adapter_kernel_size=adapter_kernel_size,
token=True,
output_hidden_size=encoder_output_dim,
)
decoder_config = MBartConfig.from_pretrained(decoder_config_path)
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path],
arg_overrides={
"config_yaml": config_yaml_path,
"data": "/".join(dict_path.split("/")[:-1]),
"w2v_path": checkpoint_path,
"load_pretrained_decoder_from": None,
},
)
model = model[0].eval()
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(encoder_config_path, token=True)
hf_encoder = Wav2Vec2Model(encoder_config)
recursively_load_weights_wav2vec2(model.encoder, hf_encoder)
hf_decoder = MBartForCausalLM(decoder_config)
missing_keys, unexpected_keys = hf_decoder.model.decoder.load_state_dict(model.decoder.state_dict(), strict=False)
logger.warning(f"The following keys are missing when loading the decoder weights: {missing_keys}")
logger.warning(f"The following keys are unexpected when loading the decoder weights: {unexpected_keys}")
hf_wav2vec = SpeechEncoderDecoderModel(encoder=hf_encoder, decoder=hf_decoder)
hf_wav2vec.config.tie_word_embeddings = False
tokenizer = MBart50Tokenizer(dict_path)
tokenizer.save_pretrained(pytorch_dump_folder_path)
config = hf_wav2vec.config.to_dict()
config["pad_token_id"] = tokenizer.pad_token_id
config["bos_token_id"] = tokenizer.bos_token_id
config["eos_token_id"] = tokenizer.eos_token_id
config["tokenizer_class"] = "mbart50"
config["feature_extractor_type"] = "wav2vec2"
config["decoder_start_token_id"] = tokenizer.eos_token_id
config["forced_bos_token_id"] = 250004
config["forced_eos_token_id"] = tokenizer.eos_token_id
hf_wav2vec.config = SpeechEncoderDecoderConfig.from_dict(config)
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_yaml_path", default=None, type=str, help="Path to yaml file of fine-tuned model")
parser.add_argument(
"--encoder_config_path",
default="facebook/wav2vec2-xls-r-1b",
type=str,
help="Path to hf encoder wav2vec2 checkpoint config",
)
parser.add_argument(
"--decoder_config_path",
default="facebook/mbart-large-50-one-to-many-mmt",
type=str,
help="Path to hf decoder checkpoint config",
)
parser.add_argument("--add_adapter", default=True, type=bool, help="whether to add model adapter layers")
parser.add_argument("--adapter_stride", default=2, type=int, help="stride of adapter layers")
parser.add_argument("--adapter_kernel_size", default=3, type=int, help="kernel size of adapter layers")
parser.add_argument("--encoder_output_dim", default=1024, type=int, help="encoder output dim")
parser.add_argument("--start_token_id", default=250004, type=int, help="`decoder_start_token_id` of model config")
args = parser.parse_args()
convert_wav2vec2_checkpoint(
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.dict_path,
args.config_yaml_path,
encoder_config_path=args.encoder_config_path,
decoder_config_path=args.decoder_config_path,
add_adapter=args.add_adapter,
adapter_kernel_size=args.adapter_kernel_size,
adapter_stride=args.adapter_stride,
decoder_start_token_id=args.start_token_id,
encoder_output_dim=args.encoder_output_dim,
)
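A hedged example of driving the conversion directly from Python (every path below is a placeholder for locally available fairseq artifacts; the remaining values mirror the argparse defaults above):
```
convert_wav2vec2_checkpoint(
    checkpoint_path="checkpoint_best.pt",
    pytorch_dump_folder_path="./wav2vec2-mbart50",
    dict_path="dict.mbart50.txt",
    config_yaml_path="config.yaml",
    encoder_config_path="facebook/wav2vec2-xls-r-1b",
    decoder_config_path="facebook/mbart-large-50-one-to-many-mmt",
    add_adapter=True,
    adapter_kernel_size=3,
    adapter_stride=2,
    decoder_start_token_id=250004,
    encoder_output_dim=1024,
)
```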