Transformers Source Code Analysis (Part 48)
.\models\falcon\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_falcon": ["FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP", "FalconConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_falcon"] = [
"FALCON_PRETRAINED_MODEL_ARCHIVE_LIST",
"FalconForCausalLM",
"FalconModel",
"FalconPreTrainedModel",
"FalconForSequenceClassification",
"FalconForTokenClassification",
"FalconForQuestionAnswering",
]
if TYPE_CHECKING:
from .configuration_falcon import FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP, FalconConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_falcon import (
FALCON_PRETRAINED_MODEL_ARCHIVE_LIST,
FalconForCausalLM,
FalconForQuestionAnswering,
FalconForSequenceClassification,
FalconForTokenClassification,
FalconModel,
FalconPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
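The `_LazyModule` shell above keeps `import transformers.models.falcon` cheap: `configuration_falcon` and `modeling_falcon` are only executed once a name registered in `_import_structure` is actually accessed. A minimal, illustrative sketch of that behavior (not part of the source file; it assumes a `transformers` install with torch support):

```python
# Illustrative only: attribute access, not the import itself, loads the heavy
# submodules registered in _import_structure above.
import importlib

falcon = importlib.import_module("transformers.models.falcon")

config_cls = falcon.FalconConfig        # first access triggers the configuration_falcon import
model_cls = falcon.FalconForCausalLM    # triggers modeling_falcon (requires torch)

print(config_cls.model_type)            # "falcon"
```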
.\models\fastspeech2_conformer\configuration_fastspeech2_conformer.py
""" FastSpeech2Conformer model configuration"""
from typing import Dict
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer": "https://huggingface.co/espnet/fastspeech2_conformer/raw/main/config.json",
}
FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_hifigan/raw/main/config.json",
}
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"espnet/fastspeech2_conformer_with_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_with_hifigan/raw/main/config.json",
}
class FastSpeech2ConformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerModel`]. It is used to
instantiate a FastSpeech2Conformer model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2Conformer [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerConfig
>>> # Initializing a FastSpeech2Conformer style configuration
>>> configuration = FastSpeech2ConformerConfig()
>>> # Initializing a model from the FastSpeech2Conformer style configuration
>>> model = FastSpeech2ConformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "fastspeech2_conformer"
attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"}
def __init__(
self,
hidden_size=384,
vocab_size=78,
num_mel_bins=80,
encoder_num_attention_heads=2,
encoder_layers=4,
encoder_linear_units=1536,
decoder_layers=4,
decoder_num_attention_heads=2,
decoder_linear_units=1536,
speech_decoder_postnet_layers=5,
speech_decoder_postnet_units=256,
speech_decoder_postnet_kernel=5,
positionwise_conv_kernel_size=3,
encoder_normalize_before=False,
decoder_normalize_before=False,
encoder_concat_after=False,
decoder_concat_after=False,
reduction_factor=1,
speaking_speed=1.0,
use_macaron_style_in_conformer=True,
use_cnn_in_conformer=True,
encoder_kernel_size=7,
decoder_kernel_size=31,
duration_predictor_layers=2,
duration_predictor_channels=256,
duration_predictor_kernel_size=3,
energy_predictor_layers=2,
energy_predictor_channels=256,
energy_predictor_kernel_size=3,
energy_predictor_dropout=0.5,
energy_embed_kernel_size=1,
energy_embed_dropout=0.0,
stop_gradient_from_energy_predictor=False,
pitch_predictor_layers=5,
pitch_predictor_channels=256,
pitch_predictor_kernel_size=5,
pitch_predictor_dropout=0.5,
pitch_embed_kernel_size=1,
pitch_embed_dropout=0.0,
stop_gradient_from_pitch_predictor=True,
encoder_dropout_rate=0.2,
encoder_positional_dropout_rate=0.2,
encoder_attention_dropout_rate=0.2,
decoder_dropout_rate=0.2,
decoder_positional_dropout_rate=0.2,
decoder_attention_dropout_rate=0.2,
duration_predictor_dropout_rate=0.2,
speech_decoder_postnet_dropout=0.5,
max_source_positions=5000,
use_masking=True,
use_weighted_masking=False,
num_speakers=None,
num_languages=None,
speaker_embed_dim=None,
is_encoder_decoder=True,
**kwargs,
):
...  # __init__ body not shown in this excerpt: the arguments above are stored as attributes of the same name and the parent `PretrainedConfig.__init__` is called with `**kwargs`
class FastSpeech2ConformerHifiGanConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerHifiGanModel`]. It is used to
instantiate a FastSpeech2Conformer HiFi-GAN vocoder model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2Conformer
[espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_in_dim (`int`, *optional*, defaults to 80):
The number of frequency bins in the input log-mel spectrogram.
upsample_initial_channel (`int`, *optional*, defaults to 512):
The number of input channels into the upsampling network.
upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 2, 2]`):
A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
length of *upsample_rates* defines the number of convolutional layers and has to match the length of
*upsample_kernel_sizes*.
upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[16, 16, 4, 4]`):
A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
*upsample_rates*.
resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
fusion (MRF) module.
resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
multi-receptive field fusion (MRF) module.
initializer_range (`float`, *optional*, defaults to 0.01):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
leaky_relu_slope (`float`, *optional*, defaults to 0.1):
The angle of the negative slope used by the leaky ReLU activation.
normalize_before (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.
Example:
```
>>> from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig
>>> # Initializing a FastSpeech2ConformerHifiGan configuration
>>> configuration = FastSpeech2ConformerHifiGanConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = FastSpeech2ConformerHifiGan(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "hifigan"
def __init__(
self,
model_in_dim=80,
upsample_initial_channel=512,
upsample_rates=[8, 8, 2, 2],
upsample_kernel_sizes=[16, 16, 4, 4],
resblock_kernel_sizes=[3, 7, 11],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
initializer_range=0.01,
leaky_relu_slope=0.1,
normalize_before=True,
**kwargs,
):
self.model_in_dim = model_in_dim
self.upsample_initial_channel = upsample_initial_channel
self.upsample_rates = upsample_rates
self.upsample_kernel_sizes = upsample_kernel_sizes
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.initializer_range = initializer_range
self.leaky_relu_slope = leaky_relu_slope
self.normalize_before = normalize_before
super().__init__(**kwargs)
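One useful property of these defaults: the product of `upsample_rates` is the number of waveform samples the vocoder produces per input spectrogram frame, i.e. 8 · 8 · 2 · 2 = 256. A small sketch checking this (illustrative; assumes `FastSpeech2ConformerHifiGanConfig` is importable from `transformers`):

```python
import math

from transformers import FastSpeech2ConformerHifiGanConfig

config = FastSpeech2ConformerHifiGanConfig()
# Each mel frame is expanded by the product of the upsampling strides.
print(math.prod(config.upsample_rates))  # 256

# upsample_rates and upsample_kernel_sizes must have the same length, since each
# (rate, kernel) pair defines one transposed-convolution layer in the vocoder.
assert len(config.upsample_rates) == len(config.upsample_kernel_sizes)
```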
class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`FastSpeech2ConformerWithHifiGan`]. It is used to
instantiate a `FastSpeech2ConformerWithHifiGanModel` model according to the specified sub-models configurations,
defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
FastSpeech2ConformerModel [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) and
FastSpeech2ConformerHifiGan
[espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architectures.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
model_config (`typing.Dict`, *optional*):
Configuration of the text-to-speech model.
vocoder_config (`typing.Dict`, *optional*):
Configuration of the vocoder model.
model_config ([`FastSpeech2ConformerConfig`], *optional*):
Configuration of the text-to-speech model.
vocoder_config ([`FastSpeech2ConformerHiFiGanConfig`], *optional*):
Configuration of the vocoder model.
Example:
```
>>> from transformers import (
... FastSpeech2ConformerConfig,
... FastSpeech2ConformerHifiGanConfig,
... FastSpeech2ConformerWithHifiGanConfig,
... FastSpeech2ConformerWithHifiGan,
... )
>>> # Initializing FastSpeech2ConformerWithHifiGan sub-modules configurations.
>>> model_config = FastSpeech2ConformerConfig()
>>> vocoder_config = FastSpeech2ConformerHifiGanConfig()
>>> # Initializing a FastSpeech2ConformerWithHifiGan module style configuration
>>> configuration = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict())
>>> # Initializing a model (with random weights)
>>> model = FastSpeech2ConformerWithHifiGan(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "fastspeech2_conformer_with_hifigan"
is_composition = True
def __init__(
self,
model_config: Dict = None,
vocoder_config: Dict = None,
**kwargs,
):
if model_config is None:
model_config = {}
logger.info("model_config is None. initializing the model with default values.")
if vocoder_config is None:
vocoder_config = {}
logger.info("vocoder_config is None. initializing the coarse model with default values.")
self.model_config = FastSpeech2ConformerConfig(**model_config)
self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config)
super().__init__(**kwargs)
.\models\fastspeech2_conformer\convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
import re
from pathlib import Path
from tempfile import TemporaryDirectory
import torch
import yaml
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerModel,
FastSpeech2ConformerTokenizer,
logging
)
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
CONFIG_MAPPING = {
"adim": "hidden_size",
"aheads": "num_attention_heads",
"conformer_dec_kernel_size": "decoder_kernel_size",
"conformer_enc_kernel_size": "encoder_kernel_size",
"decoder_normalize_before": "decoder_normalize_before",
"dlayers": "decoder_layers",
"dunits": "decoder_linear_units",
"duration_predictor_chans": "duration_predictor_channels",
"duration_predictor_kernel_size": "duration_predictor_kernel_size",
"duration_predictor_layers": "duration_predictor_layers",
"elayers": "encoder_layers",
"encoder_normalize_before": "encoder_normalize_before",
"energy_embed_dropout": "energy_embed_dropout",
"energy_embed_kernel_size": "energy_embed_kernel_size",
"energy_predictor_chans": "energy_predictor_channels",
"energy_predictor_dropout": "energy_predictor_dropout",
"energy_predictor_kernel_size": "energy_predictor_kernel_size",
"energy_predictor_layers": "energy_predictor_layers",
"eunits": "encoder_linear_units",
"pitch_embed_dropout": "pitch_embed_dropout",
"pitch_embed_kernel_size": "pitch_embed_kernel_size",
"pitch_predictor_chans": "pitch_predictor_channels",
"pitch_predictor_dropout": "pitch_predictor_dropout",
"pitch_predictor_kernel_size": "pitch_predictor_kernel_size",
"pitch_predictor_layers": "pitch_predictor_layers",
"positionwise_conv_kernel_size": "positionwise_conv_kernel_size",
"postnet_chans": "speech_decoder_postnet_units",
"postnet_filts": "speech_decoder_postnet_kernel",
"postnet_layers": "speech_decoder_postnet_layers",
"reduction_factor": "reduction_factor",
"stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor",
"stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor",
"transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate",
"transformer_dec_dropout_rate": "decoder_dropout_rate",
"transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate",
"transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate",
"transformer_enc_dropout_rate": "encoder_dropout_rate",
"transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate",
"use_cnn_in_conformer": "use_cnn_in_conformer",
"use_macaron_style_in_conformer": "use_macaron_style_in_conformer",
"use_masking": "use_masking",
"use_weighted_masking": "use_weighted_masking",
"idim": "input_dim",
"odim": "num_mel_bins",
"spk_embed_dim": "speaker_embed_dim",
"langs": "num_languages",
"spks": "num_speakers",
}
def remap_model_yaml_config(yaml_config_path):
with Path(yaml_config_path).open("r", encoding="utf-8") as f:
args = yaml.safe_load(f)
args = argparse.Namespace(**args)
remapped_config = {}
model_params = args.tts_conf["text2mel_params"]
for espnet_config_key, hf_config_key in CONFIG_MAPPING.items():
if espnet_config_key in model_params:
remapped_config[hf_config_key] = model_params[espnet_config_key]
return remapped_config, args.g2p, args.token_list
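`remap_model_yaml_config` only copies the keys of `tts_conf["text2mel_params"]` that appear in `CONFIG_MAPPING`, renaming them to the HF argument names, and additionally returns the `g2p` tokenizer name and the token list. A hedged sketch with a hand-written stand-in for an ESPnet config (the YAML content below is invented for illustration and far smaller than a real `espnet/fastspeech2_conformer` config; it assumes the function defined above is in scope):

```python
import tempfile

import yaml

# Invented, minimal stand-in for an ESPnet training config.
fake_espnet_config = {
    "tts_conf": {"text2mel_params": {"adim": 384, "aheads": 2, "elayers": 4, "eunits": 1536}},
    "g2p": "g2p_en_no_space",
    "token_list": ["<blank>", "<unk>", "AH0", "T", "<sos/eos>"],
}

with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    yaml.safe_dump(fake_espnet_config, f)
    yaml_path = f.name

remapped, g2p, token_list = remap_model_yaml_config(yaml_path)
print(remapped)
# {'hidden_size': 384, 'num_attention_heads': 2, 'encoder_layers': 4, 'encoder_linear_units': 1536}
```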
def convert_espnet_state_dict_to_hf(state_dict):
new_state_dict = {}
for key in state_dict:
if "tts.generator.text2mel." in key:
new_key = key.replace("tts.generator.text2mel.", "")
if "postnet" in key:
new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers")
new_key = new_key.replace(".0.weight", ".conv.weight")
new_key = new_key.replace(".1.weight", ".batch_norm.weight")
new_key = new_key.replace(".1.bias", ".batch_norm.bias")
new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean")
new_key = new_key.replace(".1.running_var", ".batch_norm.running_var")
new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked")
if "feat_out" in key:
if "weight" in key:
new_key = "speech_decoder_postnet.feat_out.weight"
if "bias" in key:
new_key = "speech_decoder_postnet.feat_out.bias"
if "encoder.embed.0.weight" in key:
new_key = new_key.replace("0.", "")
if "w_1" in key:
new_key = new_key.replace("w_1", "conv1")
if "w_2" in key:
new_key = new_key.replace("w_2", "conv2")
if "predictor.conv" in key:
new_key = new_key.replace(".conv", ".conv_layers")
pattern = r"(\d)\.(\d)"
replacement = (
r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm"
)
new_key = re.sub(pattern, replacement, new_key)
if "pitch_embed" in key or "energy_embed" in key:
new_key = new_key.replace("0", "conv")
if "encoders" in key:
new_key = new_key.replace("encoders", "conformer_layers")
new_key = new_key.replace("norm_final", "final_layer_norm")
new_key = new_key.replace("norm_mha", "self_attn_layer_norm")
new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm")
new_key = new_key.replace("norm_ff", "ff_layer_norm")
new_key = new_key.replace("norm_conv", "conv_layer_norm")
if "lid_emb" in key:
new_key = new_key.replace("lid_emb", "language_id_embedding")
if "sid_emb" in key:
new_key = new_key.replace("sid_emb", "speaker_id_embedding")
new_state_dict[new_key] = state_dict[key]
return new_state_dict
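A quick way to see what `convert_espnet_state_dict_to_hf` does is to run it on a couple of representative keys; the tensors are dummies and the two keys below were picked to exercise the `encoders` and `predictor.conv` branches (illustrative; assumes the function above is in scope):

```python
import torch

dummy = torch.zeros(1)
espnet_state_dict = {
    "tts.generator.text2mel.encoder.encoders.0.norm_mha.weight": dummy,
    "tts.generator.text2mel.duration_predictor.conv.0.0.weight": dummy,
}
converted = convert_espnet_state_dict_to_hf(espnet_state_dict)
print(list(converted))
# ['encoder.conformer_layers.0.self_attn_layer_norm.weight',
#  'duration_predictor.conv_layers.0.conv.weight']
```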
@torch.no_grad()
def convert_FastSpeech2ConformerModel_checkpoint(
checkpoint_path,
yaml_config_path,
pytorch_dump_folder_path,
repo_id=None,
):
model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path)
config = FastSpeech2ConformerConfig(**model_params)
model = FastSpeech2ConformerModel(config)
espnet_checkpoint = torch.load(checkpoint_path)
hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
model.load_state_dict(hf_compatible_state_dict)
model.save_pretrained(pytorch_dump_folder_path)
with TemporaryDirectory() as tempdir:
vocab = {token: id for id, token in enumerate(vocab)}
vocab_file = Path(tempdir) / "vocab.json"
with open(vocab_file, "w") as f:
json.dump(vocab, f)
should_strip_spaces = "no_space" in tokenizer_name
tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces)
tokenizer.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert")
parser.add_argument("--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub.")
args = parser.parse_args()
convert_FastSpeech2ConformerModel_checkpoint(
args.checkpoint_path,
args.yaml_config_path,
args.pytorch_dump_folder_path,
args.push_to_hub,
)
.\models\fastspeech2_conformer\convert_hifigan.py
"""将 FastSpeech2Conformer HiFi-GAN 的检查点转换为模型。"""
import argparse
from pathlib import Path
import torch
import yaml
from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
def load_weights(checkpoint, hf_model, config):
"""加载权重到模型中。
Args:
checkpoint (dict): 检查点中的权重字典
hf_model (FastSpeech2ConformerHifiGan): 需要加载权重的模型实例
config (FastSpeech2ConformerHifiGanConfig): 模型的配置信息
"""
vocoder_key_prefix = "tts.generator.vocoder."
checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k}
hf_model.apply_weight_norm()
hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"]
hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"]
hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"]
for i in range(len(config.upsample_rates)):
hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"]
hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"]
hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"]
for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)):
for j in range(len(config.resblock_dilation_sizes)):
hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"]
hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"]
hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"]
hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"]
hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"]
hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"]
hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"]
hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"]
hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"]
hf_model.remove_weight_norm()
def remap_hifigan_yaml_config(yaml_config_path):
"""重新映射 HiFi-GAN 的 YAML 配置。
Args:
yaml_config_path (str): YAML 配置文件的路径
"""
with Path(yaml_config_path).open("r", encoding="utf-8") as f:
args = yaml.safe_load(f)
args = argparse.Namespace(**args)
vocoder_type = args.tts_conf["vocoder_type"]
if vocoder_type != "hifigan_generator":
raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}")
remapped_dict = {}
vocoder_params = args.tts_conf["vocoder_params"]
key_mappings = {
"channels": "upsample_initial_channel",
"in_channels": "model_in_dim",
"resblock_dilations": "resblock_dilation_sizes",
"resblock_kernel_sizes": "resblock_kernel_sizes",
"upsample_kernel_sizes": "upsample_kernel_sizes",
"upsample_scales": "upsample_rates",
}
for espnet_config_key, hf_config_key in key_mappings.items():
remapped_dict[hf_config_key] = vocoder_params[espnet_config_key]
remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"]
remapped_dict["normalize_before"] = False
remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"]
return remapped_dict
@torch.no_grad()
def convert_hifigan_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
yaml_config_path=None,
repo_id=None,
):
if yaml_config_path is not None:
config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
else:
config = FastSpeech2ConformerHifiGanConfig()
model = FastSpeech2ConformerHifiGan(config)
orig_checkpoint = torch.load(checkpoint_path)
load_weights(orig_checkpoint, model, config)
model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_hifigan_checkpoint(
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.yaml_config_path,
args.push_to_hub,
)
.\models\fastspeech2_conformer\convert_model_with_hifigan.py
import argparse
import torch
from transformers import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGan,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerModel,
FastSpeech2ConformerWithHifiGan,
FastSpeech2ConformerWithHifiGanConfig,
logging,
)
from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import (
convert_espnet_state_dict_to_hf,
remap_model_yaml_config,
)
from .convert_hifigan import load_weights, remap_hifigan_yaml_config
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.FastSpeech2Conformer")
def convert_FastSpeech2ConformerWithHifiGan_checkpoint(
checkpoint_path,
yaml_config_path,
pytorch_dump_folder_path,
repo_id=None,
):
model_params, *_ = remap_model_yaml_config(yaml_config_path)
model_config = FastSpeech2ConformerConfig(**model_params)
model = FastSpeech2ConformerModel(model_config)
espnet_checkpoint = torch.load(checkpoint_path)
hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint)
model.load_state_dict(hf_compatible_state_dict)
config_kwargs = remap_hifigan_yaml_config(yaml_config_path)
vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs)
vocoder = FastSpeech2ConformerHifiGan(vocoder_config)
load_weights(espnet_checkpoint, vocoder, vocoder_config)
config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config)
with_hifigan_model = FastSpeech2ConformerWithHifiGan(config)
with_hifigan_model.model = model
with_hifigan_model.vocoder = vocoder
with_hifigan_model.save_pretrained(pytorch_dump_folder_path)
if repo_id:
print("Pushing to the hub...")
with_hifigan_model.push_to_hub(repo_id)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Script for converting FastSpeech2Conformer with HifiGAN Model"
)
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path",
required=True,
default=None,
type=str,
help="Path to the output `FastSpeech2ConformerModel` PyTorch model.",
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
args = parser.parse_args()
convert_FastSpeech2ConformerWithHifiGan_checkpoint(
args.checkpoint_path,
args.yaml_config_path,
args.pytorch_dump_folder_path,
args.push_to_hub,
)
.\models\fastspeech2_conformer\modeling_fastspeech2_conformer.py
""" PyTorch FastSpeech2Conformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
from torch import nn
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, add_start_docstrings, logging, replace_return_docstrings
from .configuration_fastspeech2_conformer import (
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerWithHifiGanConfig,
)
logger = logging.get_logger(__name__)
FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"espnet/fastspeech2_conformer",
]
@dataclass
class FastSpeech2ConformerModelOutput(ModelOutput):
"""
Output type of [`FastSpeech2ConformerModel`].
"""
loss: Optional[torch.FloatTensor] = None
spectrogram: torch.FloatTensor = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
duration_outputs: torch.LongTensor = None
pitch_outputs: torch.FloatTensor = None
energy_outputs: torch.FloatTensor = None
@dataclass
class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput):
"""
Output type of [`FastSpeech2ConformerWithHifiGan`].
"""
waveform: torch.FloatTensor = None
_CONFIG_FOR_DOC = "FastSpeech2ConformerConfig"
FASTSPEECH2_CONFORMER_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FastSpeech2ConformerWithHifiGanConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
def length_regulator(encoded_embeddings, duration_labels, speaking_speed=1.0):
"""
Length regulator for feed-forward Transformer.
This is the length regulator module described in `FastSpeech: Fast, Robust and Controllable Text to Speech`
https://arxiv.org/pdf/1905.09263.pdf. The length regulator expands char or phoneme-level embedding features to
frame-level by repeating each feature based on the corresponding predicted durations.
Args:
encoded_embeddings (`torch.Tensor` of shape `(batch_size, max_text_length, embedding_dim)`):
Batch of sequences of char or phoneme embeddings.
duration_labels (`torch.LongTensor` of shape `(batch_size, time)`):
Batch of durations of each frame.
speaking_speed (`float`, *optional*, defaults to 1.0):
Value to control speed of speech.
Returns:
`torch.Tensor`:
Replicated input tensor based on durations (batch_size, time*, embedding_dim).
"""
if speaking_speed <= 0:
raise ValueError("`speaking_speed` must be greater than 0.")
elif speaking_speed != 1.0:
duration_labels = torch.round(duration_labels.float() * speaking_speed).long()
if duration_labels.sum() == 0:
duration_labels[duration_labels.sum(dim=1).eq(0)] = 1
max_len = torch.sum(duration_labels, dim=1).max()
hidden_states = torch.zeros(
(encoded_embeddings.size(0), max_len, encoded_embeddings.size(2)),
dtype=torch.float,
device=encoded_embeddings.device,
)
for i, (encoded_embedding, target_duration) in enumerate(zip(encoded_embeddings, duration_labels)):
repeated = torch.repeat_interleave(encoded_embedding, target_duration, dim=0)
hidden_states[i, : repeated.size(0)] = repeated
return hidden_states
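As a concrete illustration of the expansion, here is a toy call with three phoneme embeddings and durations `[1, 2, 0]`; the third phoneme is dropped and the second one repeated twice (assumes the `length_regulator` defined above is in scope):

```python
import torch

# (batch=1, max_text_length=3, embedding_dim=2): three toy phoneme embeddings
encoded = torch.tensor([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]])
durations = torch.tensor([[1, 2, 0]])  # phoneme 0 -> 1 frame, 1 -> 2 frames, 2 -> 0 frames

frames = length_regulator(encoded, durations)
print(frames)
# tensor([[[1., 1.],
#          [2., 2.],
#          [2., 2.]]])  -> shape (1, sum(durations), 2)
```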
class FastSpeech2ConformerDurationPredictor(nn.Module):
"""
Duration predictor module.
This is a module of duration predictor described in the paper 'FastSpeech: Fast, Robust and Controllable Text to
Speech' https://arxiv.org/pdf/1905.09263.pdf The duration predictor predicts a duration of each frame in log domain
from the hidden embeddings of encoder.
Note:
The calculation domain of outputs is different between in `forward` and in `inference`. In `forward`, the
outputs are calculated in log domain but in `inference`, those are calculated in linear domain.
"""
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__()
self.conv_layers = nn.ModuleList()
self.log_domain_offset = 1.0
for layer_idx in range(config.duration_predictor_layers):
num_chans = config.duration_predictor_channels
input_channels = config.hidden_size if layer_idx == 0 else num_chans
layer = FastSpeech2ConformerPredictorLayer(
input_channels,
num_chans,
config.duration_predictor_kernel_size,
config.duration_predictor_dropout_rate,
)
self.conv_layers.append(layer)
self.linear = nn.Linear(config.duration_predictor_channels, 1)
def forward(self, encoder_hidden_states):
"""
Args:
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
Batch of input sequences, where `input_dim` is the feature dimension at each time step.
Returns:
`torch.Tensor`: Batch of predicted durations in log domain, of shape `(batch_size, max_text_length)`.
"""
hidden_states = encoder_hidden_states.transpose(1, -1)
for layer in self.conv_layers:
hidden_states = layer(hidden_states)
hidden_states = self.linear(hidden_states.transpose(1, -1)).squeeze(-1)
if not self.training:
hidden_states = torch.clamp(torch.round(hidden_states.exp() - self.log_domain_offset), min=0).long()
return hidden_states
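At inference time the predictor converts its log-domain output back to integer frame counts via `clamp(round(exp(x) - offset), min=0)`. A tiny numeric sketch of that conversion (the values are made up):

```python
import torch

log_domain_offset = 1.0
log_durations = torch.tensor([-2.0, 0.0, 1.5])  # pretend raw predictor outputs (log domain)
durations = torch.clamp(torch.round(log_durations.exp() - log_domain_offset), min=0).long()
print(durations)  # tensor([0, 0, 3]); e.g. exp(1.5) - 1 ≈ 3.48, rounded to 3
```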
class FastSpeech2ConformerBatchNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
if layer_id == 0:
in_conv_dim = config.num_mel_bins
else:
in_conv_dim = config.speech_decoder_postnet_units
if layer_id == config.speech_decoder_postnet_layers - 1:
out_conv_dim = config.num_mel_bins
else:
out_conv_dim = config.speech_decoder_postnet_units
self.conv = nn.Conv1d(
in_conv_dim,
out_conv_dim,
kernel_size=config.speech_decoder_postnet_kernel,
stride=1,
padding=(config.speech_decoder_postnet_kernel - 1) // 2,
bias=False,
)
self.batch_norm = nn.BatchNorm1d(out_conv_dim)
if layer_id < config.speech_decoder_postnet_layers - 1:
self.activation = nn.Tanh()
else:
self.activation = None
self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.batch_norm(hidden_states)
if self.activation is not None:
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class FastSpeech2ConformerSpeechDecoderPostnet(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
self.layers = nn.ModuleList(
[FastSpeech2ConformerBatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
)
def forward(self, hidden_states: torch.Tensor):
outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
layer_output = outputs_before_postnet.transpose(1, 2)
for layer in self.layers:
layer_output = layer(layer_output)
outputs_after_postnet = outputs_before_postnet + layer_output.transpose(1, 2)
return outputs_before_postnet, outputs_after_postnet
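Shape-wise, `feat_out` maps each decoder hidden state to `reduction_factor` mel frames and the `view` unfolds them along the time axis, while the convolutional postnet predicts a residual refinement of the same shape. A shape-only sketch with random inputs (illustrative; assumes `FastSpeech2ConformerConfig` and the postnet class above can be imported, e.g. from `transformers.models.fastspeech2_conformer.modeling_fastspeech2_conformer`):

```python
import torch

config = FastSpeech2ConformerConfig()  # hidden_size=384, num_mel_bins=80, reduction_factor=1
postnet = FastSpeech2ConformerSpeechDecoderPostnet(config)

decoder_hidden = torch.randn(2, 50, config.hidden_size)  # (batch, frames, hidden_size)
before, after = postnet(decoder_hidden)
print(before.shape, after.shape)
# torch.Size([2, 50, 80]) torch.Size([2, 50, 80]) with reduction_factor == 1
```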
class FastSpeech2ConformerPredictorLayer(nn.Module):
def __init__(self, input_channels, num_chans, kernel_size, dropout_rate):
super().__init__()
self.conv = nn.Conv1d(
input_channels,
num_chans,
kernel_size,
stride=1,
padding=(kernel_size - 1) // 2,
)
self.activation = nn.ReLU()
self.layer_norm = nn.LayerNorm(num_chans)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, hidden_states):
hidden_states = self.conv(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = hidden_states.transpose(1, -1)
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states.transpose(1, -1)
hidden_states = self.dropout(hidden_states)
return hidden_states
class FastSpeech2ConformerVariancePredictor(nn.Module):
def __init__(
self,
config: FastSpeech2ConformerConfig,
num_layers=2,
num_chans=384,
kernel_size=3,
dropout_rate=0.5,
):
"""
Initilize variance predictor module.
Args:
config (`FastSpeech2ConformerConfig`): Configuration object for the model.
num_layers (`int`, *optional*, defaults to 2): Number of convolutional layers.
num_chans (`int`, *optional*, defaults to 384): Number of channels of convolutional layers.
kernel_size (`int`, *optional*, defaults to 3): Kernel size of convolutional layers.
dropout_rate (`float`, *optional*, defaults to 0.5): Dropout rate.
"""
super().__init__()
self.conv_layers = nn.ModuleList()
for idx in range(num_layers):
input_channels = config.hidden_size if idx == 0 else num_chans
layer = FastSpeech2ConformerPredictorLayer(input_channels, num_chans, kernel_size, dropout_rate)
self.conv_layers.append(layer)
self.linear = nn.Linear(num_chans, 1)
def forward(self, encoder_hidden_states, padding_masks=None):
"""
Calculate forward propagation.
Args:
encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`):
Batch of input sequences.
padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*):
Batch of masks indicating padded part.
Returns:
Tensor: Batch of predicted sequences `(batch_size, max_text_length, 1)`.
"""
hidden_states = encoder_hidden_states.transpose(1, -1)
for layer in self.conv_layers:
hidden_states = layer(hidden_states)
hidden_states = self.linear(hidden_states.transpose(1, 2))
if padding_masks is not None:
hidden_states = hidden_states.masked_fill(padding_masks, 0.0)
return hidden_states
class FastSpeech2ConformerVarianceEmbedding(nn.Module):
def __init__(
self,
in_channels=1,
out_channels=384,
kernel_size=1,
padding=0,
dropout_rate=0.0,
):
"""
Initialize variance embedding module.
Args:
in_channels (`int`, *optional*, defaults to 1): Number of input channels.
out_channels (`int`, *optional*, defaults to 384): Number of output channels.
kernel_size (`int`, *optional*, defaults to 1): Kernel size of the convolutional layer.
padding (`int`, *optional*, defaults to 0): Padding size of the convolutional layer.
dropout_rate (`float`, *optional*, defaults to 0.0): Dropout rate.
"""
super().__init__()
self.conv = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
padding=padding,
)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, hidden_states):
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.conv(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class FastSpeech2ConformerAttention(nn.Module):
"""
Multi-Head attention layer with relative position encoding. Details can be found in
"""
"""
https://github.com/espnet/espnet/pull/2816. Paper: https://arxiv.org/abs/1901.02860.
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""Construct an FastSpeech2ConformerAttention object."""
super().__init__()
self.num_heads = module_config["num_attention_heads"]
self.hidden_size = config.hidden_size
self.dim_key = self.hidden_size // self.num_heads
self.head_dim = self.hidden_size // self.num_heads
self.linear_q = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_k = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_v = nn.Linear(self.hidden_size, self.hidden_size)
self.linear_out = nn.Linear(self.hidden_size, self.hidden_size)
self.dropout = nn.Dropout(p=module_config["attention_dropout_rate"])
self.linear_pos = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
def shift_relative_position_tensor(self, pos_tensor):
"""
Args:
pos_tensor (torch.Tensor of shape (batch_size, head, time1, 2*time1-1)): Input tensor.
"""
zero_pad = torch.zeros((*pos_tensor.size()[:3], 1), device=pos_tensor.device, dtype=pos_tensor.dtype)
pos_tensor_padded = torch.cat([zero_pad, pos_tensor], dim=-1)
pos_tensor_padded = pos_tensor_padded.view(*pos_tensor.size()[:2], pos_tensor.size(3) + 1, pos_tensor.size(2))
pos_tensor = pos_tensor_padded[:, :, 1:].view_as(pos_tensor)[:, :, :, : pos_tensor.size(-1) // 2 + 1]
return pos_tensor
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
pos_emb: Optional[torch.Tensor] = None,
output_attentions: Optional[torch.Tensor] = False,
):
...  # forward body not shown in this excerpt
class FastSpeech2ConformerConvolutionModule(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
super().__init__()
channels = config.hidden_size
kernel_size = module_config["kernel_size"]
self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=True)
self.depthwise_conv = nn.Conv1d(
channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=True
)
self.norm = nn.BatchNorm1d(channels)
self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=True)
def forward(self, hidden_states):
"""
Compute convolution module.
Args:
hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
Returns:
`torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
"""
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.pointwise_conv1(hidden_states)
hidden_states = nn.functional.glu(hidden_states, dim=1)
hidden_states = self.depthwise_conv(hidden_states)
hidden_states = self.norm(hidden_states)
hidden_states = hidden_states * torch.sigmoid(hidden_states)
hidden_states = self.pointwise_conv2(hidden_states)
return hidden_states.transpose(1, 2)
class FastSpeech2ConformerEncoderLayer(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
super().__init__()
self.self_attn = FastSpeech2ConformerAttention(config, module_config)
self.feed_forward = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
self.macaron_style = config.use_macaron_style_in_conformer
if self.macaron_style:
self.feed_forward_macaron = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
self.ff_macaron_layer_norm = nn.LayerNorm(config.hidden_size)
self.ff_scale = 0.5
else:
self.ff_scale = 1.0
self.use_cnn_module = config.use_cnn_in_conformer
if self.use_cnn_module:
self.conv_module = FastSpeech2ConformerConvolutionModule(config, module_config)
self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
self.final_layer_norm = nn.LayerNorm(config.hidden_size)
self.ff_layer_norm = nn.LayerNorm(config.hidden_size)
self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)
self.dropout = nn.Dropout(module_config["dropout_rate"])
self.size = config.hidden_size
self.normalize_before = module_config["normalize_before"]
self.concat_after = module_config["concat_after"]
if self.concat_after:
self.concat_linear = nn.Linear(config.hidden_size + config.hidden_size, config.hidden_size)
def forward(
self,
hidden_states: torch.Tensor,
pos_emb: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[torch.Tensor] = False,
):
...  # forward body not shown in this excerpt
class FastSpeech2ConformerMultiLayeredConv1d(nn.Module):
"""
Multi-layered conv1d for Transformer block.
This is a module of multi-layered conv1d designed to replace positionwise feed-forward network in Transformer
block, which is introduced in 'FastSpeech: Fast, Robust and Controllable Text to Speech'
https://arxiv.org/pdf/1905.09263.pdf
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""
Initialize FastSpeech2ConformerMultiLayeredConv1d module.
Args:
config (`FastSpeech2ConformerConfig`): Configuration object containing model parameters.
module_config (`dict`): Dictionary containing specific module configurations.
"""
super().__init__()
input_channels = config.hidden_size
hidden_channels = module_config["linear_units"]
kernel_size = config.positionwise_conv_kernel_size
self.conv1 = nn.Conv1d(input_channels, hidden_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
self.conv2 = nn.Conv1d(hidden_channels, input_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)
self.dropout = nn.Dropout(module_config["dropout_rate"])
def forward(self, hidden_states):
"""
Perform forward propagation through the module.
Args:
hidden_states (torch.Tensor): Input tensor of shape (batch_size, time, input_channels).
Returns:
torch.Tensor: Output tensor of shape (batch_size, time, hidden_channels).
"""
hidden_states = hidden_states.transpose(-1, 1)
hidden_states = self.conv1(hidden_states)
hidden_states = torch.relu(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = hidden_states.transpose(-1, 1)
return hidden_states
class FastSpeech2ConformerRelPositionalEncoding(nn.Module):
"""
Relative positional encoding module (new implementation).
Args:
config (`FastSpeech2ConformerConfig`): Configuration object containing model parameters.
module_config (`dict`): Dictionary containing specific module configurations.
Details can be found in https://github.com/espnet/espnet/pull/2816. See : Appendix Batch in https://arxiv.org/abs/1901.02860
"""
def __init__(self, config: FastSpeech2ConformerConfig, module_config):
"""
Construct a FastSpeech2ConformerRelPositionalEncoding object.
Args:
config (`FastSpeech2ConformerConfig`): Configuration object containing model parameters.
module_config (`dict`): Dictionary containing specific module configurations.
"""
super().__init__()
self.embed_dim = config.hidden_size
self.input_scale = math.sqrt(self.embed_dim)
self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"])
self.pos_enc = None
self.max_len = 5000
self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len))
def extend_pos_enc(self, x):
"""Reset the positional encodings."""
if self.pos_enc is not None:
if self.pos_enc.size(1) >= x.size(1) * 2 - 1:
if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device:
self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device)
return
pos_enc_positive = torch.zeros(x.size(1), self.embed_dim)
pos_enc_negative = torch.zeros(x.size(1), self.embed_dim)
position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.embed_dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.embed_dim)
)
pos_enc_positive[:, 0::2] = torch.sin(position * div_term)
pos_enc_positive[:, 1::2] = torch.cos(position * div_term)
pos_enc_negative[:, 0::2] = torch.sin(-1 * position * div_term)
pos_enc_negative[:, 1::2] = torch.cos(-1 * position * div_term)
pos_enc_positive = torch.flip(pos_enc_positive, [0]).unsqueeze(0)
pos_enc_negative = pos_enc_negative[1:].unsqueeze(0)
pos_enc = torch.cat([pos_enc_positive, pos_enc_negative], dim=1)
self.pos_enc = pos_enc.to(device=x.device, dtype=x.dtype)
def forward(self, feature_representation):
"""
Args:
feature_representation (`torch.Tensor` of shape (batch_size, time, `*`)):
Input tensor.
Returns:
`torch.Tensor`: Encoded tensor (batch_size, time, `*`).
"""
self.extend_pos_enc(feature_representation)
hidden_states = feature_representation * self.input_scale
center_idx = self.pos_enc.size(1) // 2
pos_emb = self.pos_enc[:, center_idx - hidden_states.size(1) + 1 : center_idx + hidden_states.size(1)]
return self.dropout(hidden_states), self.dropout(pos_emb)
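The slice taken in `forward` is centered on the "distance 0" entry of the table and always spans `2 * time - 1` relative positions, which is what the shifted relative attention in `FastSpeech2ConformerAttention` consumes. The index arithmetic, using the module defaults:

```python
# Pure index arithmetic for the relative-position window; no model required.
max_len = 5000
pos_enc_len = 2 * max_len - 1      # positive part (max_len) + negative part (max_len - 1) = 9999
center_idx = pos_enc_len // 2      # 4999, the "relative distance 0" position

time = 4                           # length of the current input sequence
start, stop = center_idx - time + 1, center_idx + time
print(stop - start)                # 7 == 2 * time - 1 relative positions
```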
class FastSpeech2ConformerEncoder(nn.Module):
"""
FastSpeech2ConformerEncoder encoder module.
Args:
config (`FastSpeech2ConformerConfig`):
FastSpeech2ConformerConfig instance.
module_config (`dict`):
Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`.
use_encoder_input_layer (`bool`, *optional*, defaults to `False`):
Input layer type (whether to use an embedding layer for the encoder input).
"""
def __init__(
self,
config: FastSpeech2ConformerConfig,
module_config,
use_encoder_input_layer=False,
):
super().__init__()
self.embed = None
if use_encoder_input_layer:
self.embed = nn.Embedding(
num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, padding_idx=0
)
self.pos_enc = FastSpeech2ConformerRelPositionalEncoding(config, module_config)
self.conformer_layers = nn.ModuleList(
[FastSpeech2ConformerEncoderLayer(config, module_config) for _ in range(module_config["layers"])]
)
def forward(
self,
input_tensor: torch.LongTensor,
attention_mask: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = False,
return_dict: Optional[bool] = None,
):
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Returns:
`torch.Tensor`:
Output tensor of shape `(batch, time, attention_dim)`.
"""
feature_representation = input_tensor
if self.embed is not None:
feature_representation = self.embed(feature_representation)
hidden_states, pos_emb = self.pos_enc(feature_representation)
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for conformer_layer in self.conformer_layers:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = conformer_layer(hidden_states, pos_emb, attention_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
)
class FastSpeech2ConformerLoss(nn.Module):
def __init__(self, config: FastSpeech2ConformerConfig):
super().__init__()
use_masking = config.use_masking
use_weighted_masking = config.use_weighted_masking
if use_masking and use_weighted_masking:
raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.")
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.duration_criterion = nn.MSELoss(reduction=reduction)
self.log_domain_offset = 1.0
def forward(
self,
outputs_after_postnet,
outputs_before_postnet,
duration_outputs,
pitch_outputs,
energy_outputs,
spectrogram_labels,
duration_labels,
pitch_labels,
energy_labels,
duration_mask,
spectrogram_mask,
):
...  # forward body not shown in this excerpt
class FastSpeech2ConformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = FastSpeech2ConformerConfig
base_model_prefix = "fastspeech2_conformer"
main_input_name = "input_ids"
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, (nn.LayerNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
key = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-key, b=key)
elif isinstance(module, nn.Embedding):
module.weight.data.normal_()
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, FastSpeech2ConformerAttention):
nn.init.xavier_uniform_(module.pos_bias_u)
nn.init.xavier_uniform_(module.pos_bias_v)
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, FastSpeech2ConformerEncoder):
module.gradient_checkpointing = value
@add_start_docstrings(
"""FastSpeech2Conformer Model.""",
FASTSPEECH2_CONFORMER_START_DOCSTRING,
)
class FastSpeech2ConformerModel(FastSpeech2ConformerPreTrainedModel):
"""
FastSpeech 2 module.
This is a module of FastSpeech 2 described in 'FastSpeech 2: Fast and High-Quality End-to-End Text to Speech'
https://arxiv.org/abs/2006.04558. Instead of quantized pitch and energy, we use token-averaged value introduced in
"""
FastPitch: Parallel Text-to-speech with Pitch Prediction. The encoder and decoder are Conformers instead of regular
Transformers.
"""
@replace_return_docstrings(output_type=FastSpeech2ConformerModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
spectrogram_labels: Optional[torch.FloatTensor] = None,
duration_labels: Optional[torch.LongTensor] = None,
pitch_labels: Optional[torch.FloatTensor] = None,
energy_labels: Optional[torch.FloatTensor] = None,
speaker_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
speaker_embedding: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
):
...  # forward body not shown in this excerpt
# Residual block copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
class HifiGanResidualBlock(nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
super().__init__()
self.leaky_relu_slope = leaky_relu_slope
# First set of convolutions, one per dilation rate
self.convs1 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=dilation[i],
padding=self.get_padding(kernel_size, dilation[i]),
)
for i in range(len(dilation))
]
)
# Second set of convolutions, all with dilation 1, using the same padding helper
self.convs2 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=self.get_padding(kernel_size, 1),
)
for _ in range(len(dilation))
]
)
# Compute the padding that keeps the sequence length unchanged
def get_padding(self, kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
# Apply weight normalization to all convolution layers
def apply_weight_norm(self):
for layer in self.convs1:
nn.utils.weight_norm(layer)
for layer in self.convs2:
nn.utils.weight_norm(layer)
# Remove weight normalization from all convolution layers
def remove_weight_norm(self):
for layer in self.convs1:
nn.utils.remove_weight_norm(layer)
for layer in self.convs2:
nn.utils.remove_weight_norm(layer)
# Forward pass
def forward(self, hidden_states):
for conv1, conv2 in zip(self.convs1, self.convs2):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)  # apply LeakyReLU activation
hidden_states = conv1(hidden_states)  # first (dilated) convolution
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)  # apply LeakyReLU again
hidden_states = conv2(hidden_states)  # second convolution (dilation 1)
hidden_states = hidden_states + residual  # add the residual connection
return hidden_states
# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5HifiGan with SpeechT5->FastSpeech2Conformer
@add_start_docstrings(
"""HiFi-GAN vocoder.""",
HIFIGAN_START_DOCSTRING,
)
class FastSpeech2ConformerHifiGan(PreTrainedModel):
config_class = FastSpeech2ConformerHifiGanConfig
main_input_name = "spectrogram"
def __init__(self, config: FastSpeech2ConformerHifiGanConfig):
# Call the parent initializer with the configuration
super().__init__(config)
# Number of residual-block kernel sizes and of upsampling stages
self.num_kernels = len(config.resblock_kernel_sizes)
self.num_upsamples = len(config.upsample_rates)
# Convolution that pre-processes the input spectrogram features
self.conv_pre = nn.Conv1d(
config.model_in_dim,
config.upsample_initial_channel,
kernel_size=7,
stride=1,
padding=3,
)
# Upsampling stack: one transposed convolution per (rate, kernel size) pair from the config
self.upsampler = nn.ModuleList()
for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
self.upsampler.append(
nn.ConvTranspose1d(
config.upsample_initial_channel // (2**i),
config.upsample_initial_channel // (2 ** (i + 1)),
kernel_size=kernel_size,
stride=upsample_rate,
padding=(kernel_size - upsample_rate) // 2,
)
)
# Residual blocks: one per (kernel size, dilation) pair, for every upsampling stage
self.resblocks = nn.ModuleList()
for i in range(len(self.upsampler)):
channels = config.upsample_initial_channel // (2 ** (i + 1))
for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
# Final convolution that projects the features down to a single waveform channel
self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
# Buffers holding the mean and scale used to normalize input spectrograms
self.register_buffer("mean", torch.zeros(config.model_in_dim))
self.register_buffer("scale", torch.ones(config.model_in_dim))
# Initialize weights and apply final processing
self.post_init()
def _init_weights(self, module):
"""初始化权重的方法."""
if isinstance(module, (nn.Linear, nn.Conv1d)):
# 对线性层和卷积层的权重进行正态分布初始化
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
# 如果存在偏置项,则将偏置项初始化为零
module.bias.data.zero_()
def apply_weight_norm(self):
# Apply weight normalization to the pre-convolution and every upsampling layer
nn.utils.weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.weight_norm(layer)
# Apply weight normalization inside every residual block
for layer in self.resblocks:
layer.apply_weight_norm()
# Apply weight normalization to the post-convolution
nn.utils.weight_norm(self.conv_post)
def remove_weight_norm(self):
# Remove weight normalization from the pre-convolution and every upsampling layer
nn.utils.remove_weight_norm(self.conv_pre)
for layer in self.upsampler:
nn.utils.remove_weight_norm(layer)
# Remove weight normalization from every residual block
for layer in self.resblocks:
layer.remove_weight_norm()
# Remove weight normalization from the post-convolution
nn.utils.remove_weight_norm(self.conv_post)
def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
r"""
Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
waveform.
Args:
spectrogram (`torch.FloatTensor`):
Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.
Returns:
`torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
"""
# Optionally normalize the input spectrogram before vocoding
if self.config.normalize_before:
spectrogram = (spectrogram - self.mean) / self.scale
# Detect whether the input is batched
is_batched = spectrogram.dim() == 3
if not is_batched:
# Add a batch dimension for un-batched inputs
spectrogram = spectrogram.unsqueeze(0)
# Move the feature dimension in front of time, as expected by Conv1d
hidden_states = spectrogram.transpose(2, 1)
# Pre-convolution
hidden_states = self.conv_pre(hidden_states)
# Upsampling stages
for i in range(self.num_upsamples):
# LeakyReLU activation
hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
# Transposed-convolution upsampling
hidden_states = self.upsampler[i](hidden_states)
# Average the outputs of the residual blocks attached to this stage
res_state = self.resblocks[i * self.num_kernels](hidden_states)
for j in range(1, self.num_kernels):
res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
hidden_states = res_state / self.num_kernels
# Final LeakyReLU activation
hidden_states = nn.functional.leaky_relu(hidden_states)
# Post-convolution
hidden_states = self.conv_post(hidden_states)
# Tanh squashes the output into [-1, 1]
hidden_states = torch.tanh(hidden_states)
if not is_batched:
# Remove the batch dimension and flatten into a 1-D waveform
waveform = hidden_states.squeeze(0).transpose(1, 0).view(-1)
else:
# Remove the channel dimension for batched outputs
waveform = hidden_states.squeeze(1)
return waveform
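A minimal sketch of the vocoder's forward pass with a randomly initialized model, assuming the default `FastSpeech2ConformerHifiGanConfig` values; it only checks shapes and does not produce intelligible audio:
```
import torch
from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig

config = FastSpeech2ConformerHifiGanConfig()          # default config; model_in_dim is the number of mel bins
vocoder = FastSpeech2ConformerHifiGan(config)          # randomly initialized, shape check only
spectrogram = torch.randn(50, config.model_in_dim)     # un-batched: (num_frames, model_in_dim)
waveform = vocoder(spectrogram)                        # 1-D tensor of audio samples
print(waveform.shape)
```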
# Adds a docstring describing FastSpeech2ConformerWithHifiGan as a text-to-speech model that generates waveforms with a FastSpeech2ConformerHifiGan vocoder head.
@add_start_docstrings(
"The FastSpeech2ConformerModel with a FastSpeech2ConformerHifiGan vocoder head that performs text-to-speech (waveform).",
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_START_DOCSTRING,
)
class FastSpeech2ConformerWithHifiGan(PreTrainedModel):
# Composite configuration class for this model
config_class = FastSpeech2ConformerWithHifiGanConfig
def __init__(self, config: FastSpeech2ConformerWithHifiGanConfig):
super().__init__(config)
# Acoustic model built from config.model_config
self.model = FastSpeech2ConformerModel(config.model_config)
# Vocoder built from config.vocoder_config
self.vocoder = FastSpeech2ConformerHifiGan(config.vocoder_config)
self.config = config
# Document the forward return type as FastSpeech2ConformerWithHifiGanOutput
@replace_return_docstrings(
output_type=FastSpeech2ConformerWithHifiGanOutput, config_class=FastSpeech2ConformerWithHifiGanConfig
)
# Forward pass; all inputs are torch tensors and most are optional
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
spectrogram_labels: Optional[torch.FloatTensor] = None,
duration_labels: Optional[torch.LongTensor] = None,
pitch_labels: Optional[torch.FloatTensor] = None,
energy_labels: Optional[torch.FloatTensor] = None,
speaker_ids: Optional[torch.LongTensor] = None,
lang_ids: Optional[torch.LongTensor] = None,
speaker_embedding: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
# Any additional keyword arguments are passed through as needed
**kwargs,
):
# The actual forward logic is omitted here; see the full implementation in the source file
pass
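A hedged end-to-end sketch using the checkpoints referenced in the archive maps above; it requires network access and the `g2p-en` package, and assumes the output class exposes a `waveform` field as its name suggests:
```
from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGan

tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan")

inputs = tokenizer("Hello, my dog is cute.", return_tensors="pt")
outputs = model(input_ids=inputs["input_ids"], return_dict=True)
waveform = outputs.waveform   # (batch_size, num_samples), ready to be written to a .wav file
```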
.\models\fastspeech2_conformer\tokenization_fastspeech2_conformer.py
"""
Tokenization classes for FastSpeech2Conformer.
"""
import json
import os
from typing import Optional, Tuple
import regex
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging, requires_backends
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"espnet/fastspeech2_conformer": "https://huggingface.co/espnet/fastspeech2_conformer/raw/main/vocab.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"espnet/fastspeech2_conformer": 4096,
}
class FastSpeech2ConformerTokenizer(PreTrainedTokenizer):
"""
Construct a FastSpeech2Conformer tokenizer.
Args:
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
The begin of sequence token. Note that for FastSpeech2, it is the same as the `eos_token`.
eos_token (`str`, *optional*, defaults to `"<sos/eos>"`):
The end of sequence token. Note that for FastSpeech2, it is the same as the `bos_token`.
pad_token (`str`, *optional*, defaults to `"<blank>"`):
The token used for padding, for example when batching sequences of different lengths.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
should_strip_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to strip the spaces from the list of tokens.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
bos_token="<sos/eos>",
eos_token="<sos/eos>",
pad_token="<blank>",
unk_token="<unk>",
should_strip_spaces=False,
**kwargs,
):
requires_backends(self, "g2p_en")
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
import g2p_en
self.g2p = g2p_en.G2p()
self.decoder = {v: k for k, v in self.encoder.items()}
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
should_strip_spaces=should_strip_spaces,
**kwargs,
)
self.should_strip_spaces = should_strip_spaces
@property
def vocab_size(self):
return len(self.decoder)
def get_vocab(self):
"Returns vocab as a dict"
return dict(self.encoder, **self.added_tokens_encoder)
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
text = regex.sub(";", ",", text)
text = regex.sub(":", ",", text)
text = regex.sub("-", " ", text)
text = regex.sub("&", "and", text)
text = regex.sub(r"[\(\)\[\]\<\>\"]+", "", text)
text = regex.sub(r"\s+", " ", text)
text = text.upper()
return text, kwargs
def _tokenize(self, text):
"""Returns a tokenized string."""
tokens = self.g2p(text)
if self.should_strip_spaces:
tokens = list(filter(lambda s: s != " ", tokens))
tokens.append(self.eos_token)
return tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def decode(self, token_ids, **kwargs):
logger.warning(
"Phonemes cannot be reliably converted to a string due to the one-many mapping, converting to tokens instead."
)
return self.convert_ids_to_tokens(token_ids)
def convert_tokens_to_string(self, tokens, **kwargs):
logger.warning(
"Phonemes cannot be reliably converted to a string due to the one-many mapping, returning the tokens."
)
return tokens
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.get_vocab(), ensure_ascii=False))
return (vocab_file,)
def __getstate__(self):
state = self.__dict__.copy()
state["g2p"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import g2p_en
self.g2p = g2p_en.G2p()
except ImportError:
raise ImportError(
"You need to install g2p-en to use FastSpeech2ConformerTokenizer. "
"See https://pypi.org/project/g2p-en/ for installation."
)
.\models\fastspeech2_conformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_fastspeech2_conformer": [
"FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP",
"FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP",
"FastSpeech2ConformerConfig",
"FastSpeech2ConformerHifiGanConfig",
"FastSpeech2ConformerWithHifiGanConfig",
],
"tokenization_fastspeech2_conformer": ["FastSpeech2ConformerTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_fastspeech2_conformer"] = [
"FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"FastSpeech2ConformerWithHifiGan",
"FastSpeech2ConformerHifiGan",
"FastSpeech2ConformerModel",
"FastSpeech2ConformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_fastspeech2_conformer import (
FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP,
FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP,
FastSpeech2ConformerConfig,
FastSpeech2ConformerHifiGanConfig,
FastSpeech2ConformerWithHifiGanConfig,
)
from .tokenization_fastspeech2_conformer import FastSpeech2ConformerTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_fastspeech2_conformer import (
FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
FastSpeech2ConformerHifiGan,
FastSpeech2ConformerModel,
FastSpeech2ConformerPreTrainedModel,
FastSpeech2ConformerWithHifiGan,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
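As with the other `__init__.py` files in this series, `_LazyModule` defers the heavy imports until an attribute is first accessed; a quick way to see this:
```
import transformers.models.fastspeech2_conformer as fs2c

# The first attribute access imports configuration_fastspeech2_conformer behind the scenes
print(fs2c.FastSpeech2ConformerConfig)
# The tokenizer class resolves too; g2p_en is only needed once it is instantiated
print(fs2c.FastSpeech2ConformerTokenizer)
```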
.\models\flaubert\configuration_flaubert.py
""" Flaubert configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"flaubert/flaubert_small_cased": "https://huggingface.co/flaubert/flaubert_small_cased/resolve/main/config.json",
"flaubert/flaubert_base_uncased": "https://huggingface.co/flaubert/flaubert_base_uncased/resolve/main/config.json",
"flaubert/flaubert_base_cased": "https://huggingface.co/flaubert/flaubert_base_cased/resolve/main/config.json",
"flaubert/flaubert_large_cased": "https://huggingface.co/flaubert/flaubert_large_cased/resolve/main/config.json",
}
class FlaubertConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`FlaubertModel`] or a [`TFFlaubertModel`]. It is
used to instantiate a FlauBERT model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the FlauBERT
[flaubert/flaubert_base_uncased](https://huggingface.co/flaubert/flaubert_base_uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "flaubert"
attribute_map = {
"hidden_size": "emb_dim",
"num_attention_heads": "n_heads",
"num_hidden_layers": "n_layers",
"n_words": "vocab_size",
}
def __init__(
self,
pre_norm=False,
layerdrop=0.0,
vocab_size=30145,
emb_dim=2048,
n_layers=12,
n_heads=16,
dropout=0.1,
attention_dropout=0.1,
gelu_activation=True,
sinusoidal_embeddings=False,
causal=False,
asm=False,
n_langs=1,
use_lang_emb=True,
max_position_embeddings=512,
embed_init_std=2048**-0.5,
layer_norm_eps=1e-12,
init_std=0.02,
bos_index=0,
eos_index=1,
pad_index=2,
unk_index=3,
mask_index=5,
is_encoder=True,
summary_type="first",
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
start_n_top=5,
end_n_top=5,
mask_token_id=0,
lang_id=0,
pad_token_id=2,
bos_token_id=0,
**kwargs,
):
"""Constructs FlaubertConfig."""
self.pre_norm = pre_norm
self.layerdrop = layerdrop
self.vocab_size = vocab_size
self.emb_dim = emb_dim
self.n_layers = n_layers
self.n_heads = n_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.gelu_activation = gelu_activation
self.sinusoidal_embeddings = sinusoidal_embeddings
self.causal = causal
self.asm = asm
self.n_langs = n_langs
self.use_lang_emb = use_lang_emb
self.layer_norm_eps = layer_norm_eps
self.bos_index = bos_index
self.eos_index = eos_index
self.pad_index = pad_index
self.unk_index = unk_index
self.mask_index = mask_index
self.is_encoder = is_encoder
self.max_position_embeddings = max_position_embeddings
self.embed_init_std = embed_init_std
self.init_std = init_std
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_proj_to_labels = summary_proj_to_labels
self.summary_first_dropout = summary_first_dropout
self.start_n_top = start_n_top
self.end_n_top = end_n_top
self.mask_token_id = mask_token_id
self.lang_id = lang_id
if "n_words" in kwargs:
self.n_words = kwargs["n_words"]
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
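The `attribute_map` above lets the generic configuration names resolve to the Flaubert-specific ones; a short check:
```
from transformers import FlaubertConfig

config = FlaubertConfig(emb_dim=512, n_layers=6, n_heads=8)
print(config.hidden_size)          # 512, resolved to emb_dim through attribute_map
print(config.num_hidden_layers)    # 6, resolved to n_layers
print(config.num_attention_heads)  # 8, resolved to n_heads
```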
class FlaubertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
.\models\flaubert\modeling_flaubert.py
""" PyTorch Flaubert model, based on XLM."""
import itertools
import math
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import numpy as np
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import gelu
from ...modeling_outputs import (
BaseModelOutput,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_flaubert import FlaubertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased"
_CONFIG_FOR_DOC = "FlaubertConfig"
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"flaubert/flaubert_small_cased",
"flaubert/flaubert_base_uncased",
"flaubert/flaubert_base_cased",
"flaubert/flaubert_large_cased",
]
def create_sinusoidal_embeddings(n_pos, dim, out):
position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
out.requires_grad = False
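A small sketch of how `create_sinusoidal_embeddings` fills a position table in place (the sizes are arbitrary):
```
import torch

out = torch.empty(512, 64)
create_sinusoidal_embeddings(n_pos=512, dim=64, out=out)
print(out[0, :4])          # position 0: tensor([0., 1., 0., 1.]) - sin terms are 0, cos terms are 1
print(out.requires_grad)   # False, the table is fixed
```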
def get_masks(slen, lengths, causal, padding_mask=None):
"""
Generate hidden states mask, and optionally an attention mask.
"""
alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
if padding_mask is not None:
mask = padding_mask
else:
assert lengths.max().item() <= slen
mask = alen < lengths[:, None]
bs = lengths.size(0)
if causal:
attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
else:
attn_mask = mask
assert mask.size() == (bs, slen)
assert causal is False or attn_mask.size() == (bs, slen, slen)
return mask, attn_mask
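A quick look at the shapes `get_masks` produces for a padded batch of two sequences:
```
import torch

lengths = torch.tensor([5, 3])
mask, attn_mask = get_masks(slen=5, lengths=lengths, causal=True)
print(mask.shape)       # torch.Size([2, 5]), True where the position holds a real token
print(attn_mask.shape)  # torch.Size([2, 5, 5]), lower-triangular causal pattern per sequence
print(mask[1])          # tensor([ True,  True,  True, False, False])
```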
class MultiHeadAttention(nn.Module):
NEW_ID = itertools.count()
def __init__(self, n_heads, dim, config):
super().__init__()
self.layer_id = next(MultiHeadAttention.NEW_ID)
self.dim = dim
self.n_heads = n_heads
self.dropout = config.attention_dropout
assert self.dim % self.n_heads == 0
self.q_lin = nn.Linear(dim, dim)
self.k_lin = nn.Linear(dim, dim)
self.v_lin = nn.Linear(dim, dim)
self.out_lin = nn.Linear(dim, dim)
self.pruned_heads = set()
def prune_heads(self, heads):
attention_head_size = self.dim // self.n_heads
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads)
self.q_lin = prune_linear_layer(self.q_lin, index)
self.k_lin = prune_linear_layer(self.k_lin, index)
self.v_lin = prune_linear_layer(self.v_lin, index)
self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.dim = attention_head_size * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, input, mask, kv=None, cache=None, head_mask=None, output_attentions=False):
"""
Self-attention (if kv is None) or attention over source sentence (provided by kv).
"""
bs, qlen, dim = input.size()
if kv is None:
klen = qlen if cache is None else cache["slen"] + qlen
else:
klen = kv.size(1)
n_heads = self.n_heads
dim_per_head = self.dim // n_heads
mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
def shape(x):
"""对输入张量进行线性投影"""
return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
def unshape(x):
"""计算上下文信息"""
return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
q = shape(self.q_lin(input))
if kv is None:
k = shape(self.k_lin(input))
v = shape(self.v_lin(input))
elif cache is None or self.layer_id not in cache:
k = v = kv
k = shape(self.k_lin(k))
v = shape(self.v_lin(v))
if cache is not None:
if self.layer_id in cache:
if kv is None:
k_, v_ = cache[self.layer_id]
k = torch.cat([k_, k], dim=2)
v = torch.cat([v_, v], dim=2)
else:
k, v = cache[self.layer_id]
cache[self.layer_id] = (k, v)
q = q / math.sqrt(dim_per_head)
scores = torch.matmul(q, k.transpose(2, 3))
mask = (mask == 0).view(mask_reshape).expand_as(scores)
scores.masked_fill_(mask, torch.finfo(scores.dtype).min)
weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
weights = nn.functional.dropout(weights, p=self.dropout, training=self.training)
if head_mask is not None:
weights = weights * head_mask
context = torch.matmul(weights, v)
context = unshape(context)
outputs = (self.out_lin(context),)
if output_attentions:
outputs = outputs + (weights,)
return outputs
class TransformerFFN(nn.Module):
def __init__(self, in_dim, dim_hidden, out_dim, config):
super().__init__()
self.dropout = config.dropout
self.lin1 = nn.Linear(in_dim, dim_hidden)
self.lin2 = nn.Linear(dim_hidden, out_dim)
self.act = gelu if config.gelu_activation else nn.functional.relu
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
def forward(self, input):
return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
def ff_chunk(self, input):
x = self.lin1(input)
x = self.act(x)
x = self.lin2(x)
x = nn.functional.dropout(x, p=self.dropout, training=self.training)
return x
FLAUBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
FLAUBERT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
FLAUBERT_START_DOCSTRING,
)
class FlaubertPredLayer(nn.Module):
"""
Prediction layer (cross_entropy or adaptive_softmax).
"""
def __init__(self, config):
super().__init__()
self.asm = config.asm
self.n_words = config.n_words
self.pad_index = config.pad_index
dim = config.emb_dim
if config.asm is False:
self.proj = nn.Linear(dim, config.n_words, bias=True)
else:
self.proj = nn.AdaptiveLogSoftmaxWithLoss(
in_features=dim,
n_classes=config.n_words,
cutoffs=config.asm_cutoffs,
div_value=config.asm_div_value,
head_bias=True,
)
def forward(self, x, y=None):
"""Compute the loss, and optionally the scores."""
outputs = ()
if self.asm is False:
scores = self.proj(x)
outputs = (scores,) + outputs
if y is not None:
loss = nn.functional.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="mean")
outputs = (loss,) + outputs
else:
scores = self.proj.log_prob(x)
outputs = (scores,) + outputs
if y is not None:
_, loss = self.proj(x, y)
outputs = (loss,) + outputs
return outputs
class FlaubertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = FlaubertConfig
load_tf_weights = None
base_model_prefix = "transformer"
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
@property
def dummy_inputs(self):
inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
if self.config.use_lang_emb and self.config.n_langs > 1:
langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
else:
langs_list = None
return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
def _init_weights(self, module):
"""初始化模型的权重。"""
if isinstance(module, nn.Embedding):
if self.config is not None and self.config.embed_init_std is not None:
nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
if isinstance(module, nn.Linear):
if self.config is not None and self.config.init_std is not None:
nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
if module.bias is not None:
nn.init.constant_(module.bias, 0.0)
if isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class FlaubertModel(FlaubertPreTrainedModel):
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads)
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
lengths: Optional[torch.LongTensor] = None,
cache: Optional[Dict[str, torch.FloatTensor]] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
FLAUBERT_START_DOCSTRING,
)
class FlaubertWithLMHeadModel(FlaubertPreTrainedModel):
_tied_weights_keys = ["pred_layer.proj.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.pred_layer = FlaubertPredLayer(config)
self.post_init()
def get_output_embeddings(self):
return self.pred_layer.proj
def set_output_embeddings(self, new_embeddings):
self.pred_layer.proj = new_embeddings
def prepare_inputs_for_generation(self, input_ids, **kwargs):
mask_token_id = self.config.mask_token_id
lang_id = self.config.lang_id
effective_batch_size = input_ids.shape[0]
mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
input_ids = torch.cat([input_ids, mask_token], dim=1)
if lang_id is not None:
langs = torch.full_like(input_ids, lang_id)
else:
langs = None
return {"input_ids": input_ids, "langs": langs}
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<special1>",
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
outputs = self.pred_layer(output, labels)
if not return_dict:
return outputs + transformer_outputs[1:]
return MaskedLMOutput(
loss=outputs[0] if labels is not None else None,
logits=outputs[0] if labels is None else outputs[1],
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
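A hedged usage sketch for the LM head model above, predicting the `<special1>` mask token referenced in the docstring decorator (requires downloading the `flaubert/flaubert_base_cased` checkpoint and the tokenizer's `sacremoses` dependency):
```
from transformers import FlaubertTokenizer, FlaubertWithLMHeadModel

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = FlaubertWithLMHeadModel.from_pretrained("flaubert/flaubert_base_cased")

inputs = tokenizer("Le camembert est <special1>.", return_tensors="pt")
logits = model(**inputs).logits
mask_positions = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_id = logits[0, mask_positions[0]].argmax(-1).item()
print(tokenizer.decode([predicted_id]))
```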
@add_start_docstrings(
"""
Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
e.g. for GLUE tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class FlaubertForSequenceClassification(FlaubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.transformer = FlaubertModel(config)
self.sequence_summary = SequenceSummary(config)
self.post_init()
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
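A tiny randomly initialized configuration is enough to exercise the `problem_type` branch above; the sizes below are arbitrary and chosen only to keep the model small:
```
import torch
from transformers import FlaubertConfig, FlaubertForSequenceClassification

tiny = FlaubertConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=2, num_labels=3)
model = FlaubertForSequenceClassification(tiny)   # randomly initialized
out = model(input_ids=torch.tensor([[0, 7, 6, 1]]), labels=torch.tensor([2]))
print(tiny.problem_type)   # "single_label_classification", inferred from the integer labels
print(out.loss)            # cross-entropy loss over the 3 classes
```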
@add_start_docstrings(
"""
Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class FlaubertForTokenClassification(FlaubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = FlaubertModel(config)
self.dropout = nn.Dropout(config.dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAUBERT_START_DOCSTRING,
)
class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks like
SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAUBERT_START_DOCSTRING,
)
@dataclass
class FlaubertForQuestionAnsweringOutput(ModelOutput):
"""
Base class for outputs of question answering models using a `SquadHead`.
"""
class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
分类损失,作为开始标记、结束标记(如果提供的话还包括is_impossible)分类损失的总和。
start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
开始标记的前config.start_n_top个可能性的对数概率(beam-search)。
start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
开始标记的前config.start_n_top个可能性的索引(beam-search)。
end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
结束标记的前config.start_n_top * config.end_n_top个可能性的对数概率(beam-search)。
end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
结束标记的前config.start_n_top * config.end_n_top个可能性的索引(beam-search)。
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
答案的`is_impossible`标签的对数概率。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
模型每一层的隐藏状态的元组,包括每层的输出和初始嵌入的输出,形状为 `(batch_size, sequence_length, hidden_size)`。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
注意力权重的元组,每层一个,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,用于计算自注意力头中的加权平均值。
class FlaubertForQuestionAnswering(FlaubertPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.qa_outputs = SQuADHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=FlaubertForQuestionAnsweringOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
is_impossible: Optional[torch.Tensor] = None,
cls_index: Optional[torch.Tensor] = None,
p_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass for FlaubertForQuestionAnswering model.
Args:
input_ids (torch.Tensor, optional): Indices of input sequence tokens.
attention_mask (torch.Tensor, optional): Mask to avoid performing attention on padding tokens.
langs (torch.Tensor, optional): Language IDs for multi-lingual models like XLM.
token_type_ids (torch.Tensor, optional): Segment token indices to indicate first and second portions of the inputs.
position_ids (torch.Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings.
lengths (torch.Tensor, optional): Lengths of each sequence to handle padded inputs.
cache (Dict[str, torch.Tensor], optional): Dictionary with precomputed hidden states.
head_mask (torch.Tensor, optional): Mask to nullify selected heads of the self-attention modules.
inputs_embeds (torch.Tensor, optional): Embedded representation of the inputs.
start_positions (torch.Tensor, optional): Start position for the answer span.
end_positions (torch.Tensor, optional): End position for the answer span.
is_impossible (torch.Tensor, optional): Whether the question has no possible answer.
cls_index (torch.Tensor, optional): Position of the classification token in the input sequence.
p_mask (torch.Tensor, optional): Mask of tokens which can't be in the answer.
output_attentions (bool, optional): Whether to output attentions weights.
output_hidden_states (bool, optional): Whether to output all hidden-states.
return_dict (bool, optional): Whether to return a single dictionary instead of a tuple of outputs.
Returns:
Union[FlaubertForQuestionAnsweringOutput, Tuple[torch.Tensor]]:
Depending on `return_dict`, either a dictionary with main outputs or a tuple of outputs.
"""
pass
class FlaubertForMultipleChoice(FlaubertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = FlaubertModel(config)
self.sequence_summary = SequenceSummary(config)
self.logits_proj = nn.Linear(config.num_labels, 1)
self.post_init()
@add_start_docstrings_to_model_forward(
FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass for FlaubertForMultipleChoice model.
Args:
input_ids (torch.Tensor, optional): Indices of input sequence tokens.
attention_mask (torch.Tensor, optional): Mask to avoid performing attention on padding tokens.
langs (torch.Tensor, optional): Language IDs for multi-lingual models like XLM.
token_type_ids (torch.Tensor, optional): Segment token indices to indicate first and second portions of the inputs.
position_ids (torch.Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings.
lengths (torch.Tensor, optional): Lengths of each sequence to handle padded inputs.
cache (Dict[str, torch.Tensor], optional): Dictionary with precomputed hidden states.
head_mask (torch.Tensor, optional): Mask to nullify selected heads of the self-attention modules.
inputs_embeds (torch.Tensor, optional): Embedded representation of the inputs.
labels (torch.Tensor, optional): Labels for computing the multiple choice classification loss.
output_attentions (bool, optional): Whether to output attentions weights.
output_hidden_states (bool, optional): Whether to output all hidden-states.
return_dict (bool, optional): Whether to return a single dictionary instead of a tuple of outputs.
Returns:
Union[MultipleChoiceModelOutput, Tuple[torch.Tensor]]:
Depending on `return_dict`, either a dictionary with main outputs or a tuple of outputs.
"""
pass
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
langs: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
lengths: Optional[torch.Tensor] = None,
cache: Optional[Dict[str, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
langs = langs.view(-1, langs.size(-1)) if langs is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
if lengths is not None:
logger.warning(
"The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
"attention mask instead."
)
lengths = None
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
.\models\flaubert\modeling_tf_flaubert.py
import itertools
import random
import warnings
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFSequenceSummary,
TFSharedEmbeddings,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import (
check_embeddings_within_bounds,
shape_list,
stable_softmax,
)
from ...utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_flaubert import FlaubertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased"
_CONFIG_FOR_DOC = "FlaubertConfig"
TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
FLAUBERT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
</Tip>
Parameters:
config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
FLAUBERT_INPUTS_DOCSTRING = r"""
"""
def get_masks(slen, lengths, causal, padding_mask=None):
"""
Generate hidden states mask, and optionally an attention mask.
"""
bs = shape_list(lengths)[0]
if padding_mask is not None:
mask = padding_mask
else:
alen = tf.range(slen, dtype=lengths.dtype)
mask = alen < tf.expand_dims(lengths, axis=1)
if causal:
attn_mask = tf.less_equal(
tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
)
else:
attn_mask = mask
tf.debugging.assert_equal(shape_list(mask), [bs, slen])
if causal:
tf.debugging.assert_equal(shape_list(attn_mask), [bs, slen, slen])
return mask, attn_mask
class TFFlaubertPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = FlaubertConfig
base_model_prefix = "transformer"
@property
def dummy_inputs(self):
inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]], dtype=tf.int32)
attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32)
if self.config.use_lang_emb and self.config.n_langs > 1:
return {
"input_ids": inputs_list,
"attention_mask": attns_list,
"langs": tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32),
}
else:
return {"input_ids": inputs_list, "attention_mask": attns_list}
@add_start_docstrings(
"The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertModel(TFFlaubertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: np.ndarray | tf.Tensor | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutput]:
outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
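The TensorFlow counterpart mirrors the PyTorch usage; a hedged sketch (requires TensorFlow, and `from_pt=True` may be needed if only PyTorch weights are published for the checkpoint):
```
from transformers import FlaubertTokenizer, TFFlaubertModel

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertModel.from_pretrained("flaubert/flaubert_base_cased")  # add from_pt=True if no TF weights exist
inputs = tokenizer("Bonjour le monde !", return_tensors="tf")
outputs = model(inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, emb_dim)
```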
class TFFlaubertMultiHeadAttention(keras.layers.Layer):
NEW_ID = itertools.count()
def __init__(self, n_heads, dim, config, **kwargs):
super().__init__(**kwargs)
self.layer_id = next(TFFlaubertMultiHeadAttention.NEW_ID)
self.dim = dim
self.n_heads = n_heads
self.output_attentions = config.output_attentions
assert self.dim % self.n_heads == 0
self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin")
self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin")
self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
self.dropout = keras.layers.Dropout(config.attention_dropout)
self.pruned_heads = set()
self.dim = dim
def prune_heads(self, heads):
raise NotImplementedError
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_lin", None) is not None:
with tf.name_scope(self.q_lin.name):
self.q_lin.build([None, None, self.dim])
if getattr(self, "k_lin", None) is not None:
with tf.name_scope(self.k_lin.name):
self.k_lin.build([None, None, self.dim])
if getattr(self, "v_lin", None) is not None:
with tf.name_scope(self.v_lin.name):
self.v_lin.build([None, None, self.dim])
if getattr(self, "out_lin", None) is not None:
with tf.name_scope(self.out_lin.name):
self.out_lin.build([None, None, self.dim])
class TFFlaubertTransformerFFN(keras.layers.Layer):
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
super().__init__(**kwargs)
self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu")
self.dropout = keras.layers.Dropout(config.dropout)
self.in_dim = in_dim
self.dim_hidden = dim_hidden
def call(self, input, training=False):
x = self.lin1(input)
x = self.act(x)
x = self.lin2(x)
x = self.dropout(x, training=training)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.in_dim])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.dim_hidden])
@keras_serializable
class TFFlaubertMainLayer(keras.layers.Layer):
config_class = FlaubertConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.n_heads = config.n_heads
self.n_langs = config.n_langs
self.dim = config.emb_dim
self.hidden_dim = self.dim * 4
self.n_words = config.n_words
self.pad_index = config.pad_index
self.causal = config.causal
self.n_layers = config.n_layers
self.use_lang_emb = config.use_lang_emb
self.layerdrop = getattr(config, "layerdrop", 0.0)
self.pre_norm = getattr(config, "pre_norm", False)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.max_position_embeddings = config.max_position_embeddings
self.embed_init_std = config.embed_init_std
self.dropout = keras.layers.Dropout(config.dropout)
self.embeddings = TFSharedEmbeddings(
self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings"
)
self.layer_norm_emb = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb")
self.attentions = []
self.layer_norm1 = []
self.ffns = []
self.layer_norm2 = []
for i in range(self.n_layers):
self.attentions.append(
TFFlaubertMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}")
)
self.layer_norm1.append(
keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}")
)
self.ffns.append(
TFFlaubertTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}")
)
self.layer_norm2.append(
keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}")
)
def build(self, input_shape=None):
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.dim],
initializer=get_initializer(self.embed_init_std),
)
if self.n_langs > 1 and self.use_lang_emb:
with tf.name_scope("lang_embeddings"):
self.lang_embeddings = self.add_weight(
name="embeddings",
shape=[self.n_langs, self.dim],
initializer=get_initializer(self.embed_init_std),
)
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "layer_norm_emb", None) is not None:
with tf.name_scope(self.layer_norm_emb.name):
self.layer_norm_emb.build([None, None, self.dim])
for layer in self.attentions:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm1:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
for layer in self.ffns:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.layer_norm2:
with tf.name_scope(layer.name):
layer.build([None, None, self.dim])
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, value):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
@unpack_inputs
def call(
self,
input_ids: np.ndarray | tf.Tensor | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFBaseModelOutput]:
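The body of `TFFlaubertMainLayer.call` is not reproduced in this excerpt. As a hedged sketch of how the position (and optional language) embedding tables created in `build()` are typically combined with the token embeddings in XLM/Flaubert-style models, before layer norm, dropout and the attention blocks (an approximation, not the verbatim implementation):
```
import tensorflow as tf

def combine_embeddings(token_emb, position_table, position_ids, lang_table=None, lang_ids=None):
    # token_emb: (batch, seq_len, dim); position_table: (max_position_embeddings, dim)
    hidden = token_emb + tf.gather(position_table, position_ids)
    if lang_table is not None and lang_ids is not None:
        hidden = hidden + tf.gather(lang_table, lang_ids)
    return hidden  # layer norm and dropout would be applied next
```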
class TFFlaubertPredLayer(keras.layers.Layer):
"""
Prediction layer (cross_entropy or adaptive_softmax).
"""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.asm = config.asm
self.n_words = config.n_words
self.pad_index = config.pad_index
if config.asm is False:
self.input_embeddings = input_embeddings
else:
raise NotImplementedError
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def get_output_embeddings(self):
return self.input_embeddings
def set_output_embeddings(self, value):
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
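The prediction layer ties the output projection to the input embedding matrix: `mode="linear"` multiplies the hidden states by the transposed embedding table, and a per-token bias is added. A hedged sketch of that computation with made-up shapes:
```
import tensorflow as tf

hidden = tf.random.normal((2, 5, 16))           # (batch, seq_len, dim)
embedding_matrix = tf.random.normal((100, 16))  # (n_words, dim), shared with the input embeddings
bias = tf.zeros((100,))
logits = tf.matmul(hidden, embedding_matrix, transpose_b=True) + bias
print(logits.shape)                             # (2, 5, 100)
```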
@dataclass
class TFFlaubertWithLMHeadModelOutput(ModelOutput):
"""
Base class for [`TFFlaubertWithLMHeadModel`] outputs.
Args:
logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
"""
The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
"""
@add_start_docstrings(
"""
The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertWithLMHeadModel(TFFlaubertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.pred_layer = TFFlaubertPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
self.supports_xla_generation = False
def get_lm_head(self):
return self.pred_layer
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.pred_layer.name
def prepare_inputs_for_generation(self, inputs, **kwargs):
mask_token_id = self.config.mask_token_id
lang_id = self.config.lang_id
effective_batch_size = inputs.shape[0]
mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id
inputs = tf.concat([inputs, mask_token], axis=1)
if lang_id is not None:
langs = tf.ones_like(inputs) * lang_id
else:
langs = None
return {"input_ids": inputs, "langs": langs}
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFFlaubertWithLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: np.ndarray | tf.Tensor | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[Tuple, TFFlaubertWithLMHeadModelOutput]:
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
outputs = self.pred_layer(output)
if not return_dict:
return (outputs,) + transformer_outputs[1:]
return TFFlaubertWithLMHeadModelOutput(
logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "pred_layer", None) is not None:
with tf.name_scope(self.pred_layer.name):
self.pred_layer.build(None)
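A hedged usage sketch of the LM-head model defined above. The checkpoint name is an assumption (any Flaubert checkpoint on the Hub should work) and loading it requires network access:
```
from transformers import FlaubertTokenizer, TFFlaubertWithLMHeadModel

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertWithLMHeadModel.from_pretrained("flaubert/flaubert_base_cased")  # add from_pt=True if only PyTorch weights exist

inputs = tokenizer("Le chat mange une pomme.", return_tensors="tf")
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)
```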
@add_start_docstrings(
"""
Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
e.g. for GLUE tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForSequenceClassification(TFFlaubertPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
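A hedged sketch of sequence classification with the class above; the checkpoint name and `num_labels=2` are assumptions, and the classification head is freshly initialized, so the logits are only illustrative:
```
import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertForSequenceClassification

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertForSequenceClassification.from_pretrained("flaubert/flaubert_base_cased", num_labels=2)

inputs = tokenizer("Ce film est excellent.", return_tensors="tf")
outputs = model(**inputs, labels=tf.constant([1]))
print(outputs.loss, outputs.logits.shape)  # scalar loss, (1, 2)
```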
@add_start_docstrings(
"""
Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForQuestionAnsweringSimple(TFFlaubertPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = transformer_outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
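A hedged sketch of extractive QA with the span logits produced above. The base checkpoint is not fine-tuned for SQuAD-style QA, so the decoded span is only illustrative:
```
import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertForQuestionAnsweringSimple

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertForQuestionAnsweringSimple.from_pretrained("flaubert/flaubert_base_cased")

inputs = tokenizer("Qui a écrit Les Misérables ?", "Victor Hugo a écrit Les Misérables.", return_tensors="tf")
outputs = model(**inputs)
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))
```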
@add_start_docstrings(
"""
Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForTokenClassification(TFFlaubertPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.dropout = keras.layers.Dropout(config.dropout)
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
transformer_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = transformer_outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
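A hedged sketch of token classification (NER-style tagging) with the class above; `num_labels=5` is an arbitrary assumption and the head is untrained:
```
import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertForTokenClassification

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertForTokenClassification.from_pretrained("flaubert/flaubert_base_cased", num_labels=5)

inputs = tokenizer("Emmanuel Macron visite Lyon.", return_tensors="tf")
logits = model(**inputs).logits
print(logits.shape)                       # (1, sequence_length, 5)
predictions = tf.argmax(logits, axis=-1)  # one label id per token
```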
@add_start_docstrings(
"""
Flaubert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForMultipleChoice(TFFlaubertPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
self.logits_proj = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
self.config = config
@property
def dummy_inputs(self):
"""
Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
"""
if self.config.use_lang_emb and self.config.n_langs > 1:
return {
"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
"langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
}
else:
return {
"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32),
}
@unpack_inputs
@add_start_docstrings_to_model_forward(
FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
langs: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
lengths: np.ndarray | tf.Tensor | None = None,
cache: Optional[Dict[str, tf.Tensor]] = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None
flat_inputs_embeds = (
tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
if lengths is not None:
logger.warning(
"The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
"attention mask instead.",
)
lengths = None
transformer_outputs = self.transformer(
flat_input_ids,
flat_attention_mask,
flat_langs,
flat_token_type_ids,
flat_position_ids,
lengths,
cache,
head_mask,
flat_inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "sequence_summary", None) is not None:
with tf.name_scope(self.sequence_summary.name):
self.sequence_summary.build(None)
if getattr(self, "logits_proj", None) is not None:
with tf.name_scope(self.logits_proj.name):
self.logits_proj.build([None, None, self.config.num_labels])
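A hedged sketch of the multiple-choice input layout expected above: inputs are shaped `(batch_size, num_choices, seq_len)`, which is why the call flattens them before running the transformer. The checkpoint name and the toy prompt/choices are assumptions:
```
import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertForMultipleChoice

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = TFFlaubertForMultipleChoice.from_pretrained("flaubert/flaubert_base_cased")

prompt = "Le chat est monté"
choices = ["sur le toit.", "dans la mer."]
enc = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)
inputs = {k: tf.expand_dims(v, 0) for k, v in enc.items()}  # add the num_choices axis
print(model(**inputs).logits.shape)  # (1, 2): one score per choice
```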
.\models\flaubert\tokenization_flaubert.py
def convert_to_unicode(text):
"""
Converts `text` to Unicode (if it's not already), assuming UTF-8 input.
"""
def ensure_text(s, encoding="utf-8", errors="strict"):
if isinstance(s, bytes):
return s.decode(encoding, errors)
elif isinstance(s, str):
return s
else:
raise TypeError(f"not expecting type '{type(s)}'")
return ensure_text(text, encoding="utf-8", errors="ignore")
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
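For instance, calling the helper above on a word that has already been split into symbols:
```
print(get_pairs(("l", "o", "w", "</w>")))
# {('l', 'o'), ('o', 'w'), ('w', '</w>')}  (set order may vary)
```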
def replace_unicode_punct(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
"""
text = text.replace(",", ",")
text = re.sub(r"。\s*", ". ", text)
text = text.replace("、", ",")
text = text.replace("”", '"')
text = text.replace("“", '"')
text = text.replace("∶", ":")
text = text.replace(":", ":")
text = text.replace("?", "?")
text = text.replace("《", '"')
text = text.replace("》", '"')
text = text.replace(")", ")")
text = text.replace("!", "!")
text = text.replace("(", "(")
text = text.replace(";", ";")
text = text.replace("1", "1")
text = text.replace("」", '"')
text = text.replace("「", '"')
text = text.replace("0", "0")
text = text.replace("3", "3")
text = text.replace("2", "2")
text = text.replace("5", "5")
text = text.replace("6", "6")
text = text.replace("9", "9")
text = text.replace("7", "7")
text = text.replace("8", "8")
text = text.replace("4", "4")
text = re.sub(r".\s*", ". ", text)
text = text.replace("~", "~")
text = text.replace("’", "'")
text = text.replace("…", "...")
text = text.replace("━", "-")
text = text.replace("〈", "<")
text = text.replace("〉", ">")
text = text.replace("【", "[")
text = text.replace("】", "]")
text = text.replace("%", "%")
return text
def remove_non_printing_char(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
"""
output = []
for char in text:
cat = unicodedata.category(char)
if cat.startswith("C"):
continue
output.append(char)
return "".join(output)
class FlaubertTokenizer(PreTrainedTokenizer):
"""
Construct a Flaubert tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
- Moses preprocessing and tokenization.
- Normalizing all inputs text.
- The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like
"__classify__") to a vocabulary.
- The argument `do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Vocabulary file. # path to the vocabulary file
merges_file (`str`):
Merges file. # path to the BPE merges file
do_lowercase (`bool`, *optional*, defaults to `False`):
Controls lower casing. # boolean controlling whether the input is lowercased; defaults to False
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. # token used for words missing from the vocabulary; defaults to "<unk>"
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"</s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"<special1>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (`List[str]`, *optional*, defaults to `['<special0>', '<special1>', '<special2>', '<special3>', '<special4>', '<special5>', '<special6>', '<special7>', '<special8>', '<special9>']`):
List of additional special tokens.
lang2id (`Dict[str, int]`, *optional*):
Dictionary mapping languages string identifiers to their IDs.
id2lang (`Dict[int, str]`, *optional*):
Dictionary mapping language IDs to their string identifiers.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
do_lowercase=False,
unk_token="<unk>",
bos_token="<s>",
sep_token="</s>",
pad_token="<pad>",
cls_token="</s>",
mask_token="<special1>",
additional_special_tokens=[
"<special0>",
"<special1>",
"<special2>",
"<special3>",
"<special4>",
"<special5>",
"<special6>",
"<special7>",
"<special8>",
"<special9>",
],
lang2id=None,
id2lang=None,
**kwargs,
):
do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None)
if do_lowercase_and_remove_accent is not None:
logger.warning(
"`do_lowercase_and_remove_accent` is passed as a keyword argument, but this won't do anything."
" `FlaubertTokenizer` will always set it to `False`."
)
self.do_lowercase_and_remove_accent = False
self.do_lowercase = do_lowercase
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use FlaubertTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses
self.cache_moses_punct_normalizer = {}
self.cache_moses_tokenizer = {}
self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
self.lang2id = lang2id
self.id2lang = id2lang
if lang2id is not None and id2lang is not None:
assert len(lang2id) == len(id2lang)
self.ja_word_tokenizer = None
self.zh_word_tokenizer = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
lang2id=lang2id,
id2lang=id2lang,
**kwargs,
)
@property
def do_lower_case(self):
return self.do_lowercase_and_remove_accent
def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
self.cache_moses_punct_normalizer[lang] = punct_normalizer
else:
punct_normalizer = self.cache_moses_punct_normalizer[lang]
return punct_normalizer.normalize(text)
def moses_tokenize(self, text, lang):
if lang not in self.cache_moses_tokenizer:
moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
self.cache_moses_tokenizer[lang] = moses_tokenizer
else:
moses_tokenizer = self.cache_moses_tokenizer[lang]
return moses_tokenizer.tokenize(text, return_str=False, escape=False)
def moses_pipeline(self, text, lang):
text = replace_unicode_punct(text)
text = self.moses_punct_norm(text, lang)
text = remove_non_printing_char(text)
return text
def ja_tokenize(self, text):
if self.ja_word_tokenizer is None:
try:
import Mykytea
self.ja_word_tokenizer = Mykytea.Mykytea(
f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
)
except (AttributeError, ImportError):
logger.error(
"Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
" (https://github.com/chezou/Mykytea-python) with the following steps"
)
logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
logger.error("2. autoreconf -i")
logger.error("3. ./configure --prefix=$HOME/local")
logger.error("4. make && make install")
logger.error("5. pip install kytea")
raise
return list(self.ja_word_tokenizer.getWS(text))
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + "</w>",)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
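A minimal standalone sketch of the greedy merge loop implemented by `bpe` above, using a hypothetical two-rule merge table rather than the real Flaubert merges file:
```
def toy_bpe(token, bpe_ranks):
    word = tuple(token[:-1]) + (token[-1] + "</w>",)
    while True:
        pairs = {(word[i], word[i + 1]) for i in range(len(word) - 1)}
        ranked = [p for p in pairs if p in bpe_ranks]
        if not ranked:
            break
        first, second = min(ranked, key=bpe_ranks.get)
        merged, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
                merged.append(first + second)  # apply the best-ranked merge
                i += 2
            else:
                merged.append(word[i])
                i += 1
        word = tuple(merged)
    return " ".join(word)

print(toy_bpe("low", {("l", "o"): 0, ("lo", "w</w>"): 1}))  # low</w>
```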
def preprocess_text(self, text):
text = text.replace("``", '"').replace("''", '"')
text = convert_to_unicode(text)
text = unicodedata.normalize("NFC", text)
if self.do_lowercase:
text = text.lower()
return text
def _tokenize(self, text, bypass_tokenizer=False):
"""
Tokenize a string given language code using Moses.
Details of tokenization:
- [sacremoses](https://github.com/alvations/sacremoses): port of Moses
- Install with `pip install sacremoses`
Args:
- bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
(bool). If True, we only apply BPE.
Returns:
List of tokens.
"""
lang = "fr"
if lang and self.lang2id and lang not in self.lang2id:
logger.error(
"Supplied language code not found in lang2id mapping. Please check that your language is supported by"
" the loaded pretrained model."
)
if bypass_tokenizer:
text = text.split()
else:
text = self.preprocess_text(text)
text = self.moses_pipeline(text, lang=lang)
text = self.moses_tokenize(text, lang=lang)
split_tokens = []
for token in text:
if token:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
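A hedged end-to-end sketch of `_tokenize`: Moses preprocessing followed by BPE, producing subwords that end in "</w>" at word boundaries. It requires `pip install sacremoses`, and the checkpoint name is an assumption:
```
from transformers import FlaubertTokenizer

tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
print(tokenizer.tokenize("Bonjour, comment allez-vous ?"))
ids = tokenizer("Bonjour !")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))  # "<s>" ... "</s>" wrapped around the subwords
```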
def _convert_token_to_id(self, token):
"""Converts a token (str) into an id using the vocabulary."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings) into a single string."""
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. An XLM sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens added.
"""
bos = [self.bos_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return bos + token_ids_0 + sep
return bos + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
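A worked example tying the three special-token helpers above together, with placeholder IDs (assume `bos_token_id == 0` and `sep_token_id == cls_token_id == 1` purely for readability):
```
# With token_ids_0 = [5, 6] and token_ids_1 = [7, 8]:
# build_inputs_with_special_tokens([5, 6], [7, 8])      -> [0, 5, 6, 1, 7, 8, 1]
# get_special_tokens_mask([5, 6], [7, 8])               -> [1, 0, 0, 1, 0, 0, 1]
# create_token_type_ids_from_sequences([5, 6], [7, 8])  -> [0, 0, 0, 0, 1, 1, 1]
```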
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def __getstate__(self):
state = self.__dict__.copy()
state["sm"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.sm = sacremoses