Transformers 源码解析(一百)
.\models\seamless_m4t\convert_fairseq2_to_hf.py
""" 将 Meta SeamlessM4T 检查点从 seamless_communication 转换为 HF."""
import argparse
import os
from pathlib import Path
import torch
from accelerate.utils.modeling import find_tied_parameters
from seamless_communication.models.inference.translator import Translator
from transformers import (
SeamlessM4TConfig,
SeamlessM4TFeatureExtractor,
SeamlessM4TModel,
SeamlessM4TProcessor,
SeamlessM4TTokenizer,
)
from transformers.utils import logging
UNIT_SUPPORTED_LANGUAGES = [
"__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__",
"__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__",
"__kan__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__",
"__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tam__", "__tel__", "__tgl__",
"__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__",
]
VOCODER_SUPPORTED_LANGUAGES = [
"__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__",
"__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__",
"__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__",
"__slk__", "__spa__", "__swe__", "__swh__", "__tel__", "__tgl__", "__tha__", "__tur__",
"__ukr__", "__urd__", "__uzn__", "__vie__",
]
MEDIUM_SUPPORTED_LANGUAGES = ["ace","ace_Latn","acm","acq","aeb","afr","ajp","aka","amh","apc","arb","ars","ary","arz","asm","ast","awa","ayr","azb","azj","bak","bam","ban","bel","bem","ben","bho","bjn","bjn_Latn","bod","bos","bug","bul","cat","ceb","ces","cjk","ckb","crh","cym","dan","deu","dik","dyu","dzo","ell","eng","epo","est","eus","ewe","fao","pes","fij","fin","fon","fra","fur","fuv","gla","gle","glg","grn","guj","hat","hau","heb","hin","hne","hrv","hun","hye","ibo","ilo","ind","isl","ita","jav","jpn","kab","kac","kam","kan","kas","kas_Deva","kat","knc","knc_Latn","kaz","kbp","kea","khm","kik","kin","kir","kmb","kon","kor","kmr","lao","lvs","lij","lim","lin","lit","lmo","ltg","ltz","lua","lug","luo","lus","mag","mai","mal","mar","min","mkd","plt","mlt","mni","khk","mos","mri","zsm","mya","nld","nno","nob","npi","nso","nus","nya","oci","gaz","ory","pag","pan","pap","pol","por","prs","pbt","quy","ron","run","rus","sag","san","sat","scn","shn","sin","slk","slv","smo","sna","snd","som","sot","spa","als","srd","srp","ssw","sun","swe","swh","szl","tam","tat","tel","tgk","tgl","tha","tir","taq","taq_Tfng","tpi","tsn","tso","tuk","tum","tur","twi","tzm","uig","ukr","umb","urd","uzn","vec","vie","war","wol","xho","ydd","yor","yue","cmn","cmn_Hant","zul",]
LARGE_SUPPORTED_LANGUAGES = ["afr","amh","arb","ary","arz","asm","azj","bel","ben","bos","bul","cat","ceb","ces","ckb","cmn","cmn_Hant","cym","dan","deu","ell","eng","est","eus","fin","fra","fuv","gaz","gle","glg","guj","heb","hin","hrv","hun","hye","ibo","ind","isl","ita","jav","jpn","kan","kat","kaz","khk","khm","kir","kor","lao","lit","lug","luo","lvs","mai","mal","mar","mkd","mlt","mni","mya","nld","nno","nob","npi","nya","ory","pan","pbt","pes","pol","por","ron","rus","sat","slk","slv","sna","snd","som","spa","srp","swe","swh","tam","tel","tgk","tgl","tha","tur","ukr","urd","uzn","vie","yor","yue","zlm","zul",]
def assert_param_count(model_1, model_2):
count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0])
count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0])
assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"
def param_count(model):
return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])
def _grab_best_device(use_gpu=True):
if torch.cuda.device_count() > 0 and use_gpu:
device = "cuda"
else:
device = "cpu"
return torch.device(device)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
vocoder_convert_list = [
("ups", "hifi_gan.upsampler"),
("conv_pre", "hifi_gan.conv_pre"),
("resblocks", "hifi_gan.resblocks"),
("conv_post", "hifi_gan.conv_post"),
("lang", "language_embedding"),
("spkr", "speaker_embedding"),
("dict.", "unit_embedding."),
("dur_predictor.conv1.0", "dur_predictor.conv1"),
]
("dur_predictor.conv2.0", "dur_predictor.conv2"),
wav2vec_convert_list = [
("speech_encoder_frontend.model_dim_proj", "feature_projection.projection"),
("speech_encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"),
("speech_encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"),
("speech_encoder.inner.layers", "encoder.layers"),
("speech_encoder.inner_layer_norm", "encoder.layer_norm"),
("speech_encoder.adaptor_layers", "adapter.layers"),
("inner_proj", "intermediate_dense"),
("self_attn.output_proj", "self_attn.linear_out"),
("output_proj", "output_dense"),
("self_attn.k_proj", "self_attn.linear_k"),
("self_attn.v_proj", "self_attn.linear_v"),
("self_attn.q_proj", "self_attn.linear_q"),
("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"),
("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"),
("self_attn.sdpa.r_proj", "self_attn.linear_pos"),
("conv.pointwise_conv1", "conv_module.pointwise_conv1"),
("conv.pointwise_conv2", "conv_module.pointwise_conv2"),
("conv.depthwise_conv", "conv_module.depthwise_conv"),
("conv.batch_norm", "conv_module.batch_norm"),
("conv_layer_norm", "conv_module.layer_norm"),
("speech_encoder.proj1", "intermediate_ffn.intermediate_dense"),
("speech_encoder.proj2", "intermediate_ffn.output_dense"),
("speech_encoder.layer_norm", "inner_layer_norm"),
]
t2u_convert_list = [
("t2u_model.final_proj", "lm_head"),
("t2u_model.", "model."),
("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
("encoder_decoder_attn", "cross_attention"),
("linear_k", "k_proj"),
("linear_v", "v_proj"),
("linear_q", "q_proj"),
("ffn.inner_proj", "ffn.fc1"),
("ffn.output_proj", "ffn.fc2"),
("output_proj", "out_proj"),
("decoder_frontend.embed", "decoder.embed_tokens"),
]
text_convert_list = [
("text_encoder.", ""),
("text_decoder.", ""),
("text_encoder_frontend.embed", "embed_tokens"),
("text_decoder_frontend.embed", "embed_tokens"),
("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
("encoder_decoder_attn", "cross_attention"),
("linear_k", "k_proj"),
("linear_v", "v_proj"),
("linear_q", "q_proj"),
("ffn.inner_proj", "ffn.fc1"),
("ffn.output_proj", "ffn.fc2"),
("output_proj", "out_proj"),
("final_proj", "lm_head"),
]
CUR_PATH = os.path.dirname(os.path.abspath(__file__))
default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "huggingface", "hub")
def _load_hf_config(model_type="medium"):
kwargs = {
"vocab_size": 256206,
"t2u_vocab_size": 10082,
"hidden_size": 1024,
"max_position_embeddings": 4096,
"encoder_layers": 12,
"decoder_layers": 12,
"encoder_ffn_dim": 4096,
"decoder_ffn_dim": 4096,
"t2u_encoder_layers": 4,
"t2u_decoder_layers": 4,
"speech_encoder_layers": 12,
}
return SeamlessM4TConfig(**kwargs)
def _convert_model(
original_model,
hf_model,
convert_list,
device,
unwanted_prefix="model.",
filter_state_dict="speech",
exclude_state_dict=None,
):
state_dict = original_model.state_dict()
if isinstance(filter_state_dict, str):
def filter_func(x):
return filter_state_dict in x[0]
else:
def filter_func(item):
if exclude_state_dict is not None and exclude_state_dict in item[0]:
return False
for filter_el in filter_state_dict:
if filter_el in item[0]:
return True
return False
state_dict = dict(filter(filter_func, state_dict.items()))
for k, v in list(state_dict.items()):
new_k = k[len(unwanted_prefix):]
for old_layer_name, new_layer_name in convert_list:
if old_layer_name in new_k:
new_k = new_k.replace(old_layer_name, new_layer_name)
if ".layer_norm" in new_k and new_k.split(".layer_norm")[0][-1].isnumeric():
new_k = new_k.replace("layer_norm", "final_layer_norm")
state_dict[new_k] = state_dict.pop(k)
extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
extra_keys = set(extra_keys)
missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
missing_keys = set({k for k in missing_keys if "final_logits_bias" not in k})
if len(extra_keys) != 0:
raise ValueError(f"extra keys found: {extra_keys}")
if len(missing_keys) != 0:
raise ValueError(f"missing keys: {missing_keys}")
hf_model.load_state_dict(state_dict, strict=False)
n_params = param_count(hf_model)
logger.info(f"model loaded: {round(n_params/1e6,1)}M params")
hf_model.eval()
hf_model.to(device)
del state_dict
return hf_model
def load_model(save_dir, model_type, repo_id):
"""
Meta SeamlessM4T is made of 8 main components:
- speech_encoder (#1) and speech_encoder_frontend (#2)
- t2u_model (#3)
- text_encoder (#4) and text_encoder_frontend (#5)
- text_decoder (#6) [and text_decoder_frontend (#5) = equals to text_encoder_frontend]
- final_proj (#7)
- vocoder (#8)
"""
device = _grab_best_device()
if model_type == "medium":
name = "seamlessM4T_medium"
else:
name = "seamlessM4T_large"
original_model = Translator(name, "vocoder_36langs", device, torch.float32)
langs = MEDIUM_SUPPORTED_LANGUAGES if model_type == "medium" else LARGE_SUPPORTED_LANGUAGES
langs = [f"__{lang}__" for lang in langs]
vocab_file = os.path.join(os.path.expanduser("~"), "tokenizer", model_type, "tokenizer.model")
save_dir = os.path.join(save_dir, name)
Path(save_dir).mkdir(exist_ok=True)
tokenizer = SeamlessM4TTokenizer(vocab_file, additional_special_tokens=langs)
sanity_check_lang_id = tokenizer.convert_tokens_to_ids("__fra__")
tokenizer.save_pretrained(save_dir)
tokenizer = SeamlessM4TTokenizer.from_pretrained(save_dir)
if sanity_check_lang_id != tokenizer.convert_tokens_to_ids("__fra__"):
raise ValueError(
f"Error in tokenizer saving/loading - __fra__ lang id is not coherent: {sanity_check_lang_id} vs {tokenizer.convert_tokens_to_ids('__fra__')}"
)
text_decoder_lang_code_to_id = {lang.replace("__", ""): tokenizer.convert_tokens_to_ids(lang) for lang in langs}
t2u_lang_code_to_id = {
code.replace("__", ""): i + 10005 + len(UNIT_SUPPORTED_LANGUAGES)
for i, code in enumerate(UNIT_SUPPORTED_LANGUAGES)
}
vocoder_lang_code_to_id = {code.replace("__", ""): i for i, code in enumerate(VOCODER_SUPPORTED_LANGUAGES)}
fe = SeamlessM4TFeatureExtractor(language_code=langs)
fe.save_pretrained(save_dir)
fe = SeamlessM4TFeatureExtractor.from_pretrained(save_dir)
processor = SeamlessM4TProcessor(feature_extractor=fe, tokenizer=tokenizer)
processor.save_pretrained(save_dir)
processor.push_to_hub(repo_id=repo_id, create_pr=True)
processor = SeamlessM4TProcessor.from_pretrained(save_dir)
hf_config = _load_hf_config(model_type)
hf_model = SeamlessM4TModel(hf_config)
hf_model.generation_config.__setattr__("text_decoder_lang_to_code_id", text_decoder_lang_code_to_id)
hf_model.generation_config.__setattr__("t2u_lang_code_to_id", t2u_lang_code_to_id)
hf_model.generation_config.__setattr__("vocoder_lang_code_to_id", vocoder_lang_code_to_id)
hf_model.vocoder.apply_weight_norm()
hf_model.vocoder = _convert_model(
original_model,
hf_model.vocoder,
vocoder_convert_list,
device,
unwanted_prefix="vocoder.code_generator.",
filter_state_dict="vocoder",
)
hf_model.vocoder.remove_weight_norm()
wav2vec = hf_model.speech_encoder
hf_model.speech_encoder = _convert_model(
original_model, wav2vec, wav2vec_convert_list, device, unwanted_prefix="model.", filter_state_dict="speech"
)
hf_model.t2u_model = _convert_model(
original_model,
hf_model.t2u_model,
t2u_convert_list,
device,
unwanted_prefix="model.",
filter_state_dict="t2u_model",
)
hf_model.text_encoder = _convert_model(
original_model,
hf_model.text_encoder,
text_convert_list,
device,
unwanted_prefix="model.",
filter_state_dict=["model.text_encoder"],
exclude_state_dict="t2u_model",
)
)
hf_model.text_decoder = _convert_model(
original_model,
hf_model.text_decoder,
text_convert_list,
device,
unwanted_prefix="model.",
filter_state_dict=["model.text_decoder"],
exclude_state_dict="t2u_model",
)
hf_model.lm_head = _convert_model(
original_model,
hf_model.lm_head,
[("final_proj.", "")],
device,
unwanted_prefix="model.",
filter_state_dict=["model.final_proj"],
exclude_state_dict="t2u_model",
)
print(find_tied_parameters(hf_model))
count_1 = param_count(hf_model)
count_2 = param_count(original_model)
print(f"HF MODEL:{count_1}, ORIGINAL_MODEL: {count_2}, diff:{count_1 - count_2}")
print(f"HF MODEL excluding embeddings:{hf_model.num_parameters(exclude_embeddings=True)}")
del original_model
hf_model.generation_config._from_model_config = False
hf_model.save_pretrained(save_dir)
hf_model.push_to_hub(repo_id=repo_id, create_pr=True)
hf_model = SeamlessM4TModel.from_pretrained(save_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_type",
default="medium",
type=str,
help="Model type.",
)
parser.add_argument(
"--save_dir",
default="/home/ubuntu/weights",
type=str,
help="Path to the output PyTorch model.",
)
parser.add_argument(
"--repo_id",
default="facebook/hf-seamless-m4t-medium",
type=str,
help="Repo ID.",
)
args = parser.parse_args()
load_model(args.save_dir, args.model_type, args.repo_id)
.\models\seamless_m4t\feature_extraction_seamless_m4t.py
"""
Feature extractor class for SeamlessM4T
"""
from typing import List, Optional, Union
import numpy as np
from ...utils import is_torch_available
if is_torch_available():
import torch
from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging
logger = logging.get_logger(__name__)
class SeamlessM4TFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a SeamlessM4T feature extractor.
This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
This class extracts mel-filter bank features from raw speech.
Args:
feature_size (`int`, *optional*, defaults to 80):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
num_mel_bins (`int`, *optional*, defaults to 80):
Number of Mel-frequency bins.
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding vectors.
stride (`int`, *optional*, defaults to 2):
Stride used to reshape audios from shape (batch_size,num_frames,num_mel_bins) to
(batch_size,num_frames//stride,num_mel_bins*stride).
"""
model_input_names = ["input_features", "attention_mask"]
def __init__(
self,
feature_size=80,
sampling_rate=16000,
num_mel_bins=80,
padding_value=0.0,
stride=2,
**kwargs,
):
):
self.num_mel_bins = num_mel_bins
self.return_attention_mask = True
self.stride = stride
mel_filters = mel_filter_bank(
num_frequency_bins=256,
num_mel_filters=self.num_mel_bins,
min_frequency=20,
max_frequency=sampling_rate // 2,
sampling_rate=sampling_rate,
norm=None,
mel_scale="kaldi",
triangularize_in_mel_space=True,
)
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
self.window = window_function(400, "povey", periodic=False)
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
@staticmethod
def zero_mean_unit_var_norm(
input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
) -> List[np.ndarray]:
"""
每个数组都被归一化为零均值和单位方差
"""
if attention_mask is not None:
attention_mask = np.array(attention_mask, np.int32)
normed_input_values = []
for vector, length in zip(input_values, attention_mask.sum(-1)):
normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
if length < normed_slice.shape[0]:
normed_slice[length:] = padding_value
normed_input_values.append(normed_slice)
else:
normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
return normed_input_values
def _extract_fbank_features(
self,
waveform: np.ndarray,
) -> np.ndarray:
"""
使用 TorchAudio 获取梅尔滤波器组特征。注意,TorchAudio 要求输入为16位有符号整数,因此在特征提取之前不应对波形进行归一化。
"""
if len(waveform.shape) == 2:
waveform = waveform[0]
waveform = np.squeeze(waveform) * (2**15)
features = spectrogram(
waveform,
self.window,
frame_length=400,
hop_length=160,
fft_length=512,
power=2.0,
center=False,
preemphasis=0.97,
mel_filters=self.mel_filters,
log_mel="log",
mel_floor=1.192092955078125e-07,
remove_dc_offset=True,
).T
return features
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = True,
pad_to_multiple_of: Optional[int] = 2,
max_length: Optional[int] = None,
truncation: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
do_normalize_per_mel_bins: Optional[bool] = True,
**kwargs,
.\models\seamless_m4t\modeling_seamless_m4t.py
import copy
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Wav2Vec2BaseModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_seamless_m4t import SeamlessM4TConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/hf-seamless-m4t-medium"
_CONFIG_FOR_DOC = "SeamlessM4TConfig"
SEAMLESS_M4T_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/hf-seamless-m4t-medium",
]
SPEECHT5_PRETRAINED_HIFIGAN_CONFIG_ARCHIVE_MAP = {
"microsoft/speecht5_hifigan": "https://huggingface.co/microsoft/speecht5_hifigan/resolve/main/config.json",
}
@dataclass
class SeamlessM4TGenerationOutput(ModelOutput):
"""
定义来自[`SeamlessM4TModel`], [`SeamlessM4TForTextToText`],
[`SeamlessM4TForTextToSpeech`], [`SeamlessM4TForSpeechToSpeech`] 和 [`SeamlessM4TForTextToSpeech`] 的生成输出类。
继承自ModelOutput类。
"""
Args:
waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
模型预测的最终音频波形。
waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
`waveform` 批次中每个元素的样本长度,可选参数。
sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
生成的翻译序列。这是文本到文本或语音到文本模型的输出。
第二维 (sequence_length) 要么等于 `max_length`,要么由于 `eos_token_id` 导致所有批次提前结束而较短。
unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
生成的单元序列。这是文本到单元模型的输出。
第二维 (unit_sequence_length) 要么等于 `t2u_max_length`,要么由于 `t2u_eos_token_id` 导致所有批次提前结束而较短。
SEAMLESS_M4T_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~SeamlessM4TConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SEAMLESS_M4T_INPUTS_DOCSTRING_FIRST_PART = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
"""
SEAMLESS_M4T_INPUTS_DOCSTRING_TEXT_PART = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
"""
SEAMLESS_M4T_INPUTS_DOCSTRING_SPEECH_PART = r"""
Args:
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
[`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
"""
M4T_MODEL_INPUTS_DOCSTRING = SEAMLESS_M4T_INPUTS_DOCSTRING_FIRST_PART + SEAMLESS_M4T_INPUTS_DOCSTRING_LAST_PART
M4T_TEXT_INPUTS_DOCSTRING = SEAMLESS_M4T_INPUTS_DOCSTRING_TEXT_PART + SEAMLESS_M4T_INPUTS_DOCSTRING_LAST_PART
M4T_SPEECH_INPUTS_DOCSTRING = SEAMLESS_M4T_INPUTS_DOCSTRING_SPEECH_PART + SEAMLESS_M4T_INPUTS_DOCSTRING_LAST_PART
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
将输入的token向右移动一位。
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
"""
计算形式为`(batch, seq_len)`的注意力掩码,其中每个批次中的每个元素的注意力在对应的`seq_lens`停止。
Args:
hidden_states (`torch.FloatTensor`,形状为`(batch, seq_len, *)`):
要掩码的序列,其中`*`是任意数量的特定于序列的维度,包括没有。
seq_lens (`torch.Tensor`,形状为`(batch)`):
每个元素表示`hidden_states`中相同索引处的序列的长度
Returns:
`torch.FloatTensor`:形状为`(batch, seq_len)`的浮点数注意力掩码
"""
batch_size, mask_seq_len = hidden_states.shape[:2]
indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)
bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len)
mask = hidden_states.new_ones((batch_size, mask_seq_len))
mask = mask.masked_fill(bool_mask, 0)
return mask
def format_speech_generation_kwargs(kwargs):
"""
为生成语音的SeamlessM4T模型格式化kwargs,将kwargs分配给文本生成或语音生成模型。
Args:
kwargs (`dict`)`:
关键字参数有两种类型:
- 没有前缀时,它们将作为每个子模型的`generate`方法的`**kwargs`输入,除了`decoder_input_ids`只会传递给文本组件。
- 带有*text_*或*speech_*前缀时,它们将作为文本模型和语音模型的`generate`方法的输入。优先级高于没有前缀的关键字。
这意味着您可以为一种生成策略指定一种生成,而另一种不指定。
"""
kwargs_text = {}
kwargs_speech = {}
for key, value in kwargs.items():
if key.startswith("text_"):
key = key[len("text_") :]
kwargs_text[key] = value
elif key.startswith("speech_"):
key = key[len("speech_") :]
kwargs_speech[key] = value
else:
if key not in kwargs_text:
kwargs_text[key] = value
if key not in kwargs_speech:
kwargs_speech[key] = value
return kwargs_text, kwargs_speech
class SeamlessM4TConformerPositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.num_conv_pos_embeddings,
padding=config.num_conv_pos_embeddings // 2,
groups=config.num_conv_pos_embedding_groups,
)
weight_norm = nn.utils.weight_norm
if hasattr(nn.utils.parametrizations, "weight_norm"):
weight_norm = nn.utils.parametrizations.weight_norm
if is_deepspeed_zero3_enabled():
import deepspeed
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
self.conv = weight_norm(self.conv, name="weight", dim=2)
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
self.conv = weight_norm(self.conv, name="weight", dim=2)
self.padding = SeamlessM4TConformerSamePadLayer(config.num_conv_pos_embeddings)
self.activation = ACT2FN[config.speech_encoder_hidden_act]
def forward(self, hidden_states):
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.conv(hidden_states)
hidden_states = self.padding(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
class SeamlessM4TConformerRotaryPositionalEmbedding(nn.Module):
"""Rotary positional embedding
Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://arxiv.org/pdf/2104.09864.pdf
"""
def __init__(self, config):
super().__init__()
dim = config.hidden_size // config.speech_encoder_attention_heads
base = config.rotary_embedding_base
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
self.register_buffer("inv_freq", inv_freq)
self.cached_sequence_length = None
self.cached_rotary_positional_embedding = None
def forward(self, hidden_states):
sequence_length = hidden_states.shape[1]
if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None:
return self.cached_rotary_positional_embedding
self.cached_sequence_length = sequence_length
time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
embeddings = torch.cat((freqs, freqs), dim=-1)
cos_embeddings = embeddings.cos()[:, None, None, :]
sin_embeddings = embeddings.sin()[:, None, None, :]
self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states)
return self.cached_rotary_positional_embedding
class SeamlessM4TConformerRelPositionalEmbedding(nn.Module):
"""相对位置编码模块。"""
def __init__(self, config):
super().__init__()
self.max_len = config.max_source_positions
self.d_model = config.hidden_size
self.pe = None
self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
def extend_pe(self, x):
if self.pe is not None:
if self.pe.size(1) >= x.size(1) * 2 - 1:
if self.pe.dtype != x.dtype or self.pe.device != x.device:
self.pe = self.pe.to(dtype=x.dtype, device=x.device)
return
pe_positive = torch.zeros(x.size(1), self.d_model)
pe_negative = torch.zeros(x.size(1), self.d_model)
position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.d_model, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.d_model)
)
pe_positive[:, 0::2] = torch.sin(position * div_term)
pe_positive[:, 1::2] = torch.cos(position * div_term)
pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
pe_negative = pe_negative[1:].unsqueeze(0)
pe = torch.cat([pe_positive, pe_negative], dim=1)
self.pe = pe.to(device=x.device, dtype=x.dtype)
def forward(self, hidden_states: torch.Tensor):
self.extend_pe(hidden_states)
start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
relative_position_embeddings = self.pe[:, start_idx:end_idx]
return relative_position_embeddings
class SeamlessM4TConformerSamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, :-self.num_pad_remove]
return hidden_states
class SeamlessM4TConformerFeatureProjection(nn.Module):
def __init__(self, config):
super().__init__()
self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps)
self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size)
self.dropout = nn.Dropout(config.speech_encoder_dropout)
def forward(self, hidden_states):
norm_hidden_states = self.layer_norm(hidden_states)
hidden_states = self.projection(norm_hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class SeamlessM4TConformerFeedForward(nn.Module):
def __init__(self, config, act_fn=None, dropout=None):
super().__init__()
dropout = dropout if dropout is not None else config.speech_encoder_dropout
act_fn = act_fn if act_fn is not None else config.speech_encoder_hidden_act
self.intermediate_dropout = nn.Dropout(dropout)
self.intermediate_dense = nn.Linear(config.hidden_size, config.speech_encoder_intermediate_size)
self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn
self.output_dense = nn.Linear(config.speech_encoder_intermediate_size, config.hidden_size)
self.output_dropout = nn.Dropout(dropout)
def forward(self, hidden_states):
hidden_states = self.intermediate_dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.intermediate_dropout(hidden_states)
hidden_states = self.output_dense(hidden_states)
hidden_states = self.output_dropout(hidden_states)
return hidden_states
class SeamlessM4TConformerConvolutionModule(nn.Module):
"""Convolution block used in the conformer block"""
def __init__(self, config):
super().__init__()
if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
self.layer_norm = nn.LayerNorm(config.hidden_size)
self.pointwise_conv1 = nn.Conv1d(
config.hidden_size,
2 * config.hidden_size,
kernel_size=1,
stride=1,
padding=0,
bias=False,
)
self.glu = nn.GLU(dim=1)
self.depthwise_conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
config.conv_depthwise_kernel_size,
stride=1,
padding="same",
groups=config.hidden_size,
bias=False,
)
self.batch_norm = nn.BatchNorm1d(config.hidden_size)
self.activation = ACT2FN[config.speech_encoder_hidden_act]
self.pointwise_conv2 = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=1,
stride=1,
padding=0,
bias=False,
)
self.dropout = nn.Dropout(config.speech_encoder_dropout)
def forward(self, hidden_states, attention_mask=None):
hidden_states = self.layer_norm(hidden_states)
if attention_mask is not None:
hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.pointwise_conv1(hidden_states)
hidden_states = self.glu(hidden_states)
hidden_states = self.depthwise_conv(hidden_states)
hidden_states = self.batch_norm(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.pointwise_conv2(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
"""Construct a SeamlessM4TConformerSelfAttention object.
Can be enhanced with rotary or relative position embeddings.
"""
def __init__(self, config, use_position_embeddings=True):
super().__init__()
self.head_size = config.hidden_size // config.speech_encoder_attention_heads
self.num_heads = config.speech_encoder_attention_heads
self.position_embeddings_type = config.position_embeddings_type if use_position_embeddings else None
self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(p=config.speech_encoder_dropout)
if self.position_embeddings_type == "relative":
self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
relative_position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
batch_size, sequence_length, hidden_size = hidden_states.size()
query_key_states = hidden_states
value_states = hidden_states
if self.position_embeddings_type == "rotary":
if relative_position_embeddings is None:
raise ValueError(
"`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
)
query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
query = query.transpose(1, 2)
key = key.transpose(1, 2)
value = value.transpose(1, 2)
if self.position_embeddings_type == "relative":
if relative_position_embeddings is None:
raise ValueError(
"`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'relative'"
)
scores = self._apply_relative_embeddings(
query=query, key=key, relative_position_embeddings=relative_position_embeddings
)
else:
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
if attention_mask is not None:
scores = scores + attention_mask
probs = torch.softmax(scores, dim=-1)
probs = self.dropout(probs)
hidden_states = torch.matmul(probs, value)
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
hidden_states = self.linear_out(hidden_states)
return hidden_states, probs
def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
batch_size, sequence_length, hidden_size = hidden_states.size()
hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
cos = relative_position_embeddings[0, :sequence_length, ...]
sin = relative_position_embeddings[1, :sequence_length, ...]
hidden_states = hidden_states.transpose(0, 1)
rotated_states_begin = hidden_states[..., : self.head_size // 2]
rotated_states_end = hidden_states[..., self.head_size // 2 :]
rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
hidden_states = (hidden_states * cos) + (rotated_states * sin)
hidden_states = hidden_states.transpose(0, 1)
hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
return hidden_states
`
def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
proj_relative_position_embeddings = proj_relative_position_embeddings.view(
relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
)
proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
query = query.transpose(1, 2)
q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]
scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
return scores
class SeamlessM4TConformerEncoderLayer(nn.Module):
"""Conformer block based on https://arxiv.org/abs/2005.08100."""
def __init__(self, config):
super().__init__()
embed_dim = config.hidden_size
dropout = config.speech_encoder_dropout
self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
self.ffn1 = SeamlessM4TConformerFeedForward(config)
self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
self.self_attn_dropout = nn.Dropout(dropout)
self.self_attn = SeamlessM4TConformerSelfAttention(config)
self.conv_module = SeamlessM4TConformerConvolutionModule(config)
self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
self.ffn2 = SeamlessM4TConformerFeedForward(config)
self.final_layer_norm = nn.LayerNorm(embed_dim)
def forward(
self,
hidden_states,
attention_mask: Optional[torch.Tensor] = None,
relative_position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
conv_attention_mask: Optional[torch.Tensor] = None,
):
hidden_states = hidden_states
residual = hidden_states
hidden_states = self.ffn1_layer_norm(hidden_states)
hidden_states = self.ffn1(hidden_states)
hidden_states = hidden_states * 0.5 + residual
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
relative_position_embeddings=relative_position_embeddings,
output_attentions=output_attentions,
)
hidden_states = self.self_attn_dropout(hidden_states)
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = self.conv_module(hidden_states, attention_mask=conv_attention_mask)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.ffn2_layer_norm(hidden_states)
hidden_states = self.ffn2(hidden_states)
hidden_states = hidden_states * 0.5 + residual
hidden_states = self.final_layer_norm(hidden_states)
return hidden_states, attn_weights
def __init__(self, config):
super().__init__()
self.config = config
if config.position_embeddings_type == "relative":
self.embed_positions = SeamlessM4TConformerRelPositionalEmbedding(config)
elif config.position_embeddings_type == "rotary":
self.embed_positions = SeamlessM4TConformerRotaryPositionalEmbedding(config)
else:
self.embed_positions = None
self.dropout = nn.Dropout(config.speech_encoder_dropout)
self.layers = nn.ModuleList(
[SeamlessM4TConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)]
)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
class SeamlessM4TConformerAdapterLayer(nn.Module):
def __init__(self, config):
super().__init__()
embed_dim = config.hidden_size
dropout = config.adaptor_dropout
self.kernel_size = config.adaptor_kernel_size
self.stride = config.adaptor_stride
self.residual_layer_norm = nn.LayerNorm(embed_dim)
self.residual_conv = nn.Conv1d(
embed_dim,
2 * embed_dim,
self.kernel_size,
stride=self.stride,
padding=self.stride // 2,
)
self.activation = nn.GLU(dim=1)
self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
self.self_attn_conv = nn.Conv1d(
embed_dim,
2 * embed_dim,
self.kernel_size,
stride=self.stride,
padding=self.stride // 2,
)
self.self_attn = SeamlessM4TConformerSelfAttention(config, use_position_embeddings=False)
self.self_attn_dropout = nn.Dropout(dropout)
self.ffn_layer_norm = nn.LayerNorm(embed_dim)
self.ffn = SeamlessM4TConformerFeedForward(config, act_fn="relu", dropout=dropout)
def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
pad = self.kernel_size // 2
seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1
return seq_lens.floor()
def forward(
self,
hidden_states,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
residual = self.residual_layer_norm(hidden_states)
residual = residual.transpose(1, 2)
residual = self.residual_conv(residual)
residual = self.activation(residual)
residual = residual.transpose(1, 2)
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
hidden_states = self.self_attn_conv(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = hidden_states.transpose(1, 2)
if attention_mask is not None:
sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
hidden_states.device
)
attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths)
attention_mask = _prepare_4d_attention_mask(
attention_mask,
hidden_states.dtype,
)
hidden_states, attn_weights = self.self_attn(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = self.self_attn_dropout(hidden_states)
hidden_states = hidden_states + residual
residual = hidden_states
hidden_states = self.ffn_layer_norm(hidden_states)
hidden_states = self.ffn(hidden_states) + residual
return hidden_states
class SeamlessM4TConformerAdapter(nn.Module):
def __init__(self, config):
super().__init__()
self.layers = nn.ModuleList(SeamlessM4TConformerAdapterLayer(config) for _ in range(config.num_adapter_layers))
def forward(self, hidden_states, attention_mask):
for layer in self.layers:
hidden_states = layer(hidden_states, attention_mask)
return hidden_states
class SeamlessM4TSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.register_buffer("weights", emb_weights, persistent=False)
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings.
构建 sinusoidal embeddings,与 tensor2tensor 中的实现相匹配,但与 "Attention Is All You Need" 第 3.5 节中的描述略有不同。
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(
self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
):
if input_ids is not None:
bsz, seq_len = input_ids.size()
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
else:
bsz, seq_len = inputs_embeds.size()[:-1]
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
class SeamlessM4TAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[SeamlessM4TConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
class SeamlessM4TFeedForwardNetwork(nn.Module):
def __init__(self, config: SeamlessM4TConfig, ffn_dim: int):
super().__init__()
self.fc1 = nn.Linear(config.hidden_size, ffn_dim)
self.fc2 = nn.Linear(ffn_dim, config.hidden_size)
self.dropout = nn.Dropout(config.activation_dropout)
self.act = ACT2FN[config.activation_function]
def forward(self, hidden_states):
hidden_states = self.fc1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.fc2.weight, torch.Tensor)
and hidden_states.dtype != self.fc2.weight.dtype
and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8)
):
hidden_states = hidden_states.to(self.fc2.weight.dtype)
hidden_states = self.fc2(hidden_states)
return hidden_states
def __init__(self, config: SeamlessM4TConfig, encoder_ffn_dim=None, encoder_attention_heads=None):
super().__init__()
encoder_ffn_dim = config.encoder_ffn_dim if encoder_ffn_dim is None else encoder_ffn_dim
encoder_attention_heads = (
config.encoder_attention_heads if encoder_attention_heads is None else encoder_attention_heads
)
self.embed_dim = config.hidden_size
self.self_attn = SeamlessM4TAttention(
embed_dim=self.embed_dim,
num_heads=encoder_attention_heads,
dropout=config.attention_dropout,
)
self.attn_dropout = nn.Dropout(config.dropout)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.ffn = SeamlessM4TFeedForwardNetwork(config, ffn_dim=encoder_ffn_dim)
self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
self.ffn_dropout = nn.Dropout(config.activation_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
output_attentions: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`):
input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`):
attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
large negative values.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = self.attn_dropout(hidden_states)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.ffn_layer_norm(hidden_states)
hidden_states = self.ffn(hidden_states)
hidden_states = self.ffn_dropout(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class SeamlessM4TDecoderLayer(nn.Module):
def __init__(self, config: SeamlessM4TConfig, decoder_ffn_dim=None, decoder_attention_heads=None):
super().__init__()
decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
decoder_attention_heads = (
config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads
)
self.embed_dim = config.hidden_size
self.self_attn = SeamlessM4TAttention(
embed_dim=self.embed_dim,
num_heads=decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.attn_dropout = nn.Dropout(config.dropout)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.cross_attention = SeamlessM4TAttention(
self.embed_dim, decoder_attention_heads, config.attention_dropout, is_decoder=True
)
self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim)
self.ffn = SeamlessM4TFeedForwardNetwork(config, ffn_dim=decoder_ffn_dim)
self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
self.ffn_dropout = nn.Dropout(config.activation_dropout)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class SeamlessM4TPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SeamlessM4TConfig
base_model_prefix = "seamless_m4t"
supports_gradient_checkpointing = True
_no_split_modules = ["SeamlessM4TEncoderLayer", "SeamlessM4TDecoderLayer", "SeamlessM4TConformerEncoderLayer"]
def _init_weights(self, module):
"""初始化模型权重"""
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, SeamlessM4TConformerSelfAttention):
if hasattr(module, "pos_bias_u"):
nn.init.xavier_uniform_(module.pos_bias_u)
if hasattr(module, "pos_bias_v"):
nn.init.xavier_uniform_(module.pos_bias_v)
elif isinstance(module, SeamlessM4TConformerPositionalConvEmbedding):
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, SeamlessM4TConformerFeatureProjection):
k = math.sqrt(1 / module.projection.in_features)
nn.init.uniform_(module.projection.weight, a=-k, b=k)
nn.init.uniform_(module.projection.bias, a=-k, b=k)
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-k, b=k)
) -> torch.Tensor:
"""
Computes the last hidden states.
Parameters:
hidden_states (`Tuple[Tuple[torch.Tensor]]`):
The generated hidden states. Tuple (one element for each generated token) of tuples (one element for
each layer of the decoder) of torch.FloatTensor of shape (batch_size*num_beams*num_return_sequences,
generated_length, hidden_size).
beam_indices (`torch.LongTensor`, *optional*):
Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
`(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
generate-time.
Return:
`torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`
containing
the last hidden states.
"""
last_hidden_states = torch.concat([hidden_states[-1] for hidden_states in hidden_states], dim=1)
if beam_indices is None:
return last_hidden_states
beam_indices_mask = beam_indices < 0
max_beam_length = (1 - beam_indices_mask.long()).sum(-1).max()
beam_indices = beam_indices.clone()[:, :max_beam_length]
beam_indices_mask = beam_indices_mask[:, :max_beam_length]
beam_indices[beam_indices_mask] = 0
beam_indices = beam_indices.unsqueeze(-1)
beam_indices = beam_indices.expand(-1, -1, last_hidden_states.shape[-1])
last_hidden_states = torch.gather(last_hidden_states, 0, beam_indices)
return last_hidden_states
class SeamlessM4TSpeechEncoder(SeamlessM4TPreTrainedModel):
main_input_name = "input_features"
def __init__(self, config: SeamlessM4TConfig):
super().__init__(config)
self.feature_projection = SeamlessM4TConformerFeatureProjection(config)
self.encoder = SeamlessM4TConformerEncoder(config)
self.intermediate_ffn = SeamlessM4TConformerFeedForward(config, act_fn="relu", dropout=0.0)
self.adapter = SeamlessM4TConformerAdapter(config) if config.add_adapter else None
self.inner_layer_norm = nn.LayerNorm(config.hidden_size)
self.post_init()
def forward(
self,
input_features: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
if input_features is None:
raise ValueError(
"""Both `input_features` and `inputs_embeds` are `None` in `SeamlessM4TSpeechEncoder.forward`.
Make sure one of them is not `None`."""
)
hidden_states = self.feature_projection(input_features)
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
expanded_hidden_states = self.intermediate_ffn(hidden_states)
hidden_states = hidden_states + 0.5 * expanded_hidden_states
if self.adapter is not None:
hidden_states = self.adapter(hidden_states, attention_mask=attention_mask)
hidden_states = self.inner_layer_norm(hidden_states)
if not return_dict:
return (hidden_states,) + encoder_outputs[1:]
return Wav2Vec2BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`SeamlessM4TEncoderLayer`].
"""
"""
SEAMLESS_M4T_START_DOCSTRING,
"""
"""
embed_tokens (`nn.Embedding`, *optional*):
Input embedding
is_t2u_encoder (`bool`, *optional*, defaults to `False`):
indicates if it belongs to the text-to-units model, in which case it won't have input embeddings
"""
@add_start_docstrings(
"Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SeamlessM4TEncoderLayer`].",
SEAMLESS_M4T_START_DOCSTRING,
"""
embed_tokens (`nn.Embedding`, *optional*):
Input embedding
""",
)
class SeamlessM4TEncoder(SeamlessM4TPreTrainedModel):
def __init__(
self,
config: SeamlessM4TConfig,
embed_tokens: Optional[nn.Embedding] = None,
is_t2u_encoder: bool = False,
):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id
embed_dim = config.hidden_size
self.is_t2u_encoder = is_t2u_encoder
self.max_source_positions = config.max_position_embeddings
if not self.is_t2u_encoder:
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = SeamlessM4TSinusoidalPositionalEmbedding(
self.max_source_positions,
embed_dim,
self.padding_idx,
)
layers = []
for _ in range(config.encoder_layers):
layers.append(
SeamlessM4TEncoderLayer(
config,
encoder_attention_heads=config.encoder_attention_heads,
encoder_ffn_dim=config.encoder_ffn_dim,
)
)
self.layers = nn.ModuleList(layers)
self.layer_norm = nn.LayerNorm(config.hidden_size)
self.gradient_checkpointing = False
self.post_init()
):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = nn.Embedding(embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx)
self.embed_tokens.weight = embed_tokens.weight
else:
self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, self.padding_idx)
self.embed_positions = SeamlessM4TSinusoidalPositionalEmbedding(
self.max_target_positions,
config.hidden_size,
padding_idx=self.padding_idx,
)
layers = []
for _ in range(config.decoder_layers):
layers.append(
SeamlessM4TDecoderLayer(
config,
decoder_attention_heads=config.decoder_attention_heads,
decoder_ffn_dim=config.decoder_ffn_dim,
)
)
self.layers = nn.ModuleList(layers)
self.layer_norm = nn.LayerNorm(config.hidden_size)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"Transformer bare text-to-unit encoder-decoder. The encoder is a [`SeamlessM4TEncoder`] without embeddings and the decoder is a [`SeamlessM4TDecoder`].",
SEAMLESS_M4T_START_DOCSTRING,
"""
embed_tokens_decoder (`nn.Embedding`, *optional*): input embedding of the decoder.
""",
)
class SeamlessM4TTextToUnitModel(SeamlessM4TPreTrainedModel):
def __init__(
self,
config: SeamlessM4TConfig,
embed_tokens_decoder: Optional[nn.Embedding] = None,
):
super().__init__(config)
self.encoder = SeamlessM4TEncoder(config, is_t2u_encoder=True)
self.decoder = SeamlessM4TDecoder(config, embed_tokens_decoder)
self.post_init()
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if encoder_outputs is None:
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
)
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=attention_mask,
past_key_values=past_key_values,
inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return decoder_outputs + encoder_outputs
return Seq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"Transformer text-to-unit encoder-decoder with a language model head. The base encoder-decoder model is a [`SeamlessM4TTextToUnit`].",
SEAMLESS_M4T_START_DOCSTRING,
"""
embed_tokens_decoder (`nn.Embedding`, *optional*): input embedding of the decoder.
""",
)
class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = [
"vocoder",
"speech_encoder",
"text_encoder",
"text_decoder",
]
_tied_weights_keys = ["decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(
self,
config: SeamlessM4TConfig,
embed_tokens_decoder: Optional[nn.Embedding] = None,
):
config = copy.deepcopy(config)
for param, val in config.to_dict().items():
if param.startswith("t2u_"):
config.__setattr__(param[4:], val)
super().__init__(config)
self.model = SeamlessM4TTextToUnitModel(config, embed_tokens_decoder)
self.lm_head = nn.Linear(config.hidden_size, config.t2u_vocab_size, bias=False)
self.post_init()
def get_encoder(self):
return self.model.encoder
def get_decoder(self):
return self.model.decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
@add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.t2u_pad_token_id, self.config.t2u_decoder_start_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
def _tie_weights(self) -> None:
if getattr(self.config, "tie_word_embeddings", True):
output_embeddings = self.get_output_embeddings()
if output_embeddings is not None:
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
HIFIGAN_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SeamlessM4TConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class HifiGanResidualBlock(nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
super().__init__()
self.leaky_relu_slope = leaky_relu_slope
self.convs1 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=dilation[i],
padding=self.get_padding(kernel_size, dilation[i]),
)
for i in range(len(dilation))
]
)
self.convs2 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=self.get_padding(kernel_size, 1),
)
for _ in range(len(dilation))
]
)
def get_padding(self, kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
def apply_weight_norm(self):
for layer in self.convs1:
nn.utils.weight_norm(layer)
for layer in self.convs2:
nn.utils.weight_norm(layer)
def remove_weight_norm(self):
for layer in self.convs1:
nn.utils.remove_weight_norm(layer)
for layer in self.convs2:
nn.utils.remove_weight_norm(layer)
def forward(self, hidden_states):
for conv1, conv2 in zip(self.convs1, self.convs2):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv1(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv2(hidden_states)
hidden_states = hidden_states + residual
return hidden_states
class SeamlessM4TVariancePredictor(nn.Module):
def __init__(self, config):
super().__init__()
embed_dim = config.unit_embed_dim
kernel_size = config.variance_predictor_kernel_size
var_pred_dropout = config.var_pred_dropout
self.conv1 = nn.Conv1d(
embed_dim,
embed_dim,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
)
self.activation_fuction = nn.ReLU()
self.ln1 = nn.LayerNorm(embed_dim)
self.dropout_module = nn.Dropout(p=var_pred_dropout)
self.conv2 = nn.Conv1d(
embed_dim,
embed_dim,
kernel_size=kernel_size,
padding=1,
)
self.ln2 = nn.LayerNorm(embed_dim)
self.proj = nn.Linear(embed_dim, 1)
def forward(self, hidden_states: Tensor) -> Tensor:
hidden_states = self.conv1(hidden_states.transpose(1, 2))
hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
hidden_states = self.dropout_module(self.ln1(hidden_states))
hidden_states = self.conv2(hidden_states.transpose(1, 2))
hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
hidden_states = self.dropout_module(self.ln2(hidden_states))
return self.proj(hidden_states).squeeze(dim=2)
class SeamlessM4THifiGan(nn.Module):
def __init__(self, config: SeamlessM4TConfig):
super().__init__()
model_in_dim = config.unit_embed_dim + config.lang_embed_dim + config.spkr_embed_dim
self.leaky_relu_slope = config.leaky_relu_slope
self.num_kernels = len(config.resblock_kernel_sizes)
self.num_upsamples = len(config.upsample_rates)
self.conv_pre = nn.Conv1d(
model_in_dim,
config.upsample_initial_channel,
kernel_size=7,
stride=1,
padding=3,
)
self.upsampler = nn.ModuleList()
for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
self.upsampler.append(
nn.ConvTranspose1d(
config.upsample_initial_channel // (2**i),
config.upsample_initial_channel // (2 ** (i + 1)),
kernel_size=kernel_size,
stride=upsample_rate,
padding=(kernel_size - upsample_rate) // 2,
)
)
self.resblocks = nn.ModuleList()
for i in range(len(self.upsampler)):
channels = config.upsample_initial_channel // (2 ** (i + 1))
for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor:
r"""
Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
waveform.
Args:
spectrogram (`torch.FloatTensor`):
Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
model_in_dim)`, or un-batched and of shape `(sequence_length, model_in_dim)`. Note that `model_in_dim`
is the sum of `config.unit_embed_dim`, `config.lang_embed_dim` and `config.spkr_embed_dim`.
Returns:
`torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
"""
hidden_states = self.conv_pre(input_embeds)
for i in range(self.num_upsamples):
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = self.upsampler[i](hidden_states)
res_state = self.resblocks[i * self.num_kernels](hidden_states)
for j in range(1, self.num_kernels):
res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
hidden_states = res_state / self.num_kernels
hidden_states = nn.functional.leaky_relu(hidden_states)
hidden_states = self.conv_post(hidden_states)
hidden_states = torch.tanh(hidden_states)
waveform = hidden_states.squeeze(1)
return waveform
@add_start_docstrings(
"""Code HiFi-GAN vocoder as described in this [repository](https://github.com/facebookresearch/speech-resynthesis).""",
HIFIGAN_START_DOCSTRING,
)
class SeamlessM4TCodeHifiGan(PreTrainedModel):
config_class = SeamlessM4TConfig
main_input_name = "input_embeds"
_no_split_modules = []
def __init__(self, config):
super().__init__(config)
self.pad_token_id = config.t2u_pad_token_id
self.dur_predictor = SeamlessM4TVariancePredictor(config)
self.unit_embedding = nn.Embedding(config.unit_hifi_gan_vocab_size, config.unit_embed_dim)
self.speaker_embedding = nn.Embedding(config.vocoder_num_spkrs, config.spkr_embed_dim)
self.language_embedding = nn.Embedding(config.vocoder_num_langs, config.lang_embed_dim)
self.hifi_gan = SeamlessM4THifiGan(config)
self.post_init()
def _get_dur_output_lengths(self, input_ids, dur_out):
"""
Computes the output length after the duration layer.
"""
unit_lengths = (input_ids != self.pad_token_id).sum(1)
unit_lengths = torch.clamp(unit_lengths, 0, dur_out.shape[1] - 1)
cumulative_dur_out = torch.cumsum(dur_out, dim=1)
unit_lengths = cumulative_dur_out.gather(dim=1, index=unit_lengths.unsqueeze(1)).squeeze()
return unit_lengths
def _get_output_hifigan_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the hifigan convolutional layers
"""
def _conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
return (
torch.div(input_length + 2 * pad - dilation * (kernel_size - 1) - 1, stride, rounding_mode="floor") + 1
)
def _transpose_conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
return (input_length - 1) * stride - 2 * pad + dilation * (kernel_size - 1) + 1
input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
for i, (upsample_rate, kernel_size) in enumerate(
zip(self.config.upsample_rates, self.config.upsample_kernel_sizes)
):
input_lengths = _transpose_conv_out_length(
input_lengths, kernel_size, upsample_rate, (kernel_size - upsample_rate) // 2
)
for i in range(len(self.config.upsample_rates)):
for kernel_size, dilation in zip(self.config.resblock_kernel_sizes, self.config.resblock_dilation_sizes):
for dil in dilation:
input_lengths = _conv_out_length(
input_lengths, kernel_size, 1, (kernel_size - 1) * dil // 2, dilation=dil
)
for dil in dilation:
input_lengths = _conv_out_length(input_lengths, kernel_size, 1, (kernel_size - 1) // 2, dilation=1)
input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
return input_lengths
def forward(
self, input_ids: torch.LongTensor, spkr_id: torch.Tensor, lang_id: torch.Tensor
) -> Tuple[torch.Tensor]:
"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`SeamlessM4TTextToUnitForConditionalGeneration`]. [What are input
IDs?](../glossary#input-ids)
spkr_id (`int`, *optional*):
The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
tgt_lang (`str`, *optional*):
The language id to use as target language for translation.
"""
hidden_states = self.unit_embedding(input_ids).transpose(1, 2)
spkr = self.speaker_embedding(spkr_id).transpose(1, 2)
lang = self.language_embedding(lang_id).transpose(1, 2)
log_dur_pred = self.dur_predictor(hidden_states.transpose(1, 2))
dur_out = torch.clamp(torch.round((torch.exp(log_dur_pred) - 1)).long(), min=1)
if hidden_states.size(0) == 1:
hidden_states = torch.repeat_interleave(hidden_states, dur_out.view(-1), dim=2)
else:
if hidden_states.shape[0] > 1 and self.training:
logger.warning(
"""`self.training=True` and you use batching. You lose parallelism during the hifigan
forward pass because the samples are interleaved."""
)
hidden_states = [
torch.repeat_interleave(hidden_state, duration, dim=-1).transpose(0, 1)
for (hidden_state, duration) in zip(hidden_states, dur_out)
]
hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True).transpose(1, 2)
spkr = spkr.repeat(1, 1, hidden_states.shape[-1])
lang = lang.repeat(1, 1, hidden_states.shape[-1])
hidden_states = torch.cat([lang, hidden_states, spkr], dim=1)
hidden_states = self.hifi_gan(hidden_states)
unit_lengths = self._get_dur_output_lengths(input_ids, dur_out)
lengths = self._get_output_hifigan_lengths(unit_lengths)
return hidden_states, lengths
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def apply_weight_norm(self):
nn.utils.weight_norm(self.hifi_gan.conv_pre)
for layer in self.hifi_gan.upsampler:
nn.utils.weight_norm(layer)
for layer in self.hifi_gan.resblocks:
layer.apply_weight_norm()
nn.utils.weight_norm(self.hifi_gan.conv_post)
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.hifi_gan.conv_pre)
for layer in self.hifi_gan.upsampler:
nn.utils.remove_weight_norm(layer)
for layer in self.hifi_gan.resblocks:
layer.remove_weight_norm()
nn.utils.remove_weight_norm(self.hifi_gan.conv_post)
@add_start_docstrings(
"The text-to-text SeamlessM4T Model transformer which can be used for T2TT.",
SEAMLESS_M4T_START_DOCSTRING,
)
class SeamlessM4TForTextToText(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = ["speech_encoder", "t2u_model", "vocoder"]
main_input_name = "input_ids"
_tied_weights_keys = [
"lm_head.weight",
"text_encoder.embed_tokens.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config: SeamlessM4TConfig):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.text_encoder = SeamlessM4TEncoder(config, self.shared)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_encoder(self):
return self.text_encoder
def get_decoder(self):
return self.text_decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_encoder.embed_tokens = value
self.text_decoder.embed_tokens = value
self.shared = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
pass
def generate(
self,
input_ids=None,
tgt_lang=None,
generation_config=None,
logits_processor=None,
stopping_criteria=None,
prefix_allowed_tokens_fn=None,
synced_gpus=False,
**kwargs,
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
@add_start_docstrings(
"The speech-to-text SeamlessM4T Model transformer which can be used for S2TT.",
SEAMLESS_M4T_START_DOCSTRING,
)
class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = ["text_decoder", "t2u_model", "vocoder"]
main_input_name = "input_features"
_tied_weights_keys = [
"lm_head.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config: SeamlessM4TConfig):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.speech_encoder = SeamlessM4TSpeechEncoder(config)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_encoder(self):
return self.speech_encoder
def get_decoder(self):
return self.text_decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_decoder.embed_tokens = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_SPEECH_INPUTS_DOCSTRING)
def forward(
self,
input_features: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
"""
Forward pass of the SeamlessM4T model for speech-to-text tasks.
"""
def generate(
self,
input_features=None,
tgt_lang=None,
generation_config=None,
logits_processor=None,
stopping_criteria=None,
prefix_allowed_tokens_fn=None,
synced_gpus=False,
**kwargs,
):
"""
Generate sequences based on input features and configuration.
"""
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
"""
Prepare inputs for sequence generation based on given parameters.
"""
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
@add_start_docstrings(
"The text-to-speech SeamlessM4T Model transformer which can be used for T2ST.",
SEAMLESS_M4T_START_DOCSTRING,
)
class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = ["speech_encoder"]
main_input_name = "input_ids"
_tied_weights_keys = [
"lm_head.weight",
"text_encoder.embed_tokens.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config: SeamlessM4TConfig):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.text_encoder = SeamlessM4TEncoder(config, self.shared)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config)
self.vocoder = SeamlessM4TCodeHifiGan(config)
def get_encoder(self):
return self.text_encoder
def get_decoder(self):
return self.text_decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_encoder.embed_tokens = value
self.text_decoder.embed_tokens = value
self.shared = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
@torch.no_grad()
def generate(
self,
input_ids: Optional[torch.Tensor] = None,
return_intermediate_token_ids: Optional[bool] = None,
tgt_lang: Optional[str] = None,
spkr_id: Optional[int] = 0,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
@add_start_docstrings(
"The speech-to-speech SeamlessM4T Model transformer which can be used for S2ST.",
SEAMLESS_M4T_START_DOCSTRING,
)
class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel):
_keys_to_ignore_on_load_missing = ["text_encoder"]
main_input_name = "input_features"
_tied_weights_keys = [
"lm_head.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.speech_encoder = SeamlessM4TSpeechEncoder(config)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config)
self.vocoder = SeamlessM4TCodeHifiGan(config)
def get_encoder(self):
return self.speech_encoder
def get_decoder(self):
return self.text_decoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_decoder.embed_tokens = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_SPEECH_INPUTS_DOCSTRING)
def forward(
self,
input_features: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
pass
@torch.no_grad()
def generate(
self,
input_features: Optional[torch.Tensor] = None,
return_intermediate_token_ids: Optional[bool] = None,
tgt_lang: Optional[str] = None,
spkr_id: Optional[int] = 0,
**kwargs,
):
pass
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@add_start_docstrings(
"The original SeamlessM4T Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST).",
SEAMLESS_M4T_START_DOCSTRING,
"""
current_modality (`str`, *optional*, defaults to `"text"`):
Default modality. Used to initialize the model.
""",
)
class SeamlessM4TModel(SeamlessM4TPreTrainedModel):
_tied_weights_keys = [
"lm_head.weight",
"text_encoder.embed_tokens.weight",
"text_decoder.embed_tokens.weight",
]
def __init__(self, config, current_modality="text"):
super().__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self.text_encoder = SeamlessM4TEncoder(config, self.shared)
self.speech_encoder = SeamlessM4TSpeechEncoder(config)
self.text_decoder = SeamlessM4TDecoder(config, self.shared)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
self.current_modality = current_modality
if current_modality == "speech":
self.main_input_name = "input_features"
self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config)
self.vocoder = SeamlessM4TCodeHifiGan(config)
def set_modality(self, modality="text"):
if modality == "text":
self.main_input_name = "input_ids"
self.current_modality = "text"
elif modality == "speech":
self.main_input_name = "input_features"
self.current_modality = "speech"
else:
raise ValueError(f"`modality={modality}` is not a valid modality. It must be `text` or `speech`.")
def get_encoder(self):
if self.current_modality == "text":
return self.text_encoder
else:
return self.speech_encoder
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_input_embeddings(self):
return self.text_decoder.embed_tokens
def set_input_embeddings(self, value):
self.text_encoder.embed_tokens = value
self.text_decoder.embed_tokens = value
self.shared = value
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.lm_head, self.shared)
@add_start_docstrings_to_model_forward(M4T_MODEL_INPUTS_DOCSTRING)
@torch.no_grad()
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
input_features: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
def generate(
self,
input_ids: Optional[torch.Tensor] = None,
input_features: Optional[torch.Tensor] = None,
return_intermediate_token_ids: Optional[bool] = None,
tgt_lang: Optional[str] = None,
spkr_id: Optional[int] = 0,
generate_speech: Optional[bool] = True,
**kwargs,
):
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
.\models\seamless_m4t\processing_seamless_m4t.py
"""
Audio/Text processor class for SeamlessM4T
"""
from ...processing_utils import ProcessorMixin
class SeamlessM4TProcessor(ProcessorMixin):
r"""
Constructs a SeamlessM4T processor which wraps a SeamlessM4T feature extractor and a SeamlessM4T tokenizer into a
single processor.
[`SeamlessM4TProcessor`] offers all the functionalities of [`SeamlessM4TFeatureExtractor`] and
[`SeamlessM4TTokenizerFast`]. See the [`~SeamlessM4TProcessor.__call__`] and [`~SeamlessM4TProcessor.decode`] for
more information.
Args:
feature_extractor ([`SeamlessM4TFeatureExtractor`]):
The audio processor is a required input.
tokenizer ([`SeamlessM4TTokenizerFast`]):
The tokenizer is a required input.
"""
feature_extractor_class = "SeamlessM4TFeatureExtractor"
tokenizer_class = ("SeamlessM4TTokenizer", "SeamlessM4TTokenizerFast")
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SeamlessM4TTokenizerFast's [`~PreTrainedTokenizer.batch_decode`].
Please refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SeamlessM4TTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
.\models\seamless_m4t\tokenization_seamless_m4t.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece as spm
from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import (
BatchEncoding,
PreTokenizedInput,
PreTrainedTokenizer,
TextInput,
)
from ...tokenization_utils_base import AddedToken
from ...utils import PaddingStrategy, logging
logger = logging.get_logger(__name__)
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/hf-seamless-m4t-medium": (
"https://huggingface.co/facebook/hf-seamless-m4t-medium/blob/main/sentencepiece.bpe.model"
),
}
}
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/hf-seamless-m4t-medium": 2048,
}
class SeamlessM4TTokenizer(PreTrainedTokenizer):
"""
Construct a SeamlessM4T tokenizer.
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).
The tokenization method is `<language code> <tokens> <eos>` for source language documents, and `<eos> <language
code> <tokens> <eos>` for target language documents.
Examples:
```
>>> from transformers import SeamlessM4TTokenizer
>>> tokenizer = SeamlessM4TTokenizer.from_pretrained(
... "facebook/hf-seamless-m4t-medium", src_lang="eng", tgt_lang="fra"
... )
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
```
"""
def __init__(
self,
vocab_file,
src_lang=None,
tgt_lang=None,
unk_token="<unk>",
sep_token="<sep>",
cls_token="<cls>",
pad_token="<pad>",
mask_token="<mask>",
**kwargs
):
"""
Args:
vocab_file (str): 文件路径,包含预训练词汇表
src_lang (Optional[str]): 源语言代码,默认为 None
tgt_lang (Optional[str]): 目标语言代码,默认为 None
unk_token (str): 未知标记,默认为 "<unk>"
sep_token (str): 分隔符标记,默认为 "<sep>"
cls_token (str): 类别标记,默认为 "<cls>"
pad_token (str): 填充标记,默认为 "<pad>"
mask_token (str): 掩码标记,默认为 "<mask>"
**kwargs: 其他关键字参数
"""
super().__init__(
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs
)
self.vocab_file = vocab_file
self.src_lang = src_lang
self.tgt_lang = tgt_lang
self.sp_model = spm.SentencePieceProcessor(model_file=self.vocab_file)
def _tokenize(self, text_input):
"""
执行实际的分词过程,根据源语言和目标语言代码进行不同的分词处理。
Args:
text_input (str): 输入文本字符串
Returns:
List[str]: 分词后的字符串列表
"""
if self.src_lang:
tokenized = [self.src_lang] + self.sp_model.encode(text_input, out_type=str) + [self.eos_token]
elif self.tgt_lang:
tokenized = [self.eos_token] + [self.tgt_lang] + self.sp_model.encode(text_input, out_type=str) + [self.eos_token]
else:
raise ValueError("Either src_lang or tgt_lang must be specified.")
return tokenized
def _convert_token_to_id(self, token):
"""
将单个标记转换为其对应的 ID。
Args:
token (str): 输入的单个标记字符串
Returns:
int: 标记对应的 ID
"""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""
将单个 ID 转换为其对应的标记。
Args:
index (int): 输入的单个 ID
Returns:
str: ID 对应的标记字符串
"""
return self.sp_model.id_to_piece(index)
def convert_tokens_to_string(self, tokens):
"""
将分词后的标记列表转换为原始字符串。
Args:
tokens (List[str]): 输入的分词后的标记列表
Returns:
str: 原始字符串
"""
if tokens[0] == self.eos_token:
tokens = tokens[1:]
return self.sp_model.decode(tokens)
@property
def vocab_size(self):
"""
返回词汇表大小。
Returns:
int: 词汇表大小
"""
return len(self.sp_model)
@property
def vocab(self):
"""
返回词汇表内容。
Returns:
List[str]: 词汇表中的所有标记列表
"""
return [self.sp_model.id_to_piece(i) for i in range(self.vocab_size)]
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
"""
从预训练模型加载分词器。
Args:
pretrained_model_name_or_path (str): 预训练模型的名称或路径
*inputs: 不定长参数列表
**kwargs: 关键字参数
Returns:
PreTrainedTokenizer: 加载后的预训练分词器对象
"""
tokenizer = super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
return tokenizer
Args:
vocab_file (`str`):
Path to the vocabulary file.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
tokenizer_file (`str`, *optional*):
The path to a tokenizer file to use instead of the vocab file.
src_lang (`str`, *optional*, defaults to `"eng"`):
The language to use as source language for translation.
tgt_lang (`str`, *optional*, defaults to `"fra"`):
The language to use as target language for translation.
sp_model_kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments to pass to the model initialization.
additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
A tuple or a list of additional special tokens. Can be used to specify the list of languages that will be
supported by the tokenizer.
add_prefix_space (`bool`, *optional*, defaults to `True`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word.
"""
# 从外部导入的全局变量,包含预训练模型所需的词汇表文件名
vocab_files_names = VOCAB_FILES_NAMES
# 从外部导入的全局变量,包含预训练模型支持的最大输入序列长度
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 从外部导入的全局变量,包含预训练模型所需的词汇表文件映射
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
tokenizer_file=None,
src_lang="eng",
tgt_lang="fra",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
additional_special_tokens=None,
add_prefix_space=True,
**kwargs,
):
# 初始化函数,用于初始化对象的各种属性和设置
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
# 保留了一个未使用的参数以保持某些重要的“Copied from”语句
self.legacy = False
self.vocab_file = vocab_file
# 获取 SentencePiece 处理器对象
self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
# 下面是为了保持 fairseq 与 SentencePiece 词汇对齐的逻辑说明
self._added_tokens_decoder = {
0: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
1: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
2: AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token,
3: AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token,
}
# fairseq 词汇与 SentencePiece 词汇的偏移量,用于对齐
self.fairseq_offset = 1
# 获取 SentencePiece 词汇表大小
self.sp_model_size = len(self.sp_model)
# 设置源语言和目标语言的特殊标记
self._src_lang = f"__{src_lang}__" if "__" not in src_lang else src_lang
self._tgt_lang = f"__{tgt_lang}__" if "__" not in tgt_lang else tgt_lang
self.add_prefix_space = add_prefix_space
# 调用父类的初始化方法,初始化基本的特殊标记和其他参数
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
tokenizer_file=tokenizer_file,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
add_prefix_space=add_prefix_space,
**kwargs,
)
# 设置源语言特殊标记
self.set_src_lang_special_tokens(self._src_lang)
# 设置目标语言特殊标记
self.set_tgt_lang_special_tokens(self._tgt_lang)
# 从 transformers.models.nllb.tokenization_nllb.NllbTokenizer.__getstate__ 复制而来的方法
# 返回对象的序列化状态,包括所有实例变量
def __getstate__(self):
# 复制对象的字典形式作为状态
state = self.__dict__.copy()
# 将 sp_model 设置为 None,不包含在序列化状态中
state["sp_model"] = None
# 使用 sp_model 的 serialized_model_proto 方法获取其序列化模型协议
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
# 返回状态字典
return state
# 从给定状态恢复对象的状态
# 从 transformers.models.nllb.tokenization_nllb.NllbTokenizer.__setstate__ 复制而来
def __setstate__(self, d):
# 直接使用给定的状态字典来恢复对象的实例变量
self.__dict__ = d
# 为了向后兼容
# 如果对象中没有 sp_model_kwargs 属性,则创建空字典
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
# 使用 sp_model_kwargs 中的参数创建 SentencePieceProcessor 对象赋给 sp_model
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
# 从 sp_model_proto 中加载序列化的模型协议
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
# 返回 sp_model 的词汇表大小
@property
def vocab_size(self):
return len(self.sp_model)
# 定义对象的调用方法,允许将对象像函数一样调用
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
padding: Union[bool, str, PaddingStrategy] = True,
pad_to_multiple_of: Optional[int] = 2,
src_lang: Optional[str] = None,
tgt_lang: Optional[str] = None,
**kwargs,
):
pass
# 返回 src_lang 属性值,从 transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang 复制而来
@property
def src_lang(self) -> str:
return self._src_lang
# 设置 src_lang 属性,从 transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang 复制而来
@src_lang.setter
def src_lang(self, new_src_lang: str) -> None:
# 如果 new_src_lang 中不包含 "__",则添加前后缀 "__"
if "__" not in new_src_lang:
self._src_lang = f"__{new_src_lang}__"
else:
self._src_lang = new_src_lang
# 调用 set_src_lang_special_tokens 方法,设置特殊标记的 src_lang
self.set_src_lang_special_tokens(self._src_lang)
# 返回 tgt_lang 属性值
@property
def tgt_lang(self) -> str:
return self._tgt_lang
# 设置 tgt_lang 属性
@tgt_lang.setter
def tgt_lang(self, new_tgt_lang: str) -> None:
# 如果 new_tgt_lang 中不包含 "__",则添加前后缀 "__"
if "__" not in new_tgt_lang:
self._tgt_lang = f"__{new_tgt_lang}__"
else:
self._tgt_lang = new_tgt_lang
# 调用 set_tgt_lang_special_tokens 方法,设置特殊标记的 tgt_lang
self.set_tgt_lang_special_tokens(self._tgt_lang)
# 返回特殊标记的掩码,从 transformers.models.nllb.tokenization_nllb.NllbTokenizer.get_special_tokens_mask 复制而来
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# Create lists filled with 1s for the prefix and suffix tokens
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
# If token_ids_1 is not provided, return the prefix tokens, token_ids_0 with 0s inserted, and suffix tokens
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
# If token_ids_1 is provided, concatenate prefix tokens, token_ids_0, token_ids_1, and suffix tokens
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
# Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An NLLB sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `X [eos, src_lang_code]`
- `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary
"""
if token_ids_1 is None:
# Return prefix tokens, token_ids_0, and suffix tokens if token_ids_1 is not provided
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
# Return prefix tokens, token_ids_0, token_ids_1, and suffix tokens if token_ids_1 is provided
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
# Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs tensor from given sequences, indicating which sequence a token belongs to (first or second).
This function is used in sequence pair tasks, such as question answering or sequence classification.
Args:
token_ids_0 (`List[int]`):
List of IDs representing the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional list of IDs representing the second sequence for sequence pairs.
Returns:
`List[int]`: List of token type IDs where each ID corresponds to its respective token in token_ids_0 and token_ids_1.
"""
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Define special tokens for separation and classification
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If only one sequence is provided
if token_ids_1 is None:
# Return a list of zeros based on the length of cls + token_ids_0 + sep
return len(cls + token_ids_0 + sep) * [0]
# If two sequences are provided
# Return a list of zeros based on the length of cls + token_ids_0 + sep + sep + token_ids_1 + sep
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
# Check if source and target languages are specified
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model.")
# Set the source language
self.src_lang = src_lang
# Generate inputs by calling the model with additional special tokens and other kwargs
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
# Adjust target language format if not already formatted
if "__" not in tgt_lang:
tgt_lang = f"__{tgt_lang}__"
# Convert the target language to its corresponding ID
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
# Add the target language ID to inputs
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
def get_vocab(self):
# Create a vocabulary dictionary mapping token IDs to tokens
vocab = {
self.convert_ids_to_tokens(i): i for i in range(self.fairseq_offset, self.vocab_size + self.fairseq_offset)
}
# Update vocabulary with any additional tokens
vocab.update(self.added_tokens_encoder)
return vocab
@property
def unk_token_length(self):
# Return the length of the encoded representation of the unknown token
return len(self.sp_model.encode(str(self.unk_token)))
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
def get_spm_processor(self, from_slow=False):
# Initialize SentencePiece tokenizer with specified parameters
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
# Load vocabulary and return tokenizer instance for legacy or slow mode
if self.legacy or from_slow: # no dependency on protobuf
tokenizer.Load(self.vocab_file)
return tokenizer
# Load serialized SentencePiece model for faster processing
with open(self.vocab_file, "rb") as f:
sp_model = f.read()
model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
model = model_pb2.ModelProto.FromString(sp_model)
normalizer_spec = model_pb2.NormalizerSpec()
normalizer_spec.add_dummy_prefix = False
model.normalizer_spec.MergeFrom(normalizer_spec)
sp_model = model.SerializeToString()
tokenizer.LoadFromSerializedProto(sp_model)
return tokenizer
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
"""
Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
first token is special.
"""
# 如果 self.legacy 为真或者输入文本为空,则调用父类的tokenize方法处理
if self.legacy or len(text) == 0:
return super().tokenize(text, **kwargs)
# 将特殊的 SPIECE_UNDERLINE 替换为空格
text = text.replace(SPIECE_UNDERLINE, " ")
# 如果设置了 add_prefix_space,则在文本前加上 SPIECE_UNDERLINE
if self.add_prefix_space:
text = SPIECE_UNDERLINE + text
# 使用父类的tokenize方法对处理后的文本进行分词
tokens = super().tokenize(text, **kwargs)
# 如果分词后的tokens长度大于1,并且第一个token是 SPIECE_UNDERLINE,第二个token是特殊标记中的一个,则去掉第一个token
if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
tokens = tokens[1:]
return tokens
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.
We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
`['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
`unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
"""
# 使用 sentencepiece 模型对文本进行编码,返回字符串形式的tokens列表
tokens = self.sp_model.encode(text, out_type=str)
# 如果 self.legacy 为真或者文本不以 SPIECE_UNDERLINE 或空格开头,则直接返回tokens
if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
return tokens
# 1. 对字符串进行编码并加上前缀,例如 "<unk> Hey"
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
# 2. 从编码结果中移除 unk_token
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# 使用 vocab 将 token 转换为对应的 id
spm_id = self.sp_model.PieceToId(token)
# 如果 spm_id 为0,则返回未知token的id
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
# 使用 vocab 将 index 转换为对应的 token
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
# 如果 tokens 的第一个元素以 SPIECE_UNDERLINE 开头,并且设置了 add_prefix_space,则去掉第一个字符
if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
tokens[0] = tokens[0][1:]
# 将 tokens 中的 SPIECE_UNDERLINE 替换为空格,然后连接成字符串并去除首尾空格
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
# Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.save_vocabulary
# 将词汇表保存到指定目录下的文件中
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# 检查保存目录是否存在,如果不存在则记录错误并返回
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# 构建输出词汇表文件的路径,根据可选的前缀和默认文件名构成
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# 如果当前词汇表文件路径与输出路径不同且当前词汇表文件存在,则复制当前词汇表文件到输出路径
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# 如果当前词汇表文件不存在,则将序列化后的模型内容写入输出路径
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
# 返回输出词汇表文件路径的元组
return (out_vocab_file,)
# 从 transformers.models.nllb.tokenization_nllb.NllbTokenizer.prepare_seq2seq_batch 复制,配置序列到序列任务的批处理
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "eng",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "fra",
**kwargs,
) -> BatchEncoding:
# 设置源语言和目标语言
self.src_lang = src_lang
self.tgt_lang = tgt_lang
# 调用父类方法,准备序列到序列的批处理数据
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
# 从 transformers.models.nllb.tokenization_nllb.NllbTokenizer._switch_to_input_mode 复制,切换到输入模式
def _switch_to_input_mode(self):
# 设置特殊令牌以适应源语言
return self.set_src_lang_special_tokens(self.src_lang)
# 从 transformers.models.nllb.tokenization_nllb.NllbTokenizer._switch_to_target_mode 复制,切换到目标模式
def _switch_to_target_mode(self):
# 设置特殊令牌以适应目标语言
return self.set_tgt_lang_special_tokens(self.tgt_lang)
# 设置特殊令牌以适应源语言
def set_src_lang_special_tokens(self, src_lang) -> None:
"""重设特殊令牌以适应源语言设定。
前缀=[src_lang_code],后缀=[eos]
"""
# 将当前语言代码转换为对应的标识符
self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
self.init_kwargs["src_lang"] = src_lang
# 如果当前语言代码等于未知标记的标识符,则记录警告
if self.cur_lang_code == self.unk_token_id:
logger.warning_once(
f"`src_lang={src_lang}` has not be found in the vocabulary. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
)
# 设置前缀特殊令牌为当前语言代码,后缀特殊令牌为结束符标识符
self.prefix_tokens = [self.cur_lang_code]
self.suffix_tokens = [self.eos_token_id]
# https://github.com/facebookresearch/fairseq2/blob/c53f18e6be6b8b46b722f2249b8397b7eccd7ad3/src/fairseq2/models/nllb/tokenizer.py#L112-L116
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target lang setting.
Prefix=[eos, tgt_lang_code] and suffix=[eos].
"""
# 将当前语言代码转换为对应的词汇表中的 ID
self.cur_lang_code = self.convert_tokens_to_ids(lang)
# 将目标语言设置更新到初始化参数中
self.init_kwargs["tgt_lang"] = lang
# 如果当前语言代码等于未知标记的 ID,则记录警告日志
if self.cur_lang_code == self.unk_token_id:
logger.warning_once(
f"`tgt_lang={lang}` has not be found in the vocabulary. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
)
# 设置前缀特殊标记为 [eos, 当前语言代码]
self.prefix_tokens = [self.eos_token_id, self.cur_lang_code]
# 设置后缀特殊标记为 [eos]
self.suffix_tokens = [self.eos_token_id]
.\models\seamless_m4t\tokenization_seamless_m4t_fast.py
import os
from shutil import copyfile
from typing import List, Optional, Tuple, Union
from tokenizers import processors
from ...tokenization_utils import (
BatchEncoding,
PreTokenizedInput,
TextInput,
)
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import PaddingStrategy, is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_seamless_m4t import SeamlessM4TTokenizer
else:
SeamlessM4TTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/hf-seamless-m4t-medium": "https://huggingface.co/facebook/hf-seamless-m4t-medium/resolve/main/vocab.txt",
},
"tokenizer_file": {
"facebook/hf-seamless-m4t-medium": "https://huggingface.co/facebook/hf-seamless-m4t-medium/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/hf-seamless-m4t-medium": 2048,
}
class SeamlessM4TTokenizerFast(PreTrainedTokenizerFast):
"""
构建一个“快速” SeamlessM4T tokenizer(基于 HuggingFace 的 tokenizers 库)。基于 BPE 模型。
该 tokenizer 继承自 PreTrainedTokenizerFast,包含大多数主要方法。用户应参考超类以获取更多关于这些方法的信息。
分词方法为:<language code> <tokens> <eos> 用于源语言文档,
和 <eos> <language code> <tokens> <eos> 用于目标语言文档。
示例:
```
>>> from transformers import SeamlessM4TTokenizerFast
>>> tokenizer = SeamlessM4TTokenizerFast.from_pretrained(
... "facebook/hf-seamless-m4t-medium", src_lang="eng", tgt_lang="fra"
... )
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
```
"""
pass
Args:
vocab_file (`str`, *optional*):
Path to the vocabulary file.
tokenizer_file (`str`, *optional*):
The path to a tokenizer file to use instead of the vocab file.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
src_lang (`str`, *optional*, defaults to `"eng"`):
The language to use as source language for translation.
tgt_lang (`str`, *optional*, defaults to `"fra"`):
The language to use as target language for translation.
additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
A tuple or a list of additional special tokens.
Define constants and variables related to tokenizer and vocabulary files.
These constants are typically provided by the library and tailored for specific models.
"""
vocab_files_names = VOCAB_FILES_NAMES
# Assigns predefined constant `VOCAB_FILES_NAMES` to `vocab_files_names`
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Assigns predefined constant `PRETRAINED_VOCAB_FILES_MAP` to `pretrained_vocab_files_map`
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Assigns predefined constant `PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES` to `max_model_input_sizes`
slow_tokenizer_class = SeamlessM4TTokenizer
# Assigns `SeamlessM4TTokenizer` class to `slow_tokenizer_class`, presumably for tokenization tasks
model_input_names = ["input_ids", "attention_mask"]
# Defines a list of strings representing model input names
prefix_tokens: List[int] = []
# Initializes an empty list `prefix_tokens` intended to hold integer values
suffix_tokens: List[int] = []
# Initializes an empty list `suffix_tokens` intended to hold integer values
# 初始化方法,设置各种参数和特殊标记,继承自父类Tokenizer
def __init__(
self,
vocab_file=None, # 词汇文件路径,默认为None
tokenizer_file=None, # 分词器文件路径,默认为None
bos_token="<s>", # 开始标记,默认为"<s>"
eos_token="</s>", # 结束标记,默认为"</s>"
sep_token="</s>", # 分隔标记,默认为"</s>"
cls_token="<s>", # 类别标记,默认为"<s>"
unk_token="<unk>", # 未知标记,默认为"<unk>"
pad_token="<pad>", # 填充标记,默认为"<pad>"
src_lang="eng", # 源语言,默认为"eng"
tgt_lang="fra", # 目标语言,默认为"fra"
additional_special_tokens=None, # 额外的特殊标记列表,默认为None
**kwargs, # 其他关键字参数
):
# 调用父类的初始化方法,传入所有参数
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
src_lang=src_lang,
tgt_lang=tgt_lang,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
# 设置实例的词汇文件路径
self.vocab_file = vocab_file
# 根据源语言设置私有属性,以双下划线包围
self._src_lang = f"__{src_lang}__" if "__" not in src_lang else src_lang
# 根据目标语言设置私有属性,以双下划线包围
self._tgt_lang = f"__{tgt_lang}__" if "__" not in tgt_lang else tgt_lang
# 调用方法设置源语言特殊标记
self.set_src_lang_special_tokens(self._src_lang)
# 调用方法设置目标语言特殊标记
self.set_tgt_lang_special_tokens(self._tgt_lang)
@property
# 返回是否可以保存慢速分词器的布尔值
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
@property
# 返回源语言的私有属性
# 摘录自transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang
def src_lang(self) -> str:
return self._src_lang
@src_lang.setter
# 设置新的源语言,更新私有属性和特殊标记
def src_lang(self, new_src_lang: str) -> None:
# 如果新的源语言不含双下划线,则用双下划线包围
if "__" not in new_src_lang:
self._src_lang = f"__{new_src_lang}__"
else:
self._src_lang = new_src_lang
# 更新源语言的特殊标记
self.set_src_lang_special_tokens(self._src_lang)
@property
# 返回目标语言的私有属性
def tgt_lang(self) -> str:
return self._tgt_lang
@tgt_lang.setter
# 设置新的目标语言,更新私有属性和特殊标记
def tgt_lang(self, new_tgt_lang: str) -> None:
# 如果新的目标语言不含双下划线,则用双下划线包围
if "__" not in new_tgt_lang:
self._tgt_lang = f"__{new_tgt_lang}__"
else:
self._tgt_lang = new_tgt_lang
# 更新目标语言的特殊标记
self.set_tgt_lang_special_tokens(self._tgt_lang)
# 构建带有特殊标记的输入
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. The special tokens depend on calling set_lang.
An SeamlessM4T sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `[src_lang_code] X [eos]`
- `decoder_input_ids`: (for decoder) `[eos, tgt_lang_code] X [eos]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: list of [input IDs](../glossary
"""
if token_ids_1 is None:
# If only one sequence is provided, concatenate with prefix and suffix tokens
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
# If two sequences are provided, concatenate with prefix, both sequences, and suffix tokens
# Note: This is for API consistency; in practice, pairs of sequences are not expected.
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
# Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Define special tokens for separation and classification
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
# Return a list of zeros corresponding to the length of cls + token_ids_0 + sep
return len(cls + token_ids_0 + sep) * [0]
# Return a list of zeros corresponding to the length of cls + token_ids_0 + sep + sep + token_ids_1 + sep
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
if src_lang is None or tgt_lang is None:
# Raise an error if either source language or target language is missing
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
# Set the source language attribute
self.src_lang = src_lang
# Generate model inputs with special tokens added
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
if "__" not in tgt_lang:
# Ensure target language has proper formatting with double underscores
tgt_lang = f"__{tgt_lang}__"
# Convert target language name to token ID
tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
# Add forced beginning-of-sequence token ID to inputs
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
# 从 transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.prepare_seq2seq_batch 复制而来,用于准备序列到序列的批次数据
# 将源语言和目标语言设置为默认值"eng"和"fra",并调用父类方法准备序列到序列的批次数据
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "eng",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "fra",
**kwargs,
) -> BatchEncoding:
self.src_lang = src_lang # 设置源语言
self.tgt_lang = tgt_lang # 设置目标语言
return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
# 从 transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_input_mode 复制而来
# 切换为输入模式,设置当前语言的特殊标记
def _switch_to_input_mode(self):
return self.set_src_lang_special_tokens(self.src_lang)
# 从 transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_target_mode 复制而来
# 切换为目标模式,设置当前语言的特殊标记
def _switch_to_target_mode(self):
return self.set_tgt_lang_special_tokens(self.tgt_lang)
# 设置源语言的特殊标记
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting.
Prefix=[src_lang_code], suffix = [eos]
"""
self.cur_lang_code = self.convert_tokens_to_ids(src_lang) # 将源语言代码转换为对应的标记 ID
# 如果当前语言代码等于未知标记 ID,则发出警告
if self.cur_lang_code == self.unk_token_id:
logger.warning_once(
f"`tgt_lang={src_lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
)
self.init_kwargs["src_lang"] = src_lang # 更新初始化参数中的源语言设置
self.prefix_tokens = [self.cur_lang_code] # 设置前缀特殊标记为当前语言代码
self.suffix_tokens = [self.eos_token_id] # 设置后缀特殊标记为终止标记 ID
# 将标记 ID 转换为对应的字符串形式
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
# 更新分词器的后处理器,使用模板处理方式
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str, # 单句模板
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, # 双句模板
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), # 特殊标记映射
)
# 重设特殊标记为目标语言设置。
# 前缀=[eos, tgt_lang_code],后缀=[eos]。
def set_tgt_lang_special_tokens(self, lang: str) -> None:
# 将当前语言代码转换为对应的标记 ID
self.cur_lang_code = self.convert_tokens_to_ids(lang)
# 如果当前语言代码等于未知标记 ID,则记录警告日志
if self.cur_lang_code == self.unk_token_id:
logger.warning_once(
f"`tgt_lang={lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
)
# 更新初始化参数中的目标语言设置
self.init_kwargs["tgt_lang"] = lang
# 设置前缀标记为 [eos, cur_lang_code]
self.prefix_tokens = [self.eos_token_id, self.cur_lang_code]
# 设置后缀标记为 [eos]
self.suffix_tokens = [self.eos_token_id]
# 将前缀和后缀标记 ID 转换为对应的字符串表示
prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
# 设置 Tokenizer 的后处理器为 TemplateProcessing 对象
self._tokenizer.post_processor = processors.TemplateProcessing(
single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
# 设置特殊标记列表,将标记字符串与其对应的标记 ID 组合成元组
special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
)
# 从 transformers 库中的 NllbTokenizerFast 类的方法 save_vocabulary 复制而来
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# 如果无法保存慢速 Tokenizer 的词汇表,则引发 ValueError 异常
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
# 检查保存目录是否存在,若不存在则记录错误日志并返回
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
return
# 定义输出的词汇表文件路径
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# 如果当前词汇表文件路径与目标路径不同,则复制当前词汇表文件到目标路径
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# 返回输出词汇表文件路径的元组形式
return (out_vocab_file,)
# 类方法,用于从预训练模型或路径加载模型的初始化方法
@classmethod
def _from_pretrained(
cls,
resolved_vocab_files,
pretrained_model_name_or_path,
init_configuration,
*init_inputs,
token=None,
cache_dir=None,
local_files_only=False,
_commit_hash=None,
_is_local=False,
**kwargs,
):
# 方法主体部分包含复杂的初始化逻辑,此处省略不进行详细解释
pass
):
# 调用父类的_from_pretrained方法,初始化tokenizer对象
tokenizer = super()._from_pretrained(
resolved_vocab_files,
pretrained_model_name_or_path,
init_configuration,
*init_inputs,
token=token,
cache_dir=cache_dir,
local_files_only=local_files_only,
_commit_hash=_commit_hash,
_is_local=_is_local,
**kwargs,
)
# 在从预训练模型加载后,确保设置源语言特殊标记
tokenizer.set_src_lang_special_tokens(tokenizer._src_lang)
# 在从预训练模型加载后,确保设置目标语言特殊标记
tokenizer.set_tgt_lang_special_tokens(tokenizer._tgt_lang)
# 返回初始化后的tokenizer对象
return tokenizer
def __call__(
self,
# 输入的文本可以是单一文本、预分词输入或其列表形式
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
# 可选的文本对,可以是单一文本、预分词输入或其列表形式
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
# 目标文本,可以是单一文本、预分词输入或其列表形式
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
# 可选的目标文本对,可以是单一文本、预分词输入或其列表形式
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
# 填充策略,可以是布尔值、字符串或填充策略对象
padding: Union[bool, str, PaddingStrategy] = True,
# 填充至的倍数,可选的整数值
pad_to_multiple_of: Optional[int] = 2,
# 源语言标识符,可选的字符串
src_lang: Optional[str] = None,
# 目标语言标识符,可选的字符串
tgt_lang: Optional[str] = None,
# 其它关键字参数
**kwargs,
.\models\seamless_m4t\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_seamless_m4t": ["SEAMLESS_M4T_PRETRAINED_CONFIG_ARCHIVE_MAP", "SeamlessM4TConfig"],
"feature_extraction_seamless_m4t": ["SeamlessM4TFeatureExtractor"],
"processing_seamless_m4t": ["SeamlessM4TProcessor"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_seamless_m4t"] = ["SeamlessM4TTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_seamless_m4t_fast"] = ["SeamlessM4TTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_seamless_m4t"] = [
"SEAMLESS_M4T_PRETRAINED_MODEL_ARCHIVE_LIST",
"SeamlessM4TForTextToSpeech",
"SeamlessM4TForSpeechToSpeech",
"SeamlessM4TForTextToText",
"SeamlessM4TForSpeechToText",
"SeamlessM4TModel",
"SeamlessM4TPreTrainedModel",
"SeamlessM4TCodeHifiGan",
"SeamlessM4THifiGan",
"SeamlessM4TTextToUnitForConditionalGeneration",
"SeamlessM4TTextToUnitModel",
]
if TYPE_CHECKING:
from .configuration_seamless_m4t import SEAMLESS_M4T_PRETRAINED_CONFIG_ARCHIVE_MAP, SeamlessM4TConfig
from .feature_extraction_seamless_m4t import SeamlessM4TFeatureExtractor
from .processing_seamless_m4t import SeamlessM4TProcessor
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_seamless_m4t import SeamlessM4TTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_seamless_m4t_fast import SeamlessM4TTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_seamless_m4t import (
SEAMLESS_M4T_PRETRAINED_MODEL_ARCHIVE_LIST,
SeamlessM4TCodeHifiGan,
SeamlessM4TForSpeechToSpeech,
SeamlessM4TForSpeechToText,
SeamlessM4TForTextToSpeech,
SeamlessM4TForTextToText,
SeamlessM4THifiGan,
SeamlessM4TModel,
SeamlessM4TPreTrainedModel,
SeamlessM4TTextToUnitForConditionalGeneration,
SeamlessM4TTextToUnitModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)