Transformers Source Code Analysis (Part 105)
.\models\speech_encoder_decoder\convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py
"""Convert Wav2Vec2 checkpoint."""
import argparse
import json
import os
import fairseq
import torch
from torch import nn
from transformers import (
Speech2Text2Config,
Speech2Text2ForCausalLM,
Speech2Text2Tokenizer,
SpeechEncoderDecoderConfig,
SpeechEncoderDecoderModel,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
Wav2Vec2Model,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"quantizer.weight_proj": "quantizer.weight_proj",
"quantizer.vars": "quantizer.codevectors",
"project_q": "project_q",
"final_proj": "project_hid",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
TOP_LEVEL_KEYS = [
"lm_head",
"quantizer.weight_proj",
"quantizer.codevectors",
"project_q",
"project_hid",
]
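The `*` in the mapped keys is a placeholder for the encoder layer index; `recursively_load_weights_wav2vec2` below extracts the index from the fairseq parameter name and substitutes it in. A minimal standalone sketch of that substitution (the fairseq key shown is illustrative, not taken from a real checkpoint):

```python
# Illustrative fairseq parameter name and the mapping entry it matches.
name = "w2v_encoder.w2v_model.encoder.layers.11.self_attn.k_proj.weight"
key = "self_attn.k_proj"
mapped_key = "encoder.layers.*.attention.k_proj"

# Same extraction used in recursively_load_weights_wav2vec2: take the text before
# the matched key and pick the second-to-last dotted component, i.e. the layer index.
layer_index = name.split(key)[0].split(".")[-2]
print(layer_index)                           # -> "11"
print(mapped_key.replace("*", layer_index))  # -> "encoder.layers.11.attention.k_proj"
```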
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights_wav2vec2(fairseq_model, hf_model):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.feature_extractor
proj_weight = None
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
elif name.split(".")[0] == "proj":
proj_weight = fairseq_model.proj
is_used = True
else:
for key, mapped_key in MAPPING.items():
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
weight_type = "weight"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
return proj_weight
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def create_vocab_dict(dict_path):
with open(dict_path, "r", encoding="utf-8") as f:
lines = f.readlines()
words = [line.split(" ")[0] for line in lines]
num_words = len(words)
vocab_dict = {
"<s>": 0,
"<pad>": 1,
"</s>": 2,
"<unk>": 3,
}
vocab_dict.update(dict(zip(words, range(4, num_words + 4))))
return vocab_dict
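`create_vocab_dict` reserves ids 0-3 for the special tokens and numbers the remaining entries of the fairseq dictionary (one `"<token> <count>"` pair per line) from 4 upward. A small demonstration on a made-up dictionary file:

```python
# Demonstration of create_vocab_dict on a made-up fairseq dict file.
import json
import tempfile

fake_dict = "▁the 100\n▁a 80\nhello 5\n"  # "<token> <count>" per line, as fairseq writes it
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write(fake_dict)
    dict_path = f.name

vocab = create_vocab_dict(dict_path)
print(json.dumps(vocab, ensure_ascii=False))
# {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "▁the": 4, "▁a": 5, "hello": 6}
```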
@torch.no_grad()
def convert_wav2vec2_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
dict_path,
encoder_config_path,
decoder_config_path,
vocab_size,
num_decoder_layers,
):
"""
Copy/paste/tweak the original model's weights to the Transformers design.
"""
encoder_config = Wav2Vec2Config.from_pretrained(encoder_config_path)
decoder_config = Speech2Text2Config.from_pretrained(
decoder_config_path, vocab_size=vocab_size, decoder_layers=num_decoder_layers, do_stable_layer_norm=True
)
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=True,
return_attention_mask=True,
)
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
)
model = model[0].eval()
hf_encoder = Wav2Vec2Model(encoder_config)
projection_layer = recursively_load_weights_wav2vec2(model.encoder, hf_encoder)
hf_decoder = Speech2Text2ForCausalLM(decoder_config)
missing_keys, unexpected_keys = hf_decoder.model.decoder.load_state_dict(model.decoder.state_dict(), strict=False)
unexpected_keys.remove("embed_out")
hf_decoder.lm_head.weight = nn.Parameter(model.decoder.embed_out.detach())
logger.warning(f"加载解码器权重时缺少以下键: {missing_keys}")
logger.warning(f"加载解码器权重时出现以下意外的键: {unexpected_keys}")
hf_wav2vec = SpeechEncoderDecoderModel(encoder=hf_encoder, decoder=hf_decoder)
hf_wav2vec.config.tie_word_embeddings = False
hf_wav2vec.enc_to_dec_proj.weight = nn.Parameter(projection_layer.weight)
hf_wav2vec.enc_to_dec_proj.bias = nn.Parameter(projection_layer.bias)
vocab_dict = create_vocab_dict(dict_path)
with open(os.path.join(pytorch_dump_folder_path, "vocab.json"), "w") as fp:
json.dump(vocab_dict, fp)
tokenizer = Speech2Text2Tokenizer(os.path.join(pytorch_dump_folder_path, "vocab.json"))
tokenizer.save_pretrained(pytorch_dump_folder_path)
config = hf_wav2vec.config.to_dict()
config["pad_token_id"] = tokenizer.pad_token_id
config["bos_token_id"] = tokenizer.bos_token_id
config["eos_token_id"] = tokenizer.eos_token_id
config["tokenizer_class"] = "speech_to_text_2"
config["feature_extractor_type"] = "wav2vec2"
hf_wav2vec.config = SpeechEncoderDecoderConfig.from_dict(config)
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument(
"--encoder_config_path",
default="facebook/wav2vec2-large-lv60",
type=str,
help="Path to hf encoder wav2vec2 checkpoint config",
)
parser.add_argument(
"--decoder_config_path",
default="facebook/s2t-small-mustc-en-fr-st",
type=str,
help="Path to hf decoder s2t checkpoint config",
)
parser.add_argument("--vocab_size", default=10224, type=int, help="Vocab size of decoder")
parser.add_argument("--num_decoder_layers", default=7, type=int, help="Number of decoder layers")
args = parser.parse_args()
convert_wav2vec2_checkpoint(
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.dict_path,
encoder_config_path=args.encoder_config_path,
decoder_config_path=args.decoder_config_path,
vocab_size=args.vocab_size,
num_decoder_layers=args.num_decoder_layers,
)
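Once the script has written the dump folder, the converted artifacts can be reloaded through the standard `from_pretrained` API. A minimal sketch, with `"path/to/dump"` standing in for the actual `--pytorch_dump_folder_path`:

```python
# Hedged sketch: reload the artifacts written by the conversion script above.
# "path/to/dump" is a placeholder for --pytorch_dump_folder_path.
from transformers import (
    SpeechEncoderDecoderModel,
    Speech2Text2Tokenizer,
    Wav2Vec2FeatureExtractor,
)

model = SpeechEncoderDecoderModel.from_pretrained("path/to/dump")
tokenizer = Speech2Text2Tokenizer.from_pretrained("path/to/dump")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("path/to/dump")
```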
.\models\speech_encoder_decoder\modeling_flax_speech_encoder_decoder.py
""" Classes to support Flax Speech-Encoder-Decoder architectures"""
import os
from typing import Optional, Tuple, Union
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from jax.random import PRNGKey
from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput
from ...modeling_flax_utils import FlaxPreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from ..auto.configuration_auto import AutoConfig
from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM
from .configuration_speech_encoder_decoder import SpeechEncoderDecoderConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SpeechEncoderDecoderConfig"
SPEECH_ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a speech-sequence-to-text-sequence model with any pretrained speech
autoencoding model as the encoder and any pretrained text autoregressive model as the decoder. The encoder is
loaded via [`~AutoModel.from_pretrained`] function and the decoder is loaded via
[`~AutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically added to the decoder
and should be fine-tuned on a downstream generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn, Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
Additionally, in [Large-Scale Self- and Semi-Supervised Learning for Speech
Translation](https://arxiv.org/abs/2104.06678) it is shown how leveraging large pretrained speech models for speech
translation yields a significant performance improvement.
After such a Speech-Encoder-Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other
model (see the examples for more information).
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving the model, resizing the input embeddings,
pruning heads etc.)
This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html)
subclass. Use it as a regular Flax Module and refer to the Flax documentation for all matters related to general
usage and behavior.
Parameters:
config ([`SpeechEncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
inference on GPUs or TPUs. If specified, all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of the model
parameters.** If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`]
and [`~FlaxPreTrainedModel.to_bf16`].
"""
SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
[`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
`torch.FloatTensor`.
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
and prepending them with the `decoder_start_token_id`.
decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
range `[0, config.decoder.max_position_embeddings - 1]`.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple.
"""
SPEECH_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
Args:
inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
Float values of input raw speech waveform or speech features. Values can be obtained by loading a *.flac*
or *.wav* audio file into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile
library (*pip install soundfile*). To prepare the array into *inputs*, either the [`Wav2Vec2Processor`] or
[`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
*torch.FloatTensor*.
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
If set to `True`, the model will return a [`~utils.FlaxBaseModelOutput`] instead of a plain tuple.
"""
class FlaxSpeechEncoderDecoderModule(nn.Module):
config: SpeechEncoderDecoderConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
encoder_config = self.config.encoder
decoder_config = self.config.decoder
from ...models.auto.modeling_flax_auto import FLAX_MODEL_FOR_CAUSAL_LM_MAPPING, FLAX_MODEL_MAPPING
encoder_module = FLAX_MODEL_MAPPING[encoder_config.__class__].module_class
decoder_module = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING[decoder_config.__class__].module_class
self.encoder = encoder_module(encoder_config, dtype=self.dtype)
self.decoder = decoder_module(decoder_config, dtype=self.dtype)
if (
self.encoder.config.hidden_size != self.decoder.config.hidden_size
and self.decoder.config.cross_attention_hidden_size is None
):
self.enc_to_dec_proj = nn.Dense(
self.decoder.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range),
dtype=self.dtype,
)
else:
self.enc_to_dec_proj = None
def _get_feat_extract_output_lengths(
self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None
):
"""
Computes the output length of the convolutional layers.
"""
add_adapter = self.config.encoder.add_adapter if add_adapter is None else add_adapter
def _conv_out_length(input_length, kernel_size, stride):
return (input_length - kernel_size) // stride + 1
for kernel_size, stride in zip(self.config.encoder.conv_kernel, self.config.encoder.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
if add_adapter:
for _ in range(self.config.encoder.num_adapter_layers):
input_lengths = _conv_out_length(input_lengths, 1, self.config.encoder.adapter_stride)
return input_lengths
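`_get_feat_extract_output_lengths` simply applies the 1-D convolution length formula `(L - kernel) // stride + 1` once per feature-encoder layer. A standalone worked example, using the default `Wav2Vec2Config` kernel sizes and strides:

```python
# Standalone re-run of the length arithmetic above, with the default wav2vec2
# feature-encoder geometry (conv_kernel / conv_stride are the Wav2Vec2Config defaults).
def conv_out_length(input_length, kernel_size, stride):
    return (input_length - kernel_size) // stride + 1

conv_kernel = (10, 3, 3, 3, 3, 2, 2)
conv_stride = (5, 2, 2, 2, 2, 2, 2)

length = 16000  # one second of 16 kHz audio
for k, s in zip(conv_kernel, conv_stride):
    length = conv_out_length(length, k, s)
print(length)  # -> 49 frames, i.e. roughly one frame every 20 ms
```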
def _get_encoder_module(self):
return self.encoder
def _get_projection_module(self):
return self.enc_to_dec_proj
def _get_decoder_module(self):
return self.decoder
def __call__(
self,
inputs,
attention_mask,
decoder_input_ids,
decoder_attention_mask,
decoder_position_ids,
encoder_outputs=None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
freeze_feature_encoder: bool = False,
):
encoder_outputs = self.encoder(
inputs,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
freeze_feature_encoder=freeze_feature_encoder,
)
encoder_hidden_states = encoder_outputs[0]
if self.enc_to_dec_proj is not None:
encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
if attention_mask is not None:
encoder_attention_mask = self.encoder._get_feature_vector_attention_mask(
encoder_hidden_states.shape[1], attention_mask
)
else:
encoder_attention_mask = None
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
position_ids=decoder_position_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
if not return_dict:
return decoder_outputs + encoder_outputs
return FlaxSeq2SeqLMOutput(
logits=decoder_outputs.logits,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_hidden_states,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
@add_start_docstrings(SPEECH_ENCODER_DECODER_START_DOCSTRING)
class FlaxSpeechEncoderDecoderModel(FlaxPreTrainedModel):
r"""
[`FlaxSpeechEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture
with the module (flax.nn.Module) of one of the base model classes of the library as encoder module and another one
as decoder module when created with the :meth*~transformers.FlaxAutoModel.from_pretrained* class method for the
encoder and :meth*~transformers.FlaxAutoModelForCausalLM.from_pretrained* class method for the decoder.
"""
config_class = SpeechEncoderDecoderConfig
base_model_prefix: str = "speech_encoder_decoder"
module_class = FlaxSpeechEncoderDecoderModule
def __init__(
self,
config: SpeechEncoderDecoderConfig,
input_shape: Optional[Tuple] = None,
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
if not _do_init:
raise ValueError(
"`FlaxSpeechEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`."
)
if config.decoder.cross_attention_hidden_size is not None:
if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
raise ValueError(
"If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
" `config.encoder.hidden_size`."
)
config.tie_word_embeddings = False
module = self.module_class(config=config, dtype=dtype, **kwargs)
if input_shape is None:
encoder_input_length = 1024
decoder_input_length = module._get_feat_extract_output_lengths(encoder_input_length)
input_shape = ((1, encoder_input_length), (1, decoder_input_length))
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
encoder_input_shape, decoder_input_shape = input_shape
inputs = jnp.zeros(encoder_input_shape, dtype="f4")
attention_mask = jnp.ones_like(inputs, dtype="i4")
decoder_input_ids = jnp.zeros(decoder_input_shape, dtype="i4")
decoder_attention_mask = jnp.ones_like(decoder_input_ids)
batch_size, sequence_length = inputs.shape
decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape
if not decoder_batch_size == batch_size:
raise ValueError(
f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder"
f" and {decoder_batch_size} for decoder."
)
decoder_position_ids = jnp.broadcast_to(
jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length)
)
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
random_params = self.module.init(
rngs,
inputs,
attention_mask,
decoder_input_ids,
decoder_attention_mask,
decoder_position_ids,
)["params"]
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
return random_params
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
batch_size (`int`):
用于快速自回归解码的批大小。定义了初始化缓存时的批大小。
max_length (`int`):
自回归解码的最大可能长度。定义了初始化缓存时的序列长度。
encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
`encoder_outputs` 包括 (`last_hidden_state`, *可选*: `hidden_states`, *可选*: `attentions`)。
`last_hidden_state` 的形状为 `(batch_size, sequence_length, hidden_size)`,*可选* 的隐藏状态序列,
是编码器最后一层的输出的隐藏状态序列。在解码器的交叉注意力中使用。
"""
decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
decoder_attention_mask = jnp.ones_like(decoder_input_ids)
decoder_position_ids = jnp.broadcast_to(
jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
)
def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
decoder_module = module._get_decoder_module()
return decoder_module(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
position_ids=decoder_position_ids,
**kwargs,
)
init_variables = self.module.init(
jax.random.PRNGKey(0),
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
decoder_position_ids=decoder_position_ids,
encoder_hidden_states=encoder_outputs[0],
init_cache=True,
method=_decoder_forward,
)
return unfreeze(init_variables["cache"])
def _get_feat_extract_output_lengths(
self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None
):
return self.module._get_feat_extract_output_lengths(input_lengths, add_adapter=add_adapter)
@add_start_docstrings(SPEECH_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC)
def encode(
self,
inputs: jnp.ndarray,
attention_mask: Optional[jnp.ndarray] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
freeze_feature_encoder: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
r"""
Returns:
Example:
```
>>> from transformers import FlaxSpeechEncoderDecoderModel
>>> # initialize a wav2vec2-2-bart from pretrained wav2vec2 and bart models. Note that the cross-attention layers will be randomly initialized
>>> model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
... "facebook/wav2vec2-large-lv60", "facebook/bart-large"
... )
>>> inputs = jnp.ones((2, 5000), dtype=jnp.float32)
>>> encoder_outputs = model.encode(inputs)
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.return_dict
if attention_mask is None:
attention_mask = jnp.ones_like(inputs, dtype="i4")
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
def _encoder_forward(module, inputs, attention_mask, **kwargs):
encode_module = module._get_encoder_module()
return encode_module(inputs, attention_mask, **kwargs)
outputs = self.module.apply(
{"params": params or self.params},
inputs=jnp.array(inputs, dtype="f4"),
attention_mask=jnp.array(attention_mask, dtype="i4"),
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=not train,
freeze_feature_encoder=freeze_feature_encoder,
rngs=rngs,
method=_encoder_forward,
)
if return_dict:
outputs = FlaxBaseModelOutput(
last_hidden_state=outputs.last_hidden_state,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
return outputs
@add_start_docstrings(SPEECH_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def decode(
self,
decoder_input_ids,
encoder_outputs,
encoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_position_ids: Optional[jnp.ndarray] = None,
past_key_values: dict = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
r"""
@add_start_docstrings_to_model_forward(SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING)
```
# 使用装饰器替换返回文档字符串,指定输出类型为FlaxSeq2SeqLMOutput,配置类为_CONFIG_FOR_DOC
@replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
# 定义实例方法__call__,用于执行模型推理或训练
def __call__(
self,
# input features, a jnp array
inputs: jnp.ndarray,
# optional attention mask indicating which input positions are padding
attention_mask: Optional[jnp.ndarray] = None,
# optional decoder input token ids used to generate the sequence
decoder_input_ids: Optional[jnp.ndarray] = None,
# optional decoder attention mask indicating which decoder positions are padding
decoder_attention_mask: Optional[jnp.ndarray] = None,
# optional decoder position ids giving the position of each decoder token in the sequence
decoder_position_ids: Optional[jnp.ndarray] = None,
# optional flag: whether to return the attention weights
output_attentions: Optional[bool] = None,
# optional flag: whether to return the hidden states
output_hidden_states: Optional[bool] = None,
# optional flag: whether to return the outputs as a dict
return_dict: Optional[bool] = None,
# whether the model is in training mode
train: bool = False,
# whether to freeze the feature encoder
freeze_feature_encoder: bool = False,
# dict of model parameters
params: dict = None,
# PRNG key for dropout
dropout_rng: PRNGKey = None,
):
r"""
Returns:
Examples:
```
>>> from transformers import FlaxSpeechEncoderDecoderModel, AutoTokenizer
>>>
>>> model = FlaxSpeechEncoderDecoderModel.from_pretrained("patrickvonplaten/wav2vec2-2-bart-large")
>>>
>>> tokenizer_output = AutoTokenizer.from_pretrained("facebook/bart-large")
>>> inputs = jnp.ones((2, 5000), dtype=jnp.float32)
>>>
>>> model.config.decoder_start_token_id = model.decoder.config.bos_token_id
>>> model.config.pad_token_id = model.decoder.config.pad_token_id
>>> model.config.eos_token_id = model.decoder.config.eos_token_id
>>> outputs = model.generate(inputs)
```
"""
# Decide whether to use provided output attentions setting or default from model config
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Decide whether to use provided output hidden states setting or default from model config
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Decide whether to use provided return dict setting or default from model config
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Prepare encoder inputs: if attention_mask is not provided, create one with all ones
if attention_mask is None:
attention_mask = jnp.ones_like(inputs, dtype="i4")
# Prepare decoder inputs: decoder_input_ids cannot be None, raise error if so
if decoder_input_ids is None:
raise ValueError(
"`decoder_input_ids` cannot be `None`. For sequence to sequence training, `decoder_position_ids` must"
" be specified as an input argument."
)
# Prepare decoder attention mask: if not provided, create one with all ones
if decoder_attention_mask is None:
decoder_attention_mask = jnp.ones_like(decoder_input_ids)
# Prepare decoder position ids: if not provided, broadcast from a range of sequence lengths
if decoder_position_ids is None:
batch_size, sequence_length = decoder_input_ids.shape
decoder_position_ids = jnp.broadcast_to(
jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
)
# Handle any dropout random number generator if provided
rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
# Apply the Flax module to the inputs and other provided arguments
return self.module.apply(
{"params": params or self.params},
inputs=jnp.array(inputs, dtype="f4"),
attention_mask=jnp.array(attention_mask, dtype="i4"),
decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=not train,
freeze_feature_encoder=freeze_feature_encoder,
rngs=rngs,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
max_length,
attention_mask: Optional[jax.Array] = None,
decoder_attention_mask: Optional[jax.Array] = None,
encoder_outputs=None,
**kwargs,
):
# initializing the cache
# get the batch size and the decoder input sequence length
batch_size, seq_length = decoder_input_ids.shape
# initialize the cache and obtain the past key/value states
past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
# since the decoder uses a causal mask, a static all-ones attention mask of length max_length can be created
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
# if a decoder attention mask is provided, use it to update the static mask
if decoder_attention_mask is not None:
# position ids are the cumulative sum of the attention mask minus one
decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
else:
# otherwise broadcast a simple range of position ids
decoder_position_ids = jnp.broadcast_to(
jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)
)
# return the generation inputs: cache, encoder outputs, encoder attention mask, extended decoder attention mask and decoder position ids
return {
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"encoder_attention_mask": attention_mask,
"decoder_attention_mask": extended_attention_mask,
"decoder_position_ids": decoder_position_ids,
}
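The position ids derived above advance only on non-padded decoder steps because they come from `cumsum(-1) - 1` over the attention mask. A tiny check of that arithmetic, in plain numpy for illustration (the model itself does the same with `jax.numpy`):

```python
# Tiny check of the position-id arithmetic used in prepare_inputs_for_generation.
import numpy as np

decoder_attention_mask = np.array([[1, 1, 1, 0, 0]])
decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
print(decoder_position_ids)  # [[0 1 2 2 2]] -- padded steps keep the last valid position
```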
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# update the inputs for the next generation step
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
return model_kwargs
@classmethod
def from_encoder_decoder_pretrained(
cls,
encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
*model_args,
**kwargs,
):
# Load the model from pretrained encoder and decoder checkpoints
pass  # implementation omitted here; the body carries no code and only illustrates this classmethod's loading role
.\models\speech_encoder_decoder\modeling_speech_encoder_decoder.py
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...configuration_utils import PretrainedConfig
from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from ..auto.configuration_auto import AutoConfig
from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM
from .configuration_speech_encoder_decoder import SpeechEncoderDecoderConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SpeechEncoderDecoderConfig"
SPEECH_ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a speech-sequence-to-text-sequence model with any pretrained speech
autoencoding model as the encoder and any pretrained text autoregressive model as the decoder. The encoder is
loaded via [`~AutoModel.from_pretrained`] function and the decoder is loaded via
[`~AutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically added to the decoder
and should be fine-tuned on a downstream generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn, Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
Additionally, in [Large-Scale Self- and Semi-Supervised Learning for Speech
Translation](https://arxiv.org/abs/2104.06678) it is shown how leveraging large pretrained speech models for speech
translation yields a significant performance improvement.
After such a Speech-Encoder-Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other
model (see the examples for more information).
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SpeechEncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
"""
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
if decoder_start_token_id is None:
raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
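The helper prepends `decoder_start_token_id`, drops the last token, and replaces any `-100` label-ignore markers with `pad_token_id`. A small worked example with made-up token ids:

```python
# Worked example of the shift_tokens_right helper defined above
# (pad_token_id=1, decoder_start_token_id=2, -100 marks ignored label positions).
import torch

labels = torch.tensor([[5, 6, 7, -100, -100]])
decoder_input_ids = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
print(decoder_input_ids)  # tensor([[2, 5, 6, 7, 1]])
```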
@add_start_docstrings(SPEECH_ENCODER_DECODER_START_DOCSTRING)
class SpeechEncoderDecoderModel(PreTrainedModel):
r"""
[`SpeechEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with
one of the base model classes of the library as encoder and another one as decoder when created with the
:meth*~transformers.AutoModel.from_pretrained* class method for the encoder and
:meth*~transformers.AutoModelForCausalLM.from_pretrained* class method for the decoder.
"""
config_class = SpeechEncoderDecoderConfig
base_model_prefix = "speech_encoder_decoder"
main_input_name = "inputs"
supports_gradient_checkpointing = True
def __init__(
self,
config: Optional[PretrainedConfig] = None,
encoder: Optional[PreTrainedModel] = None,
decoder: Optional[PreTrainedModel] = None,
):
super().__init__(config)
self.encoder = encoder
self.decoder = decoder
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def get_output_embeddings(self):
return self.decoder.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
return self.decoder.set_output_embeddings(new_embeddings)
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder of the speech encoder so
that its parameters will not be updated during training.
"""
self.encoder.freeze_feature_encoder()
@classmethod
def from_pretrained(cls, *args, **kwargs):
if kwargs.get("_fast_init", False):
logger.warning(
"Fast initialization is currently not supported for SpeechEncoderDecoderModel. "
"Falling back to slow initialization..."
)
kwargs["_fast_init"] = False
return super().from_pretrained(*args, **kwargs)
@add_start_docstrings_to_model_forward(SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
inputs: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
input_values: Optional[torch.FloatTensor] = None,
input_features: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
):
decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None
input_dict = {
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"decoder_input_ids": decoder_inputs["input_ids"],
"encoder_outputs": encoder_outputs,
"past_key_values": decoder_inputs["past_key_values"],
"use_cache": use_cache,
}
return input_dict
def resize_token_embeddings(self, *args, **kwargs):
raise NotImplementedError(
"Resizing the embedding layers via the SpeechEncoderDecoderModel directly is not supported. Please use the"
" respective methods of the wrapped decoder object (model.decoder.resize_token_embeddings(...))"
)
def _reorder_cache(self, past_key_values, beam_idx):
return self.decoder._reorder_cache(past_key_values, beam_idx)
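The walkthrough elides `from_encoder_decoder_pretrained`, so here is a short, hedged usage sketch of how the class is typically composed. The checkpoint names are only illustrative; any speech encoder / causal-LM decoder pair supported by `AutoModel` / `AutoModelForCausalLM` works:

```python
# Hedged usage sketch: compose a SpeechEncoderDecoderModel from two pretrained checkpoints.
from transformers import SpeechEncoderDecoderModel

model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
    "facebook/wav2vec2-base-960h", "bert-base-uncased"
)
# The cross-attention layers in the decoder are freshly initialized, so the model
# should be fine-tuned before use; the convolutional feature encoder can be frozen.
model.freeze_feature_encoder()
```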
.\models\speech_encoder_decoder\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available
_import_structure = {"configuration_speech_encoder_decoder": ["SpeechEncoderDecoderConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_speech_encoder_decoder"] = ["SpeechEncoderDecoderModel"]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_speech_encoder_decoder"] = ["FlaxSpeechEncoderDecoderModel"]
if TYPE_CHECKING:
from .configuration_speech_encoder_decoder import SpeechEncoderDecoderConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_speech_encoder_decoder import SpeechEncoderDecoderModel
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_speech_encoder_decoder import FlaxSpeechEncoderDecoderModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\speech_to_text\configuration_speech_to_text.py
class Speech2TextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Speech2TextModel`]. It is used to instantiate a
Speech2Text model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Speech2Text
[facebook/s2t-small-librispeech-asr](https://huggingface.co/facebook/s2t-small-librispeech-asr) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import Speech2TextConfig, Speech2TextModel
>>> # Initializing a Speech2Text s2t_transformer_s style configuration
>>> configuration = Speech2TextConfig()
>>> # Initializing a model (with random weights) from the s2t_transformer_s style configuration
>>> model = Speech2TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "speech_to_text"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=10000,
encoder_layers=12,
encoder_ffn_dim=2048,
encoder_attention_heads=4,
decoder_layers=6,
decoder_ffn_dim=2048,
decoder_attention_heads=4,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
use_cache=True,
is_encoder_decoder=True,
activation_function="relu",
d_model=256,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=2,
scale_embedding=True,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
max_source_positions=6000,
max_target_positions=1024,
num_conv_layers=2,
conv_kernel_sizes=(5, 5),
conv_channels=1024,
input_feat_per_channel=80,
input_channels=1,
**kwargs,
):
self.vocab_size = vocab_size
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
self.max_source_positions = max_source_positions
self.max_target_positions = max_target_positions
self.num_conv_layers = num_conv_layers
self.conv_kernel_sizes = list(conv_kernel_sizes)
self.conv_channels = conv_channels
self.input_feat_per_channel = input_feat_per_channel
self.input_channels = input_channels
if len(self.conv_kernel_sizes) != self.num_conv_layers:
raise ValueError(
"Configuration for convolutional module is incorrect. "
"It is required that `len(config.conv_kernel_sizes)` == `config.num_conv_layers` "
f"but is `len(config.conv_kernel_sizes) = {len(self.conv_kernel_sizes)}`, "
f"`config.num_conv_layers = {self.num_conv_layers}`."
)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
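The constructor above requires the convolutional front-end to be consistently specified: `len(conv_kernel_sizes)` must equal `num_conv_layers`, otherwise a `ValueError` is raised. A small sketch of that check:

```python
# Small sketch of the constraint enforced by Speech2TextConfig.__init__.
from transformers import Speech2TextConfig

ok = Speech2TextConfig(num_conv_layers=2, conv_kernel_sizes=(5, 5))  # consistent: fine
try:
    Speech2TextConfig(num_conv_layers=3, conv_kernel_sizes=(5, 5))   # mismatched: raises
except ValueError as e:
    print(e)
```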
.\models\speech_to_text\convert_s2t_fairseq_to_tfms.py
import argparse
import torch
from torch import nn
from transformers import Speech2TextConfig, Speech2TextForConditionalGeneration
def remove_ignore_keys_(state_dict):
ignore_keys = [
"encoder.version",
"decoder.version",
"model.encoder.version",
"model.decoder.version",
"decoder.output_projection.weight",
"_float_tensor",
"encoder.embed_positions._float_tensor",
"decoder.embed_positions._float_tensor",
]
for k in ignore_keys:
state_dict.pop(k, None)
def rename_keys(s_dict):
keys = list(s_dict.keys())
for key in keys:
if "transformer_layers" in key:
s_dict[key.replace("transformer_layers", "layers")] = s_dict.pop(key)
elif "subsample" in key:
s_dict[key.replace("subsample", "conv")] = s_dict.pop(key)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def convert_fairseq_s2t_checkpoint_to_tfms(checkpoint_path, pytorch_dump_folder_path):
m2m_100 = torch.load(checkpoint_path, map_location="cpu")
args = m2m_100["args"]
state_dict = m2m_100["model"]
lm_head_weights = state_dict["decoder.output_projection.weight"]
remove_ignore_keys_(state_dict)
rename_keys(state_dict)
vocab_size = state_dict["decoder.embed_tokens.weight"].shape[0]
tie_embeds = args.share_decoder_input_output_embed
conv_kernel_sizes = [int(i) for i in args.conv_kernel_sizes.split(",")]
config = Speech2TextConfig(
vocab_size=vocab_size,
max_source_positions=args.max_source_positions,
max_target_positions=args.max_target_positions,
encoder_layers=args.encoder_layers,
decoder_layers=args.decoder_layers,
encoder_attention_heads=args.encoder_attention_heads,
decoder_attention_heads=args.decoder_attention_heads,
encoder_ffn_dim=args.encoder_ffn_embed_dim,
decoder_ffn_dim=args.decoder_ffn_embed_dim,
d_model=args.encoder_embed_dim,
dropout=args.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_function="relu",
num_conv_layers=len(conv_kernel_sizes),
conv_channels=args.conv_channels,
conv_kernel_sizes=conv_kernel_sizes,
input_feat_per_channel=args.input_feat_per_channel,
input_channels=args.input_channels,
tie_word_embeddings=tie_embeds,
num_beams=5,
max_length=200,
use_cache=True,
decoder_start_token_id=2,
early_stopping=True,
)
model = Speech2TextForConditionalGeneration(config)
missing, unexpected = model.model.load_state_dict(state_dict, strict=False)
if len(missing) > 0 and not set(missing) <= {
"encoder.embed_positions.weights",
"decoder.embed_positions.weights",
}:
raise ValueError(
"Only `encoder.embed_positions.weights` and `decoder.embed_positions.weights` are allowed to be missing,"
f" but all the following weights are missing {missing}"
)
if tie_embeds:
model.lm_head = make_linear_from_emb(model.model.decoder.embed_tokens)
else:
model.lm_head.weight.data = lm_head_weights
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--fairseq_path", type=str, help="Path to the fairseq model (.pt) file.")
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
args = parser.parse_args()
convert_fairseq_s2t_checkpoint_to_tfms(args.fairseq_path, args.pytorch_dump_folder_path)
.\models\speech_to_text\feature_extraction_speech_to_text.py
class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
def __init__(
self,
feature_size=80,
sampling_rate=16000,
num_mel_bins=80,
padding_value=0.0,
do_ceptral_normalize=True,
normalize_means=True,
normalize_vars=True,
**kwargs,
):
super().__init__(**kwargs)
self.feature_size = feature_size
self.sampling_rate = sampling_rate
self.num_mel_bins = num_mel_bins
self.padding_value = padding_value
self.do_ceptral_normalize = do_ceptral_normalize
self.normalize_means = normalize_means
self.normalize_vars = normalize_vars
def _extract_mel_features(self, signal: np.ndarray) -> np.ndarray:
"""
Extracts Mel-filter bank features from raw speech signal.
Args:
signal (`np.ndarray`): Raw speech signal.
Returns:
`np.ndarray`: Extracted Mel-filter bank features.
"""
return mel_filter_bank(
signal,
self.sampling_rate,
self.num_mel_bins,
)
def _apply_cmvn(self, features: np.ndarray) -> np.ndarray:
"""
Applies utterance-level cepstral mean and variance normalization (CMVN) to the extracted features.
Args:
features (`np.ndarray`): Extracted features.
Returns:
`np.ndarray`: Features after CMVN.
"""
means = np.mean(features, axis=1, keepdims=True)
variances = np.var(features, axis=1, keepdims=True)
if self.normalize_means:
features -= means
if self.normalize_vars:
features /= np.sqrt(variances + 1e-5)
return features
def _extract_features(self, signal: np.ndarray) -> BatchFeature:
"""
Extracts features from the raw speech signal.
Args:
signal (`np.ndarray`): Raw speech signal.
Returns:
`BatchFeature`: Batch of extracted features.
"""
spectrogram_feats = spectrogram(
signal,
self.sampling_rate,
window_function,
)
mel_feats = self._extract_mel_features(spectrogram_feats)
if self.do_ceptral_normalize:
mel_feats = self._apply_cmvn(mel_feats)
return BatchFeature(input_features=mel_feats, attention_mask=np.ones_like(mel_feats, dtype=np.float32))
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.num_mel_bins = num_mel_bins
self.do_ceptral_normalize = do_ceptral_normalize
self.normalize_means = normalize_means
self.normalize_vars = normalize_vars
self.return_attention_mask = True
if not is_speech_available():
mel_filters = mel_filter_bank(
num_frequency_bins=256,
num_mel_filters=self.num_mel_bins,
min_frequency=20,
max_frequency=sampling_rate // 2,
sampling_rate=sampling_rate,
norm=None,
mel_scale="kaldi",
triangularize_in_mel_space=True,
)
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
self.window = window_function(400, "povey", periodic=False)
def _extract_fbank_features(
self,
waveform: np.ndarray,
) -> np.ndarray:
"""
Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
and hence the waveform should not be normalized before feature extraction.
"""
waveform = waveform * (2**15)
if is_speech_available():
waveform = torch.from_numpy(waveform).unsqueeze(0)
features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate)
features = features.numpy()
else:
waveform = np.squeeze(waveform)
features = spectrogram(
waveform,
self.window,
frame_length=400,
hop_length=160,
fft_length=512,
power=2.0,
center=False,
preemphasis=0.97,
mel_filters=self.mel_filters,
log_mel="log",
mel_floor=1.192092955078125e-07,
remove_dc_offset=True,
).T
return features
@staticmethod
def utterance_cmvn(
x: np.ndarray,
input_length: int,
normalize_means: Optional[bool] = True,
normalize_vars: Optional[bool] = True,
padding_value: float = 0.0,
) -> np.ndarray:
if normalize_means:
mean = x[:input_length].mean(axis=0)
x = np.subtract(x, mean)
if normalize_vars:
std = x[:input_length].std(axis=0)
x = np.divide(x, std)
if input_length < x.shape[0]:
x[input_length:] = padding_value
x = x.astype(np.float32)
return x
def normalize(
self, input_features: List[np.ndarray], attention_mask: Optional[np.ndarray] = None
) -> List[np.ndarray]:
lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
return [
self.utterance_cmvn(x, n, self.normalize_means, self.normalize_vars, self.padding_value)
for x, n in zip(input_features, lengths)
]
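Utterance-level CMVN subtracts the per-utterance mean and divides by the per-utterance standard deviation computed over the valid frames only, then resets the padded tail to `padding_value`. A plain-numpy re-run of the same arithmetic on a tiny, made-up feature matrix:

```python
# Numpy re-run of the utterance_cmvn arithmetic (3 valid frames out of 4, 2 mel bins).
import numpy as np

x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [0.0, 0.0]])  # last row is padding
input_length = 3  # number of valid (non-padded) frames

mean = x[:input_length].mean(axis=0)
x = x - mean
std = x[:input_length].std(axis=0)
x = x / std
x[input_length:] = 0.0  # padding_value

print(x[:input_length].mean(axis=0))  # ~0 per mel bin after normalization
print(x[input_length])                # padding row reset to 0.0
```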
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
**kwargs,
.\models\speech_to_text\modeling_speech_to_text.py
""" PyTorch Speech2Text model."""
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_speech_to_text import Speech2TextConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Speech2TextConfig"
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/s2t-small-librispeech-asr",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
class Conv1dSubsampler(nn.Module):
"""
Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
via gated linear units (https://arxiv.org/abs/1911.08460)
"""
def __init__(self, config):
super(Conv1dSubsampler, self).__init__()
self.config = config
self.num_layers = config.num_conv_layers
self.in_channels = config.input_feat_per_channel * config.input_channels
self.mid_channels = config.conv_channels
self.out_channels = config.d_model
self.kernel_sizes = config.conv_kernel_sizes
self.conv_layers = nn.ModuleList(
nn.Conv1d(
self.in_channels if i == 0 else self.mid_channels // 2,
self.mid_channels if i < self.num_layers - 1 else self.out_channels * 2,
kernel_size=k,
stride=2,
padding=k // 2,
)
for i, k in enumerate(self.kernel_sizes)
)
def forward(self, input_features):
hidden_states = input_features.transpose(1, 2).contiguous()
for conv in self.conv_layers:
hidden_states = conv(hidden_states)
hidden_states = nn.functional.glu(hidden_states, dim=1)
hidden_states = hidden_states.transpose(1, 2).contiguous()
return hidden_states
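Each `Conv1d` in the subsampler has stride 2 and doubles the channel count so that the GLU that follows halves it back, ending at `d_model`. A shape walk-through with the default `Speech2TextConfig` (80 mel bins, two stride-2 conv layers, `d_model=256`); the import path points at the internal module walked through here:

```python
# Shape walk-through of Conv1dSubsampler with the default Speech2TextConfig.
import torch
from transformers import Speech2TextConfig
from transformers.models.speech_to_text.modeling_speech_to_text import Conv1dSubsampler

config = Speech2TextConfig()
subsampler = Conv1dSubsampler(config)

input_features = torch.randn(1, 100, 80)  # (batch, frames, mel bins)
hidden_states = subsampler(input_features)
print(hidden_states.shape)  # torch.Size([1, 25, 256]); each stride-2 conv halves the frame count
```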
class Speech2TextSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len = input_ids.size()
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
max_pos = self.padding_idx + 1 + seq_len
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
def create_position_ids_from_input_ids(
self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor representing input tensor with token IDs
padding_idx: int, index of padding token in input_ids
past_key_values_length: int, length of past key values to be considered
Returns:
torch.Tensor representing the tensor with position indices
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
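# Illustrative sketch (not part of the original source): position ids are derived from the
# token ids themselves -- padding positions keep padding_idx, real tokens count up from
# padding_idx + 1 -- and then simply index the sinusoidal table.
_pos_emb = Speech2TextSinusoidalPositionalEmbedding(num_positions=16, embedding_dim=8, padding_idx=1)
_token_ids = torch.tensor([[5, 7, 9, 1, 1]])  # made-up ids, 1 is the pad token
_position_ids = _pos_emb.create_position_ids_from_input_ids(_token_ids, padding_idx=1)
# _position_ids -> tensor([[2, 3, 4, 1, 1]])
_position_vectors = _pos_emb(_token_ids)  # shape (1, 5, 8)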
class Speech2TextAttention(nn.Module):
"""来自“Attention Is All You Need”论文的多头注意力机制"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[Speech2TextConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim必须能够被num_heads整除 (当前 `embed_dim`: {self.embed_dim}"
f" 和 `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
"""将张量形状重新排列为(bsz, num_heads, seq_len, head_dim),并转置前两个维度"""
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
"""前向传播函数,实现注意力机制的计算"""
pass
class Speech2TextEncoderLayer(nn.Module):
def __init__(self, config: Speech2TextConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
layer_head_mask: torch.Tensor,
output_attentions: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
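# Note (not part of the original source): the clamp above is a float16 overflow guard.
# torch.finfo gives the representable range that activations are pinned to when an inf/nan
# appears after the residual additions; well-behaved batches skip the clamp entirely.
_fp16_max = torch.finfo(torch.float16).max  # 65504.0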
class Speech2TextDecoderLayer(nn.Module):
def __init__(self, config: Speech2TextConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        pass
class Speech2TextPreTrainedModel(PreTrainedModel):
config_class = Speech2TextConfig
base_model_prefix = "model"
main_input_name = "input_features"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
"""
Computes the output length of the convolutional layers
"""
for i in range(self.config.num_conv_layers):
input_lengths = (input_lengths - 1) // 2 + 1
return input_lengths
def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
bsz = attention_mask.size()[0]
attention_mask = torch.zeros(
(bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long()
return attention_mask
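# Illustrative sketch (not part of the original source): every conv layer has stride 2, so the
# valid length shrinks as (length - 1) // 2 + 1 per layer; the helper above then marks the last
# valid subsampled frame and back-fills the mask with a reversed cumulative sum.
_raw_lengths = torch.tensor([100, 37])
_sub_lengths = _raw_lengths
for _ in range(2):  # assuming the default two conv layers
    _sub_lengths = (_sub_lengths - 1) // 2 + 1
# _sub_lengths -> tensor([25, 10]); the rebuilt (2, 25) mask then has 25 and 10 leading ones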
SPEECH_TO_TEXT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`Speech2TextConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SPEECH_TO_TEXT_INPUTS_DOCSTRING = r"""
"""
class Speech2TextEncoder(Speech2TextPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`Speech2TextEncoderLayer`].
Args:
config: Speech2TextConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: Speech2TextConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_source_positions
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.conv = Conv1dSubsampler(config)
self.embed_positions = Speech2TextSinusoidalPositionalEmbedding(
self.max_source_positions,
embed_dim,
self.padding_idx,
)
self.layers = nn.ModuleList([Speech2TextEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
input_features,
attention_mask=None,
head_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
pass
class Speech2TextDecoder(Speech2TextPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Speech2TextDecoderLayer`]
Args:
config: Speech2TextConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: Speech2TextConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_target_positions
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = Speech2TextSinusoidalPositionalEmbedding(
self.max_target_positions,
config.d_model,
self.padding_idx,
)
self.layers = nn.ModuleList([Speech2TextDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
        return_dict=None,
    ):
        pass
@add_start_docstrings(
"The bare Speech2Text Model outputting raw hidden-states without any specific head on top.",
SPEECH_TO_TEXT_START_DOCSTRING,
)
class Speech2TextModel(Speech2TextPreTrainedModel):
def __init__(self, config: Speech2TextConfig):
super().__init__(config)
self.encoder = Speech2TextEncoder(config)
self.decoder = Speech2TextDecoder(config)
self.post_init()
def get_input_embeddings(self):
return self.decoder.embed_tokens
def set_input_embeddings(self, value):
self.decoder.embed_tokens = value
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_features: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"The Speech2Text Model with a language modeling head. Can be used for summarization.",
SPEECH_TO_TEXT_START_DOCSTRING,
)
class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel):
base_model_prefix = "model"
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config: Speech2TextConfig):
super().__init__(config)
self.model = Speech2TextModel(config)
self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False)
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_features: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ):
        pass
    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        if past_key_values is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]
return {
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
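# Illustrative sketch (not part of the original source): during beam search the cache is laid
# out with one entry per beam along dim 0, so reordering reduces to an index_select with the
# surviving beam indices. Shapes and indices below are made up.
_beam_idx = torch.tensor([2, 0, 0, 1])  # which old beam each new beam continues from
_layer_past = tuple(torch.randn(4, 8, 5, 64) for _ in range(4))  # (self k, self v, cross k, cross v)
_reordered = Speech2TextForConditionalGeneration._reorder_cache((_layer_past,), _beam_idx)
# _reordered[0][0][0] equals _layer_past[0][2]: new beam 0 inherits old beam 2's cached keys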
.\models\speech_to_text\modeling_tf_speech_to_text.py
""" TensorFlow Speech2Text model."""
from __future__ import annotations
import random
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation, glu
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPastAndCrossAttentions,
TFSeq2SeqLMOutput,
TFSeq2SeqModelOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
TFSharedEmbeddings,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_speech_to_text import Speech2TextConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Speech2TextConfig"
_CHECKPOINT_FOR_DOC = "facebook/s2t-small-librispeech-asr"
TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/s2t-small-librispeech-asr",
]
LARGE_NEGATIVE = -1e8
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
pad_token_id = tf.cast(pad_token_id, input_ids.dtype)
decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
start_tokens = tf.fill(
(shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype)
)
shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
shifted_input_ids = tf.where(
shifted_input_ids == -100,
tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)),
shifted_input_ids,
)
    # Verify that the shifted ids contain only non-negative values, and make sure the assertion
    # op actually runs by wrapping the result in an identity no-op.
    assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype))
    with tf.control_dependencies([assert_gte0]):
        shifted_input_ids = tf.identity(shifted_input_ids)
    return shifted_input_ids
def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
"""
Make causal mask used for bi-directional self-attention.
"""
bsz = input_ids_shape[0]
tgt_len = input_ids_shape[1]
mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
mask_cond = tf.range(shape_list(mask)[-1])
mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)
if past_key_values_length > 0:
mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
src_len = shape_list(mask)[1]
tgt_len = tgt_len if tgt_len is not None else src_len
one_cst = tf.constant(1.0)
mask = tf.cast(mask, dtype=one_cst.dtype)
expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
return (one_cst - expanded_mask) * LARGE_NEGATIVE
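# Illustrative sketch (not part of the original source): _expand_mask turns a (bsz, src_len)
# padding mask of 1s and 0s into the additive (bsz, 1, tgt_len, src_len) form used by the
# attention layers (kept positions become 0.0, masked ones LARGE_NEGATIVE), while
# _make_causal_mask builds the lower-triangular decoder mask in the same additive form.
_pad_mask = tf.constant([[1.0, 1.0, 1.0, 0.0]])
_additive_mask = _expand_mask(_pad_mask)   # shape (1, 1, 4, 4); last column is -1e8
_causal_mask = _make_causal_mask([1, 3])   # shape (1, 1, 3, 3); upper triangle is -1e8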
class TFConv1dSubsampler(keras.layers.Layer):
"""
Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
via gated linear units (https://arxiv.org/abs/1911.08460)
"""
def __init__(self, config: Speech2TextConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.num_layers = config.num_conv_layers
self.in_channels = config.input_feat_per_channel * config.input_channels
self.mid_channels = config.conv_channels
self.out_channels = config.d_model
self.kernel_sizes = config.conv_kernel_sizes
self.conv_layers = [
keras.layers.Conv1D(
filters=self.mid_channels if i < self.num_layers - 1 else self.out_channels * 2,
kernel_size=k,
strides=2,
name=f"conv_layers.{i}",
)
for i, k in enumerate(self.kernel_sizes)
]
def call(self, input_features: tf.Tensor) -> tf.Tensor:
hidden_states = tf.cast(input_features, tf.float32)
for i, conv in enumerate(self.conv_layers):
pad_len = self.kernel_sizes[i] // 2
hidden_shapes = shape_list(hidden_states)
hidden_states = tf.concat(
(
tf.zeros((hidden_shapes[0], pad_len, hidden_shapes[2])),
hidden_states,
tf.zeros((hidden_shapes[0], pad_len, hidden_shapes[2])),
),
axis=1,
)
hidden_states = conv(hidden_states)
hidden_states = glu(hidden_states, axis=2)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv_layers", None) is not None:
for i, layer in enumerate(self.conv_layers):
with tf.name_scope(layer.name):
layer.build([None, None, self.in_channels] if i == 0 else [None, None, self.mid_channels // 2])
class TFSpeech2TextSinusoidalPositionalEmbedding(keras.layers.Layer):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None, **kwargs):
super().__init__(**kwargs)
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.embedding_weights = self._get_embedding(num_positions + self.offset, embedding_dim, padding_idx)
@staticmethod
def _get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None) -> tf.Tensor:
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = tf.math.log(10000.0) / (half_dim - 1)
emb = tf.math.exp(tf.range(half_dim, dtype=tf.float32) * -emb)
emb = tf.expand_dims(tf.range(num_embeddings, dtype=tf.float32), axis=1) * tf.expand_dims(emb, axis=0)
emb = tf.reshape(tf.concat([tf.math.sin(emb), tf.math.cos(emb)], axis=1), shape=[num_embeddings, -1])
if embedding_dim % 2 == 1:
emb = tf.concat([emb, tf.zeros((num_embeddings, 1))], axis=1)
if padding_idx is not None:
emb = tf.concat([emb[:padding_idx, :], tf.zeros((1, tf.shape(emb)[1])), emb[padding_idx + 1 :, :]], axis=0)
return emb
def call(self, input_ids: tf.Tensor, past_key_values_length: int = 0) -> tf.Tensor:
bsz, seq_len = shape_list(input_ids)
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
embeddings = self._get_embedding(
self.padding_idx + 1 + seq_len + self.offset + past_key_values_length, self.embedding_dim, self.padding_idx
)
return tf.reshape(tf.gather(embeddings, tf.reshape(position_ids, (-1,)), axis=0), (bsz, seq_len, -1))
@staticmethod
def create_position_ids_from_input_ids(
input_ids: tf.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
) -> tf.Tensor:
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
        Args:
            input_ids: tf.Tensor, input tensor with token ids.
        Returns:
            tf.Tensor with the computed position indices.
        """
mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=tf.int32)
incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
return tf.cast(incremental_indices, dtype=tf.int64) + padding_idx
class TFSpeech2TextAttention(keras.layers.Layer):
"""多头注意力机制,基于 'Attention Is All You Need'"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
def call(
self,
hidden_states: tf.Tensor,
key_value_states: tf.Tensor | None = None,
past_key_value: Tuple[Tuple[tf.Tensor]] | None = None,
attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
training: Optional[bool] = False,
    ):
        pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "k_proj", None) is not None:
with tf.name_scope(self.k_proj.name):
self.k_proj.build([None, None, self.embed_dim])
if getattr(self, "q_proj", None) is not None:
with tf.name_scope(self.q_proj.name):
self.q_proj.build([None, None, self.embed_dim])
if getattr(self, "v_proj", None) is not None:
with tf.name_scope(self.v_proj.name):
self.v_proj.build([None, None, self.embed_dim])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.embed_dim])
class TFSpeech2TextEncoderLayer(keras.layers.Layer):
def __init__(self, config: Speech2TextConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFSpeech2TextAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False
):
"""
Args:
hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`tf.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size `(encoder_attention_heads,)`
training (`bool`): whether the layer is being used in training mode
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, self_attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
training=training,
)
tf.debugging.assert_equal(
shape_list(hidden_states),
shape_list(residual),
message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = self.activation_dropout(hidden_states, training=training)
hidden_states = self.fc2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
return hidden_states, self_attn_weights
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.encoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFSpeech2TextDecoderLayer(keras.layers.Layer):
def __init__(self, config: Speech2TextConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFSpeech2TextAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
name="self_attn",
is_decoder=True,
)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFSpeech2TextAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
name="encoder_attn",
is_decoder=True,
)
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
self,
hidden_states,
attention_mask: tf.Tensor | None = None,
encoder_hidden_states: tf.Tensor | None = None,
encoder_attention_mask: tf.Tensor | None = None,
layer_head_mask: tf.Tensor | None = None,
cross_attn_layer_head_mask: tf.Tensor | None = None,
past_key_value: Tuple[tf.Tensor] | None = None,
        training=False,
    ):
        pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attn", None) is not None:
with tf.name_scope(self.self_attn.name):
self.self_attn.build(None)
if getattr(self, "self_attn_layer_norm", None) is not None:
with tf.name_scope(self.self_attn_layer_norm.name):
self.self_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "encoder_attn", None) is not None:
with tf.name_scope(self.encoder_attn.name):
self.encoder_attn.build(None)
if getattr(self, "encoder_attn_layer_norm", None) is not None:
with tf.name_scope(self.encoder_attn_layer_norm.name):
self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
if getattr(self, "fc1", None) is not None:
with tf.name_scope(self.fc1.name):
self.fc1.build([None, None, self.embed_dim])
if getattr(self, "fc2", None) is not None:
with tf.name_scope(self.fc2.name):
self.fc2.build([None, None, self.config.decoder_ffn_dim])
if getattr(self, "final_layer_norm", None) is not None:
with tf.name_scope(self.final_layer_norm.name):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFSpeech2TextPreTrainedModel(TFPreTrainedModel):
config_class = Speech2TextConfig
base_model_prefix = "model"
main_input_name = "input_features"
_keys_to_ignore_on_load_unexpected = [r"encoder.embed_positions.weights"]
def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
"""
Computes the output length of the convolutional layers
"""
for _ in range(self.config.num_conv_layers):
input_lengths = (input_lengths - 1) // 2 + 1
return input_lengths
@property
def input_signature(self):
return {
"input_features": tf.TensorSpec(
(None, None, self.config.input_feat_per_channel * self.config.input_channels),
tf.float32,
name="input_features",
),
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
"decoder_input_ids": tf.TensorSpec((None, None), tf.int32, name="decoder_input_ids"),
"decoder_attention_mask": tf.TensorSpec((None, None), tf.int32, name="decoder_attention_mask"),
}
SPEECH_TO_TEXT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
"""
</Tip>
"""
SPEECH_TO_TEXT_INPUTS_DOCSTRING = r"""
"""
@keras_serializable
class TFSpeech2TextEncoder(keras.layers.Layer):
config_class = Speech2TextConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`TFSpeech2TextEncoderLayer`].
Args:
config: Speech2TextConfig
"""
def __init__(self, config: Speech2TextConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
# Dropout layer with the configured dropout probability
self.dropout = keras.layers.Dropout(config.dropout)
# LayerDrop probability taken from encoder_layerdrop
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_source_positions
self.embed_scale = tf.math.sqrt(float(embed_dim)) if config.scale_embedding else 1.0
# Convolutional subsampler that downsamples the input features
self.conv = TFConv1dSubsampler(config, name="conv")
# Sinusoidal positional embeddings
self.embed_positions = TFSpeech2TextSinusoidalPositionalEmbedding(
num_positions=config.max_source_positions,
embedding_dim=embed_dim,
padding_idx=self.padding_idx,
name="embed_positions",
)
# Stack of Transformer encoder layers
self.layers = [TFSpeech2TextEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
# Final layer normalization applied to the encoder output
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor):
"""
Computes the output length of the convolutional layers
"""
# Apply the stride-2 length reduction once per conv layer
for _ in range(self.config.num_conv_layers):
input_lengths = (input_lengths - 1) // 2 + 1
return input_lengths
def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
# If the attention mask has more than 2 dims, reduce it to a 2D mask first
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
# Compute the subsampled output lengths
subsampled_lengths = self._get_feat_extract_output_lengths(tf.math.reduce_sum(attention_mask, -1))
bsz = shape_list(attention_mask)[0]
# Build a new mask by marking the last valid subsampled position of each sequence with 1
indices = tf.concat(
(
tf.expand_dims(tf.range(bsz, dtype=attention_mask.dtype), -1),
tf.expand_dims(subsampled_lengths - 1, -1),
),
axis=-1,
)
attention_mask = tf.scatter_nd(indices=indices, updates=tf.ones(bsz), shape=[bsz, feature_vector_length])
# A reversed cumulative sum turns that single 1 into a prefix of 1s
attention_mask = tf.cast(tf.reverse(tf.math.cumsum(tf.reverse(attention_mask, [-1]), -1), [-1]), tf.int64)
return attention_mask
@unpack_inputs
def call(
self,
input_features=None,
attention_mask=None,
head_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
        training=False,
    ):
        pass
    # The build method below constructs the encoder's sub-layers and their parameters
def build(self, input_shape=None):
# Return early if the layer has already been built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the convolutional subsampler if present
if getattr(self, "conv", None) is not None:
# Use the sub-layer's name as the TensorFlow name scope
with tf.name_scope(self.conv.name):
self.conv.build(None)
# Build the positional embedding layer if present
if getattr(self, "embed_positions", None) is not None:
# Use the sub-layer's name as the TensorFlow name scope
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
# Build the final layer norm if present
if getattr(self, "layer_norm", None) is not None:
# Use the sub-layer's name as the TensorFlow name scope
with tf.name_scope(self.layer_norm.name):
# The layer norm expects inputs of shape [None, None, self.config.d_model]
self.layer_norm.build([None, None, self.config.d_model])
# Build each encoder layer in turn
if getattr(self, "layers", None) is not None:
for layer in self.layers:
# Use each layer's name as its name scope
with tf.name_scope(layer.name):
# Build the layer without constraining its input shape
layer.build(None)
# The keras_serializable decorator makes the layer serializable together with its config
@keras_serializable
class TFSpeech2TextDecoder(keras.layers.Layer):
# Associate the layer with Speech2TextConfig
config_class = Speech2TextConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFSpeech2TextDecoderLayer`]
Args:
config: Speech2TextConfig
"""
# Constructor taking a config plus extra keyword arguments
def __init__(self, config: Speech2TextConfig, **kwargs):
# Initialize the parent keras layer
super().__init__(**kwargs)
# Keep a reference to the config
self.config = config
# LayerDrop probability taken from decoder_layerdrop
self.layerdrop = config.decoder_layerdrop
# Index of the padding token
self.padding_idx = config.pad_token_id
# Maximum number of target positions
self.max_target_positions = config.max_target_positions
# Scale embeddings by sqrt(d_model) when configured, otherwise leave them unscaled
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
# Token embedding table, shared with the LM head
self.embed_tokens = TFSharedEmbeddings(config.vocab_size, config.d_model, name="embed_tokens")
# Sinusoidal positional embeddings
self.embed_positions = TFSpeech2TextSinusoidalPositionalEmbedding(
num_positions=config.max_target_positions,
embedding_dim=config.d_model,
padding_idx=self.padding_idx,
name="embed_positions",
)
# Stack of config.decoder_layers decoder layers
self.layers = [TFSpeech2TextDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
# Final layer normalization
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
# Dropout applied to the decoder inputs
self.dropout = keras.layers.Dropout(config.dropout)
# Accessor for the token embeddings
def get_embed_tokens(self):
return self.embed_tokens
# Setter for the token embeddings
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
# The call method below runs the decoder forward pass; unpack_inputs normalizes its arguments
@unpack_inputs
def call(
self,
input_ids=None,
inputs_embeds=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
        # The decoder forward-pass logic is omitted in this listing; see the full source for details.
        pass
    # The build method constructs each sub-layer on first use and returns early if already built
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the token embeddings if present
if getattr(self, "embed_tokens", None) is not None:
with tf.name_scope(self.embed_tokens.name):
self.embed_tokens.build(None)
# Build the positional embeddings if present
if getattr(self, "embed_positions", None) is not None:
with tf.name_scope(self.embed_positions.name):
self.embed_positions.build(None)
# Build the final layer norm if present
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.d_model])
# Build each decoder layer in turn
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
# The keras_serializable decorator makes the layer serializable together with its config
@keras_serializable
class TFSpeech2TextMainLayer(keras.layers.Layer):
# Associate the layer with Speech2TextConfig
config_class = Speech2TextConfig
# Constructor taking a config plus extra keyword arguments
def __init__(self, config: Speech2TextConfig, **kwargs):
# Initialize the parent keras layer
super().__init__(**kwargs)
# Keep a reference to the config
self.config = config
# Speech encoder
self.encoder = TFSpeech2TextEncoder(config, name="encoder")
# Text decoder
self.decoder = TFSpeech2TextDecoder(config, name="decoder")
# The input embeddings live on the decoder
def get_input_embeddings(self):
return self.decoder.embed_tokens
# Replace the decoder's token embeddings
def set_input_embeddings(self, new_embeddings):
self.decoder.embed_tokens = new_embeddings
# unpack_inputs normalizes the call arguments
@unpack_inputs
def call(
self,
input_features=None,
attention_mask=None,
decoder_input_ids=None,
decoder_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
encoder_outputs=None,
past_key_values=None,
decoder_inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
**kwargs,
):
        # The main layer's forward logic is omitted in this listing; see the full source for details.
        pass
    # The build method constructs the encoder and decoder the first time it is called
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the layer as built
self.built = True
# Build the encoder inside its own name scope
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# Build the decoder inside its own name scope
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
# TFSpeech2TextModel exposes the bare encoder-decoder stack without any task-specific head
@add_start_docstrings(
"The bare Speech2Text Model outputting raw hidden-states without any specific head on top.",
SPEECH_TO_TEXT_START_DOCSTRING,
)
class TFSpeech2TextModel(TFSpeech2TextPreTrainedModel):
# Constructor taking a Speech2TextConfig plus optional extra arguments
def __init__(self, config: Speech2TextConfig, *inputs, **kwargs):
# Initialize the pretrained-model base class
super().__init__(config, *inputs, **kwargs)
# The main layer holds the actual encoder and decoder
self.model = TFSpeech2TextMainLayer(config, name="model")
# Expose the encoder
def get_encoder(self):
return self.model.encoder
# Expose the decoder
def get_decoder(self):
return self.model.decoder
# The call method forwards everything to the main layer and returns a tuple or a TFSeq2SeqModelOutput
@unpack_inputs
@add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSeq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_features: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
decoder_head_mask: np.ndarray | tf.Tensor | None = None,
cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
encoder_outputs: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> Union[Tuple, TFSeq2SeqModelOutput]:
# Delegate to the main layer with all arguments and flags
outputs = self.model(
input_features=input_features,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
encoder_outputs=encoder_outputs,
past_key_values=past_key_values,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Return the main layer's outputs unchanged
return outputs
# serving_output post-processes the model outputs, honoring the config's output flags
def serving_output(self, output):
# Keep the cached key/values only if the config enables the cache
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
# Decoder hidden states, only if the config asks for hidden states
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
# Decoder attentions, only if the config asks for attentions
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
# Cross attentions, only if the config asks for attentions
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
# Encoder hidden states, only if the config asks for hidden states
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
# Encoder attentions, only if the config asks for attentions
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
# Wrap everything in a TFSeq2SeqModelOutput
return TFSeq2SeqModelOutput(
last_hidden_state=output.last_hidden_state,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
# Build the wrapped main layer
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the model as built
self.built = True
# Build the main layer if present
if getattr(self, "model", None) is not None:
# Build it inside its own name scope without constraining the input shape
with tf.name_scope(self.model.name):
self.model.build(None)
# TFSpeech2TextForConditionalGeneration adds a language modeling head on top of the bare model
@add_start_docstrings(
"The Speech2Text Model with a language modeling head. Can be used for summarization.",
SPEECH_TO_TEXT_START_DOCSTRING,
)
class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCausalLanguageModelingLoss):
# Constructor taking a Speech2TextConfig
def __init__(self, config: Speech2TextConfig):
# Initialize the pretrained-model base class
super().__init__(config)
# The main layer holds the encoder-decoder stack
self.model = TFSpeech2TextMainLayer(config, name="model")
# Bias-free projection from d_model to the vocabulary, used as the language modeling head
self.lm_head = keras.layers.Dense(self.config.vocab_size, use_bias=False, name="lm_head")
# XLA generation is disabled for this model
# TODO (Joao): investigate why Speech2Text has numerical issues in XLA generate
self.supports_xla_generation = False
# Keep a reference to the config
self.config = config
# Expose the encoder
def get_encoder(self):
return self.model.encoder
# Expose the decoder
def get_decoder(self):
return self.model.decoder
# Resize the token embeddings and return the updated embedding table
def resize_token_embeddings(self, new_num_tokens: int) -> tf.Variable:
new_embeddings = super().resize_token_embeddings(new_num_tokens)
return new_embeddings
# The LM head acts as the output embeddings
def get_output_embeddings(self):
return self.lm_head
# Replace the LM head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# The call method below runs the full seq2seq forward pass and returns a TFSeq2SeqLMOutput
@unpack_inputs
@add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_features: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
decoder_head_mask: np.ndarray | tf.Tensor | None = None,
cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
encoder_outputs: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
labels: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
        **kwargs,
    ):
        pass
    # serving_output post-processes the model outputs, selectively returning tensors based on the config flags
def serving_output(self, output):
# Keep the cached key/values only if the config enables the cache
pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
# Decoder hidden states, only if the config asks for hidden states
dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
# Decoder attentions, only if the config asks for attentions
dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
# Cross attentions, only if the config asks for attentions
cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
# Encoder hidden states, only if the config asks for hidden states
enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
# Encoder attentions, only if the config asks for attentions
enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
# Wrap everything in a TFSeq2SeqLMOutput
return TFSeq2SeqLMOutput(
logits=output.logits,
past_key_values=pkv,
decoder_hidden_states=dec_hs,
decoder_attentions=dec_attns,
cross_attentions=cross_attns,
encoder_last_hidden_state=output.encoder_last_hidden_state,
encoder_hidden_states=enc_hs,
encoder_attentions=enc_attns,
)
# Prepare the decoder inputs for generation, truncating decoder_input_ids when a cache is present
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# With a cache, only the last generated token needs to be fed to the decoder
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
# Return the generation inputs as a dict
return {
"input_features": None, # 需要传递以使 Keras.layer.__call__ 正常运行
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache, # 更改此项以避免缓存(可能用于调试)
}
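# Illustrative sketch (not part of the original source): once a cache exists, only the most
# recently generated token is fed back to the decoder; everything else is reconstructed from
# past_key_values. The ids below are made up.
_decoder_input_ids = tf.constant([[2, 14, 27, 31]])
_fed_to_decoder = _decoder_input_ids[:, -1:]  # [[31]], exactly what the slice above keeps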
# Build the model components
def build(self, input_shape=None):
# Return early if already built
if self.built:
return
# Mark the model as built
self.built = True
# Build the main layer inside its own name scope
if getattr(self, "model", None) is not None:
with tf.name_scope(self.model.name):
self.model.build(None)
# Build the LM head, which projects from d_model to the vocabulary
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build([None, None, self.config.d_model])
# Map TensorFlow weight names to their PyTorch counterparts when cross-loading checkpoints
def tf_to_pt_weight_rename(self, tf_weight):
# The LM head weight is tied to the decoder token embeddings on the PyTorch side
if tf_weight == "lm_head.weight":
return tf_weight, "model.decoder.embed_tokens.weight"
else:
return (tf_weight,)
.\models\speech_to_text\processing_speech_to_text.py
"""
Speech processor class for Speech2Text
"""
import warnings
from contextlib import contextmanager
from ...processing_utils import ProcessorMixin
class Speech2TextProcessor(ProcessorMixin):
"""
Constructs a Speech2Text processor which wraps a Speech2Text feature extractor and a Speech2Text tokenizer into a single processor.
[`Speech2TextProcessor`] offers all the functionalities of [`Speech2TextFeatureExtractor`] and [`Speech2TextTokenizer`].
See the [`~Speech2TextProcessor.__call__`] and [`~Speech2TextProcessor.decode`] for more information.
Args:
    feature_extractor (`Speech2TextFeatureExtractor`):
        An instance of [`Speech2TextFeatureExtractor`]. The feature extractor is a required input.
    tokenizer (`Speech2TextTokenizer`):
        An instance of [`Speech2TextTokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "Speech2TextFeatureExtractor"
tokenizer_class = "Speech2TextTokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
[`~Speech2TextFeatureExtractor.__call__`] and returns its output. If used in the context
[`~Speech2TextProcessor.as_target_processor`] this method forwards all its arguments to Speech2TextTokenizer's
[`~Speech2TextTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Speech2TextTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Speech2TextTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
Speech2Text.
"""
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
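# Illustrative sketch (not part of the original source): a typical round trip through the
# processor. The checkpoint name matches the one referenced elsewhere in these files and must
# be available locally or via the Hub for from_pretrained to succeed.
_processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
_waveform = [0.0] * 16000  # one second of silence as a stand-in for real 16 kHz audio
_batch = _processor(audio=_waveform, sampling_rate=16000, text="hello world", return_tensors="pt")
# _batch carries "input_features" from the feature extractor and "labels" from the tokenizer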
.\models\speech_to_text\tokenization_speech_to_text.py
"""Tokenization classes for Speech2Text."""
import json
import os
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"spm_file": "sentencepiece.bpe.model",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/s2t-small-librispeech-asr": (
"https://huggingface.co/facebook/s2t-small-librispeech-asr/resolve/main/vocab.json"
),
},
"spm_file": {
"facebook/s2t-small-librispeech-asr": (
"https://huggingface.co/facebook/s2t-small-librispeech-asr/resolve/main/sentencepiece.bpe.model"
)
},
}
MAX_MODEL_INPUT_SIZES = {
"facebook/s2t-small-librispeech-asr": 1024,
}
MUSTC_LANGS = ["pt", "fr", "ru", "nl", "ro", "it", "es", "de"]
LANGUAGES = {"mustc": MUSTC_LANGS}
class Speech2TextTokenizer(PreTrainedTokenizer):
"""
Construct a Speech2Text tokenizer.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
the superclass for more information regarding such methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = MAX_MODEL_INPUT_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
spm_file,
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
unk_token="<unk>",
do_upper_case=False,
do_lower_case=False,
tgt_lang=None,
lang_codes=None,
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.do_upper_case = do_upper_case
self.do_lower_case = do_lower_case
self.encoder = load_json(vocab_file)
self.decoder = {v: k for k, v in self.encoder.items()}
self.spm_file = spm_file
self.sp_model = load_spm(spm_file, self.sp_model_kwargs)
if lang_codes is not None:
self.lang_codes = lang_codes
self.langs = LANGUAGES[lang_codes]
self.lang_tokens = [f"<lang:{lang}>" for lang in self.langs]
self.lang_code_to_id = {lang: self.sp_model.PieceToId(f"<lang:{lang}>") for lang in self.langs}
if additional_special_tokens is not None:
additional_special_tokens = self.lang_tokens + additional_special_tokens
else:
additional_special_tokens = self.lang_tokens
self._tgt_lang = tgt_lang if tgt_lang is not None else self.langs[0]
self.set_tgt_lang_special_tokens(self._tgt_lang)
else:
self.lang_code_to_id = {}
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
do_upper_case=do_upper_case,
do_lower_case=do_lower_case,
tgt_lang=tgt_lang,
lang_codes=lang_codes,
sp_model_kwargs=self.sp_model_kwargs,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
@property
def vocab_size(self) -> int:
return len(self.encoder)
def get_vocab(self) -> Dict:
vocab = self.encoder.copy()
vocab.update(self.added_tokens_encoder)
return vocab
@property
def tgt_lang(self) -> str:
return self._tgt_lang
@tgt_lang.setter
def tgt_lang(self, new_tgt_lang) -> None:
self._tgt_lang = new_tgt_lang
self.set_tgt_lang_special_tokens(new_tgt_lang)
def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
"""Reset the special tokens to the target language setting. prefix=[eos, tgt_lang_code] and suffix=[eos]."""
lang_code_id = self.lang_code_to_id[tgt_lang]
self.prefix_tokens = [lang_code_id]
def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder[self.unk_token])
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the decoder."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
current_sub_tokens = []
out_string = ""
for token in tokens:
if token in self.all_special_tokens:
decoded = self.sp_model.decode(current_sub_tokens)
out_string += (decoded.upper() if self.do_upper_case else decoded) + token + " "
current_sub_tokens = []
else:
current_sub_tokens.append(token)
decoded = self.sp_model.decode(current_sub_tokens)
out_string += decoded.upper() if self.do_upper_case else decoded
return out_string.strip()
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
"""Build model inputs from a sequence by appending eos_token_id."""
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1]
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
def __getstate__(self) -> Dict:
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d: Dict) -> None:
self.__dict__ = d
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
save_dir = Path(save_directory)
assert save_dir.is_dir(), f"{save_directory} should be a directory"
vocab_save_path = save_dir / (
(filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
)
spm_save_path = save_dir / (
(filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"]
)
save_json(self.encoder, vocab_save_path)
if os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path) and os.path.isfile(self.spm_file):
copyfile(self.spm_file, spm_save_path)
elif not os.path.isfile(self.spm_file):
with open(spm_save_path, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (str(vocab_save_path), str(spm_save_path))
def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
spm.Load(str(path))
return spm
def load_json(path: str) -> Union[Dict, List]:
with open(path, "r") as f:
return json.load(f)
def save_json(data, path: str) -> None:
with open(path, "w") as f:
json.dump(data, f, indent=2)
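To make the special-token handling above concrete, here is a small self-contained illustration (with made-up token ids, so no SentencePiece model is required) of how `build_inputs_with_special_tokens` and `get_special_tokens_mask` compose a single sequence as `prefix_tokens + tokens + [eos]`:

```python
# Hypothetical ids for illustration only.
prefix_tokens = [10003]        # e.g. the id of a "<lang:de>" language token
eos_token_id = 2
token_ids_0 = [57, 301, 88]    # sub-word ids produced by the SentencePiece model

# build_inputs_with_special_tokens (single-sequence case)
input_ids = prefix_tokens + token_ids_0 + [eos_token_id]
assert input_ids == [10003, 57, 301, 88, 2]

# get_special_tokens_mask: 1 marks special tokens (language code, eos), 0 marks content tokens
special_tokens_mask = [1] * len(prefix_tokens) + [0] * len(token_ids_0) + [1]
assert special_tokens_mask == [1, 0, 0, 0, 1]
```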
.\models\speech_to_text\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tf_available,
is_torch_available,
)
_import_structure = {
"configuration_speech_to_text": ["SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Speech2TextConfig"],
"feature_extraction_speech_to_text": ["Speech2TextFeatureExtractor"],
"processing_speech_to_text": ["Speech2TextProcessor"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_speech_to_text"] = ["Speech2TextTokenizer"]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_speech_to_text"] = [
"TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFSpeech2TextForConditionalGeneration",
"TFSpeech2TextModel",
"TFSpeech2TextPreTrainedModel",
]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_speech_to_text"] = [
"SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"Speech2TextForConditionalGeneration",
"Speech2TextModel",
"Speech2TextPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig
from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor
from .processing_speech_to_text import Speech2TextProcessor
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_speech_to_text import Speech2TextTokenizer
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_speech_to_text import (
TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFSpeech2TextForConditionalGeneration,
TFSpeech2TextModel,
TFSpeech2TextPreTrainedModel,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_speech_to_text import (
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
Speech2TextForConditionalGeneration,
Speech2TextModel,
Speech2TextPreTrainedModel,
)
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
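The `__init__.py` above registers submodules lazily and only exposes the tokenizer / TF / PyTorch classes when their optional dependencies are importable. Here is a rough stand-alone sketch of the same guarding idea, written with plain `importlib` instead of the internal `OptionalDependencyNotAvailable` / `_LazyModule` machinery (so it is an illustration of the pattern, not the actual implementation):

```python
import importlib.util

# Names that are always importable.
_import_structure = {"configuration_speech_to_text": ["Speech2TextConfig"]}

# The tokenizer requires sentencepiece; the PyTorch models require torch.
if importlib.util.find_spec("sentencepiece") is not None:
    _import_structure["tokenization_speech_to_text"] = ["Speech2TextTokenizer"]
if importlib.util.find_spec("torch") is not None:
    _import_structure["modeling_speech_to_text"] = ["Speech2TextModel"]

print(_import_structure)
```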
.\models\speech_to_text_2\configuration_speech_to_text_2.py
""" Speech2Text model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/s2t-wav2vec2-large-en-de": (
"https://huggingface.co/facebook/s2t-wav2vec2-large-en-de/resolve/main/config.json"
),
}
class Speech2Text2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Speech2Text2ForCausalLM`]. It is used to
instantiate a Speech2Text2 model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a configuration similar to that of the Speech2Text2
[facebook/s2t-wav2vec2-large-en-de](https://huggingface.co/facebook/s2t-wav2vec2-large-en-de) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Speech2TextModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often called feed-forward) layer in the decoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the pooler. If a string, `"gelu"`, `"relu"`,
`"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings and the pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layers.
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated-normal initializer for initializing all weight matrices.
See https://arxiv.org/abs/1909.11556 for further details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the LayerDrop paper (https://arxiv.org/abs/1909.11556) for
more details.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/value attentions (not used by all models).
max_target_positions (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
(e.g., 512, 1024 or 2048).
Example:
```python
>>> from transformers import Speech2Text2Config, Speech2Text2ForCausalLM
>>> # Initializing a Speech2Text2Config configuration
>>> configuration = Speech2Text2Config()
>>> # Initializing a model (with random weights) from that configuration
>>> model = Speech2Text2ForCausalLM(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
# Model type identifier
model_type = "speech_to_text_2"
# Keys to ignore at inference time
keys_to_ignore_at_inference = ["past_key_values"]
# Attribute map: aliases generic config names to the decoder-specific fields
attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"}
# Constructor for Speech2Text2Config
def __init__(
self,
vocab_size=10000,  # vocabulary size
decoder_layers=6,  # number of decoder layers
decoder_ffn_dim=2048,  # dimensionality of the decoder feed-forward ("intermediate") layer
decoder_attention_heads=4,  # number of decoder attention heads
decoder_layerdrop=0.0,  # LayerDrop probability for decoder layers
use_cache=True,  # whether to return/use the key/value cache
activation_function="relu",  # activation function
d_model=256,  # model (hidden) dimensionality
dropout=0.1,  # dropout for fully connected and attention layers
attention_dropout=0.0,  # dropout on attention probabilities
activation_dropout=0.0,  # dropout on activations inside the feed-forward layers
init_std=0.02,  # standard deviation for weight initialization
decoder_start_token_id=2,  # id of the decoder start token
scale_embedding=True,  # whether to scale embeddings by sqrt(d_model)
pad_token_id=1,  # padding token id
bos_token_id=0,  # beginning-of-sequence token id
eos_token_id=2,  # end-of-sequence token id
max_target_positions=1024,  # maximum target sequence length
**kwargs,  # additional keyword arguments
):
self.vocab_size = vocab_size
self.d_model = d_model
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.decoder_layerdrop = decoder_layerdrop
self.use_cache = use_cache
self.num_hidden_layers = decoder_layers  # expose the decoder depth under the generic name
self.scale_embedding = scale_embedding  # if True, embeddings are scaled by sqrt(d_model)
self.max_target_positions = max_target_positions
# Forward the special-token ids and any remaining kwargs to the parent constructor
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
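A quick sanity check of the constructor defaults and the `attribute_map` aliases defined above (this assumes the `transformers` library is installed; the expected values come straight from the defaults shown in the code):

```python
from transformers import Speech2Text2Config

config = Speech2Text2Config()
print(config.vocab_size, config.d_model, config.decoder_layers)       # 10000 256 6

# attribute_map lets generic names resolve to the decoder-specific fields:
print(config.hidden_size == config.d_model)                           # True
print(config.num_attention_heads == config.decoder_attention_heads)   # True
```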
.\models\speech_to_text_2\modeling_speech_to_text_2.py
""" PyTorch Speech2Text2 model."""
import copy
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, logging, replace_return_docstrings
from .configuration_speech_to_text_2 import Speech2Text2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Speech2Text2Config"
_CHECKPOINT_FOR_DOC = "facebook/s2t-wav2vec2-large-en-de"
SPEECH_TO_TEXT_2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/s2t-wav2vec2-large-en-de",
]
class Speech2Text2SinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.weights = nn.Parameter(emb_weights)
self.weights.requires_grad = False
self.weights.detach_()
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len = input_ids.size()
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
max_pos = self.padding_idx + 1 + seq_len
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
def create_position_ids_from_input_ids(
self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids (`torch.Tensor`): Tensor of token ids; entries equal to `padding_idx` are treated as padding.
Returns:
`torch.Tensor`: Position ids of the same shape as `input_ids`.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
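The two helpers above are easier to follow numerically. Below is a small sketch that rebuilds the sinusoidal table for a toy (even) embedding size and reproduces the position-id rule: real tokens are numbered from `padding_idx + 1`, padding positions stay at `padding_idx`.

```python
import math
import torch

def sinusoidal_table(num_embeddings: int, embedding_dim: int, padding_idx: int) -> torch.Tensor:
    # Only handles even embedding_dim, for brevity.
    half_dim = embedding_dim // 2
    freq = torch.exp(torch.arange(half_dim).float() * -(math.log(10000) / (half_dim - 1)))
    angles = torch.arange(num_embeddings).float().unsqueeze(1) * freq.unsqueeze(0)
    table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    table[padding_idx, :] = 0  # the padding position gets an all-zero embedding
    return table

weights = sinusoidal_table(num_embeddings=8, embedding_dim=6, padding_idx=1)
print(weights.shape)        # torch.Size([8, 6]); row 1 is all zeros

padding_idx = 1
input_ids = torch.tensor([[5, 7, 9, 1, 1]])  # two trailing padding tokens
mask = input_ids.ne(padding_idx).int()
position_ids = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
print(position_ids)         # tensor([[2, 3, 4, 1, 1]])
```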
class Speech2Text2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[Speech2Text2Config] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# The full multi-head attention forward pass is omitted in this excerpt.
pass
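Since the forward body is not shown here, the following tiny demonstration stands in for what the `_shape` helper and the `scaling` factor set up: the hidden dimension is split into `(num_heads, head_dim)` with the head axis moved in front of the sequence axis, and queries are scaled by `head_dim ** -0.5` before the dot product.

```python
import torch

bsz, seq_len, embed_dim, num_heads = 2, 5, 8, 4
head_dim = embed_dim // num_heads

hidden_states = torch.randn(bsz, seq_len, embed_dim)
# Same reshape as Speech2Text2Attention._shape:
shaped = hidden_states.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).contiguous()
print(shaped.shape)       # torch.Size([2, 4, 5, 2]) -> (batch, heads, seq_len, head_dim)

scaling = head_dim ** -0.5
print(round(scaling, 4))  # 0.7071 for head_dim == 2
```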
class Speech2Text2DecoderLayer(nn.Module):
def __init__(self, config: Speech2Text2Config):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = Speech2Text2Attention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
if config.is_decoder:
self.encoder_attn = Speech2Text2Attention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
# The decoder-layer forward pass (self-attention, cross-attention, feed-forward) is omitted in this excerpt.
...
class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
"""
Transformer 解码器,由 config.decoder_layers 层组成。每一层是一个 Speech2Text2DecoderLayer 类的实例。
Args:
config: Speech2Text2Config,模型的配置对象
embed_tokens (nn.Embedding): 输出的嵌入层对象
"""
def __init__(self, config: Speech2Text2Config):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_target_positions
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = Speech2Text2SinusoidalPositionalEmbedding(
self.max_target_positions,
config.d_model,
self.padding_idx,
)
self.layers = nn.ModuleList([Speech2Text2DecoderLayer(config) for _ in range(config.decoder_layers)])
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
# The full decoder forward pass is omitted in this excerpt.
...
@add_start_docstrings(
"The Speech2Text2 Model with a language modeling head. Can be used for summarization.",
SPEECH_TO_TEXT_2_START_DOCSTRING,
)
class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = Speech2Text2Decoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
@add_start_docstrings(
"The Speech2Text2 Decoder with a language modeling head. Can be used as the decoder part of"
" [`EncoderDecoderModel`] and [`SpeechEncoderDecoder`].",
SPEECH_TO_TEXT_2_START_DOCSTRING,
)
class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = Speech2Text2DecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# The causal-LM forward pass (decoder call, lm_head projection, optional loss on `labels`) is omitted in this excerpt.
...
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
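To illustrate the two generation helpers above: `prepare_inputs_for_generation` feeds the decoder only the tokens not yet covered by the key/value cache, and `_reorder_cache` selects the cached states of the beams that survived the last beam-search step. A toy walkthrough with made-up shapes:

```python
import torch

# prepare_inputs_for_generation: 3 tokens are already cached, so only the newest token is kept.
input_ids = torch.tensor([[0, 11, 12, 13]])
past_length = 3
remove_prefix_length = past_length if input_ids.shape[1] > past_length else input_ids.shape[1] - 1
print(input_ids[:, remove_prefix_length:])   # tensor([[13]])

# _reorder_cache: pick cached states along the beam dimension.
past_state = torch.arange(4 * 2 * 3 * 5).float().view(4, 2, 3, 5)  # (beams, heads, seq, head_dim)
beam_idx = torch.tensor([2, 2, 0, 1])                               # beam 2 was duplicated
reordered = past_state.index_select(0, beam_idx)
print(torch.equal(reordered[0], past_state[2]))                     # True
```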
.\models\speech_to_text_2\processing_speech_to_text_2.py
"""
Speech processor class for Speech2Text2
"""
import warnings
from contextlib import contextmanager
from ...processing_utils import ProcessorMixin
class Speech2Text2Processor(ProcessorMixin):
r"""
Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
a single processor.
[`Speech2Text2Processor`] offers all the functionalities of [`AutoFeatureExtractor`] and [`Speech2Text2Tokenizer`].
See the [`~Speech2Text2Processor.__call__`] and [`~Speech2Text2Processor.decode`] for more information.
Args:
feature_extractor (`AutoFeatureExtractor`):
An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`Speech2Text2Tokenizer`):
An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "AutoFeatureExtractor"
tokenizer_class = "Speech2Text2Tokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
[`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
[`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to
Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the docstring of the above two
methods for more information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Speech2Text2Tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Speech2Text2Tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
Speech2Text2.
"""
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
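Putting the processor together, here is a minimal usage sketch. It assumes the `facebook/s2t-wav2vec2-large-en-de` checkpoint can be downloaded and uses one second of silence as a placeholder for a real 16 kHz waveform; the `labels` field is the tokenized transcription produced by the `__call__` logic shown above.

```python
import numpy as np
from transformers import Speech2Text2Processor

processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")

speech = np.zeros(16_000, dtype=np.float32)   # placeholder 16 kHz waveform
batch = processor(audio=speech, sampling_rate=16_000, text="ein Beispielsatz")

print(list(batch.keys()))                       # feature-extractor outputs plus "labels"
print(processor.batch_decode(batch["labels"]))  # round-trips the tokenized transcription
```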