Transformers 源码解析(一百二十三)
.\models\wav2vec2_conformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
# Maps each submodule name to the public names it provides. The mapping is
# handed to `_LazyModule` at the bottom of this file.
_import_structure = {
    "configuration_wav2vec2_conformer": [
        "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Wav2Vec2ConformerConfig",
    ],
}

# The modeling submodule is only registered when torch is available; when it is
# not, the raised `OptionalDependencyNotAvailable` is swallowed on purpose.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_wav2vec2_conformer"] = [
        "WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Wav2Vec2ConformerForAudioFrameClassification",
        "Wav2Vec2ConformerForCTC",
        "Wav2Vec2ConformerForPreTraining",
        "Wav2Vec2ConformerForSequenceClassification",
        "Wav2Vec2ConformerForXVector",
        "Wav2Vec2ConformerModel",
        "Wav2Vec2ConformerPreTrainedModel",
    ]

if TYPE_CHECKING:
    # Under static type checking the names are imported directly, mirroring
    # the entries registered in `_import_structure` above.
    from .configuration_wav2vec2_conformer import (
        WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        Wav2Vec2ConformerConfig,
    )

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_wav2vec2_conformer import (
            WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            Wav2Vec2ConformerForAudioFrameClassification,
            Wav2Vec2ConformerForCTC,
            Wav2Vec2ConformerForPreTraining,
            Wav2Vec2ConformerForSequenceClassification,
            Wav2Vec2ConformerForXVector,
            Wav2Vec2ConformerModel,
            Wav2Vec2ConformerPreTrainedModel,
        )

else:
    import sys

    # At runtime this module's entry in `sys.modules` is replaced by a
    # `_LazyModule` built from `_import_structure`.
    # NOTE(review): presumably this defers the submodule imports until first
    # attribute access -- confirm against `_LazyModule`, which is not shown here.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\wav2vec2_phoneme\tokenization_wav2vec2_phoneme.py
import json
import os
import sys
from dataclasses import dataclass
from itertools import groupby
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
import numpy as np
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken
from ...utils import (
ModelOutput,
is_flax_available,
is_tf_available,
is_torch_available,
logging,
requires_backends,
to_py_obj,
)
# Module-level logger.
logger = logging.get_logger(__name__)

# Framework modules are only imported for type checking; the string
# annotations used in this file ("torch.Tensor", "tf.Tensor") need no runtime import.
if TYPE_CHECKING:
    if is_torch_available():
        import torch
    if is_tf_available():
        import tensorflow as tf
    if is_flax_available():
        import jax.numpy as jnp

# File names used for the vocabulary and the tokenizer configuration.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "tokenizer_config_file": "tokenizer_config.json",
}

# Download URLs of those files, keyed by file kind and then by checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/wav2vec2-lv-60-espeak-cv-ft": (
            "https://huggingface.co/facebook/wav2vec2-lv-60-espeak-cv-ft/resolve/main/vocab.json"
        ),
    },
    "tokenizer_config_file": {
        "facebook/wav2vec2-lv-60-espeak-cv-ft": (
            "https://huggingface.co/facebook/wav2vec2-lv-60-espeak-cv-ft/resolve/main/tokenizer_config.json"
        ),
    },
}

# `sys.maxsize` is used as the size entry for the checkpoint.
# NOTE(review): presumably this means "no practical input-length limit" -- confirm with the consumer.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/wav2vec2-lv-60-espeak-cv-ft": sys.maxsize}

# Alias for a list of offset dictionaries (string keys, int or str values).
ListOfDict = List[Dict[str, Union[int, str]]]
@dataclass
class Wav2Vec2PhonemeCTCTokenizerOutput(ModelOutput):
    """
    Output type of [`Wav2Vec2PhonemeCTCTokenizer`], with phonemes.

    Args:
        text (list of `str` or `str`):
            Decoded text, usually the speech transcription.
        char_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
            Offsets of the decoded characters. In combination with the sampling rate and the model downsampling
            rate, the offsets can be used to compute a time stamp for each character.
    """

    text: Union[List[str], str]
    # `None` unless offsets were requested, hence `Optional`.
    char_offsets: Optional[Union[List[ListOfDict], ListOfDict]] = None
class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
    """
    Constructs a Wav2Vec2PhonemeCTC tokenizer.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
    the superclass for more information regarding such methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sentence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sentence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        phone_delimiter_token (`str`, *optional*, defaults to `" "`):
            Separator placed between phonemes when phonemizing.
        word_delimiter_token (`str`, *optional*):
            Separator placed between words when phonemizing; no word separator is used when `None`.
        do_phonemize (`bool`, *optional*, defaults to `True`):
            Whether the tokenizer should phonetize the input or not. Only if a sequence of phonemes is passed to the
            tokenizer, `do_phonemize` should be set to `False`.
        phonemizer_lang (`str`, *optional*, defaults to `"en-us"`):
            The language of the phoneme set to which the tokenizer should phonetize the input text to.
        phonemizer_backend (`str`, *optional*, defaults to `"espeak"`):
            The backend phonetization library that shall be used by the phonemizer library. Defaults to `espeak-ng`.
            See the [phonemizer package](https://github.com/bootphon/phonemizer).
        **kwargs
            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
    """

    # NOTE(review): the class attributes and the `__init__` signature were missing
    # from this copy of the file and have been restored from the documented
    # arguments and the module-level constants. The defaults of
    # `phone_delimiter_token` / `word_delimiter_token` and `model_input_names`
    # should be confirmed against the released library.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        phone_delimiter_token=" ",
        word_delimiter_token=None,
        do_phonemize=True,
        phonemizer_lang="en-us",
        phonemizer_backend="espeak",
        **kwargs,
    ):
        self._word_delimiter_token = word_delimiter_token
        self._phone_delimiter_token = phone_delimiter_token
        self.do_phonemize = do_phonemize
        self.phonemizer_lang = phonemizer_lang
        self.phonemizer_backend = phonemizer_backend

        # The phonemizer backend is only needed (and only required to be
        # installed) when raw text has to be phonemized.
        if do_phonemize:
            self.init_backend(self.phonemizer_lang)

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}

        # The vocabulary is loaded before the parent constructor runs.
        # NOTE(review): presumably because the parent resolves special tokens
        # through `encoder`/`decoder` -- confirm against `PreTrainedTokenizer`.
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            word_delimiter_token=word_delimiter_token,
            phone_delimiter_token=phone_delimiter_token,
            do_phonemize=do_phonemize,
            phonemizer_lang=phonemizer_lang,
            phonemizer_backend=phonemizer_backend,
            **kwargs,
        )
@property
def vocab_size(self) -> int:
    """`int`: Number of entries in the id-to-token table `self.decoder`."""
    id_to_token = self.decoder
    return len(id_to_token)
def get_vocab(self) -> Dict:
    """
    Return a new dictionary combining the base vocabulary with the added tokens.

    Entries of `self.added_tokens_encoder` override base entries with the same key; `self.encoder` itself is not
    modified.
    """
    return {**self.encoder, **self.added_tokens_encoder}
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
    """
    Wrap plain-string tokens in `AddedToken` and delegate to the parent implementation.

    Strings are wrapped with `rstrip=False`, `lstrip=False`, `normalized=True` and `special=special_tokens`;
    anything that is not a `str` is forwarded unchanged.

    Returns:
        `int`: whatever the parent `_add_tokens` returns.
    """
    wrapped = [
        AddedToken(candidate, rstrip=False, lstrip=False, normalized=True, special=special_tokens)
        if isinstance(candidate, str)
        else candidate
        for candidate in new_tokens
    ]
    return super()._add_tokens(wrapped, special_tokens)
def init_backend(self, phonemizer_lang: str):
    """
    Create the phonemizer backend and store it on `self.backend`.

    Args:
        phonemizer_lang (`str`): The language handed to the backend constructor.
    """
    requires_backends(self, "phonemizer")
    # Imported here, after the availability check above.
    from phonemizer.backend import BACKENDS

    backend_factory = BACKENDS[self.phonemizer_backend]
    self.backend = backend_factory(phonemizer_lang, language_switch="remove-flags")
def prepare_for_tokenization(
    self,
    text: str,
    is_split_into_words: bool = False,
    phonemizer_lang: Optional[str] = None,
    do_phonemize: Optional[bool] = None,
) -> Tuple[str, Dict[str, Any]]:
    """
    Apply per-call settings and prepare `text` for tokenization.

    Args:
        text (`str`):
            The text to prepare.
        is_split_into_words (`bool`, *optional*, defaults to `False`):
            When `True`, a single space is prepended to `text`.
        phonemizer_lang (`str`, *optional*):
            When given, stored on the tokenizer and used to re-create the phonemizer backend.
        do_phonemize (`bool`, *optional*):
            When given, overrides `self.do_phonemize` (the change persists on the instance).

    Returns:
        `Tuple[str, Dict[str, Any]]`: The prepared text and an empty dictionary of unused kwargs.
    """
    prepared = " " + text if is_split_into_words else text

    # `None` means "keep the current setting" for both overrides.
    if do_phonemize is not None:
        self.do_phonemize = do_phonemize

    if phonemizer_lang is not None:
        self.phonemizer_lang = phonemizer_lang
        self.init_backend(phonemizer_lang)

    return (prepared, {})
def _tokenize(self, text, **kwargs):
    """
    Convert a string into a list of tokens.

    The text is stripped; when `self.do_phonemize` is set it is lower-cased and phonemized. The result is split on
    single spaces and pieces that are empty after stripping are dropped.
    """
    cleaned = text.strip()

    if self.do_phonemize:
        cleaned = self.phonemize(cleaned.lower(), self.phonemizer_lang)

    return [piece for piece in cleaned.split(" ") if piece.strip() != ""]
def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
    """
    Phonemize `text` with the configured backend and return the stripped result.

    Args:
        text (`str`): The text to phonemize.
        phonemizer_lang (`str`, *optional*):
            When given and different from `self.phonemizer_lang`, the backend is re-created for that language
            before phonemizing. `self.phonemizer_lang` itself is left unchanged.
    """
    from phonemizer.separator import Separator

    delimiter = self.word_delimiter_token
    # The word separator is the delimiter token followed by a space, or empty when no delimiter is set.
    word_separator = "" if delimiter is None else delimiter + " "

    if phonemizer_lang is not None and phonemizer_lang != self.phonemizer_lang:
        self.init_backend(phonemizer_lang)

    separator = Separator(phone=self.phone_delimiter_token, word=word_separator, syllable="")
    # The backend takes a batch; a one-element batch is sent and unwrapped.
    return self.backend.phonemize([text], separator=separator)[0].strip()
@property
def word_delimiter_token(self) -> Optional[str]:
    """
    `Optional[str]`: Word delimiter token. Returns `None` when it is not set, logging an error first if
    `self.verbose` is true.
    """
    if self._word_delimiter_token is None:
        if self.verbose:
            logger.error("Using word_delimiter_token, but it is not set yet.")
        return None
    return str(self._word_delimiter_token)

@property
def word_delimiter_token_id(self) -> Optional[int]:
    """
    `Optional[int]`: Id of the word delimiter token in the vocabulary. Returns `None` if the token has not been
    set.
    """
    if self._word_delimiter_token is None:
        return None
    return self.convert_tokens_to_ids(self.word_delimiter_token)

@word_delimiter_token.setter
def word_delimiter_token(self, value):
    self._word_delimiter_token = value

@word_delimiter_token_id.setter
def word_delimiter_token_id(self, value):
    # The backing field holds the token string, so an assigned id is mapped
    # back to its token. The previous code called `convert_tokens_to_ids`
    # here, which stored an id where a token was expected.
    self._word_delimiter_token = self.convert_ids_to_tokens(value) if value is not None else None
@property
def phone_delimiter_token(self) -> Optional[str]:
    """
    `Optional[str]`: Phone delimiter token. Returns `None` when it is not set, logging an error first if
    `self.verbose` is true.
    """
    if self._phone_delimiter_token is None:
        if self.verbose:
            logger.error("Using phone_delimiter_token, but it is not set yet.")
        return None
    return str(self._phone_delimiter_token)

@property
def phone_delimiter_token_id(self) -> Optional[int]:
    """
    `Optional[int]`: Id of the phone delimiter token in the vocabulary. Returns `None` if the token has not been
    set.
    """
    if self._phone_delimiter_token is None:
        return None
    return self.convert_tokens_to_ids(self.phone_delimiter_token)

@phone_delimiter_token.setter
def phone_delimiter_token(self, value):
    self._phone_delimiter_token = value

@phone_delimiter_token_id.setter
def phone_delimiter_token_id(self, value):
    # The backing field holds the token string, so an assigned id is mapped
    # back to its token. The previous code called `convert_tokens_to_ids`
    # here, which stored an id where a token was expected.
    self._phone_delimiter_token = self.convert_ids_to_tokens(value) if value is not None else None
def _convert_token_to_id(self, token: str) -> int:
    """Convert a token (str) into an index (int) using the vocabulary, falling back to the id of `unk_token`."""
    vocab = self.encoder
    fallback_id = vocab.get(self.unk_token)
    return vocab.get(token, fallback_id)
def _convert_id_to_token(self, index: int) -> str:
    """Convert an index (int) into a token (str) using the vocabulary, falling back to `unk_token`."""
    return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(
    self,
    tokens: List[str],
    group_tokens: bool = True,
    spaces_between_special_tokens: bool = False,
    filter_word_delimiter_token: bool = True,
    output_char_offsets: bool = False,
) -> Dict[str, Any]:
    """
    Convert connectionist-temporal-classification (CTC) output tokens into a single string.

    Args:
        tokens (`List[str]`):
            Tokens to join. An empty list yields an empty string.
        group_tokens (`bool`, *optional*, defaults to `True`):
            Whether consecutive identical tokens are collapsed into one.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Accepted for interface compatibility; not used by this method.
        filter_word_delimiter_token (`bool`, *optional*, defaults to `True`):
            Whether the word delimiter token is removed from the output.
        output_char_offsets (`bool`, *optional*, defaults to `False`):
            Whether offsets are computed for every kept token.

    Returns:
        `Dict[str, Any]`: `{"text": <space-joined tokens>, "char_offsets": <list of offset dicts or None>}`.

    Raises:
        `ValueError`: If the number of computed offsets differs from the number of kept tokens.
    """
    if group_tokens:
        # Built without `zip(*...)` so that an empty token list produces two
        # empty tuples instead of failing to unpack.
        runs = [(token, sum(1 for _ in run)) for token, run in groupby(tokens)]
        chars = tuple(token for token, _ in runs)
        char_repetitions = tuple(count for _, count in runs)
    else:
        chars = tokens
        char_repetitions = len(tokens) * [1]

    pad_token = self.pad_token
    # Resolved once so that the text filter and the offset filter always agree.
    word_delimiter = self.word_delimiter_token if filter_word_delimiter_token else None

    # The padding token acts as the CTC blank and is always dropped.
    processed_chars = [char for char in chars if char != pad_token]
    if word_delimiter is not None:
        processed_chars = [char for char in processed_chars if char != word_delimiter]

    char_offsets = None
    if output_char_offsets:
        char_offsets = self._compute_offsets(
            char_repetitions, chars, pad_token, word_delimiter_token=word_delimiter
        )

        if len(char_offsets) != len(processed_chars):
            raise ValueError(
                f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
                " have to be of the same length, but are: `len(offsets)`: "
                f"{len(char_offsets)} and `len(processed_tokens)`: {len(processed_chars)}"
            )

        for offset, char in zip(char_offsets, processed_chars):
            offset["char"] = char

    string = " ".join(processed_chars).strip()
    return {"text": string, "char_offsets": char_offsets}
@staticmethod
def _compute_offsets(
    char_repetitions: List[int], chars: List[str], ctc_token: str, word_delimiter_token: Optional[str] = None
) -> List[Dict[str, Union[str, int]]]:
    """
    Compute start/end offsets for every token and drop the entries that are filtered from the text.

    Args:
        char_repetitions (`List[int]`): How many consecutive positions each entry of `chars` covers.
        chars (`List[str]`): The (grouped) tokens, aligned with `char_repetitions`.
        ctc_token (`str`): Token whose entries are always removed (callers pass the padding token).
        word_delimiter_token (`str`, *optional*): When given, entries for this token are removed as well.

    Returns:
        A list of `{"char", "start_offset", "end_offset"}` dictionaries; the end offset of an entry is the start
        offset of the next one.
    """
    # Running totals give each token's end position; shifting them by one
    # (with a leading 0) gives the matching start positions.
    end_indices = np.asarray(char_repetitions).cumsum()
    start_indices = np.concatenate(([0], end_indices[:-1]))

    dropped = {ctc_token}
    if word_delimiter_token is not None:
        dropped.add(word_delimiter_token)

    return [
        {"char": char, "start_offset": start, "end_offset": end}
        for char, start, end in zip(chars, start_indices, end_indices)
        if char not in dropped
    ]
def _decode(
    self,
    token_ids: List[int],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = None,
    group_tokens: bool = True,
    filter_word_delimiter_token: bool = True,
    spaces_between_special_tokens: bool = False,
    output_char_offsets: bool = False,
) -> str:
    """
    Special `_decode` for the phoneme tokenizer: added tokens are treated exactly like tokens of the base
    vocabulary, so `convert_tokens_to_string` is called once on the whole token list instead of handling added
    tokens separately.

    Returns the decoded text, or a `Wav2Vec2PhonemeCTCTokenizerOutput` when `output_char_offsets` is true.
    """
    # NOTE(review): the membership test below compares token *strings* with
    # `all_special_ids`; kept as-is to preserve behavior, but confirm whether
    # `all_special_tokens` was intended.
    kept_tokens = [
        token
        for token in self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
        if not (skip_special_tokens and token in self.all_special_ids)
    ]

    decoded = self.convert_tokens_to_string(
        kept_tokens,
        group_tokens=group_tokens,
        spaces_between_special_tokens=spaces_between_special_tokens,
        filter_word_delimiter_token=filter_word_delimiter_token,
        output_char_offsets=output_char_offsets,
    )
    text = decoded["text"]

    # An explicit argument wins; otherwise fall back to the tokenizer-level setting.
    if clean_up_tokenization_spaces is None:
        clean_up_tokenization_spaces = self.clean_up_tokenization_spaces
    if clean_up_tokenization_spaces:
        text = self.clean_up_tokenization(text)

    if not output_char_offsets:
        return text
    return Wav2Vec2PhonemeCTCTokenizerOutput(text=text, char_offsets=decoded["char_offsets"])
def decode(
    self,
    token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = None,
    output_char_offsets: bool = False,
    **kwargs,
) -> str:
    """
    Convert a sequence of ids into a string, with options to remove special tokens and clean up tokenization
    spaces.

    Args:
        token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
            Tokenized input ids. They are converted with `to_py_obj` before decoding.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding.
        clean_up_tokenization_spaces (`bool`, *optional*):
            Whether or not to clean up the tokenization spaces.
        output_char_offsets (`bool`, *optional*, defaults to `False`):
            Whether or not to output character offsets. Character offsets can be used in combination with the
            sampling rate and model downsampling rate to compute the time stamps of transcribed characters.
        kwargs (additional keyword arguments, *optional*):
            Forwarded to `_decode`.

    Returns:
        `str` or [`Wav2Vec2PhonemeCTCTokenizerOutput`]: The decoded sentence; a
        [`Wav2Vec2PhonemeCTCTokenizerOutput`] when `output_char_offsets == True`.
    """
    python_ids = to_py_obj(token_ids)

    return self._decode(
        token_ids=python_ids,
        skip_special_tokens=skip_special_tokens,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        output_char_offsets=output_char_offsets,
        **kwargs,
    )
def batch_decode(
    self,
    sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = None,
    output_char_offsets: bool = False,
    **kwargs,
) -> Union[List[str], "Wav2Vec2PhonemeCTCTokenizerOutput"]:
    """
    Convert a batch of token id sequences into a list of strings by calling `decode` on every sequence.

    Args:
        sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
            Batch of tokenized input ids. Can be obtained using the `__call__` method.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding.
        clean_up_tokenization_spaces (`bool`, *optional*):
            Whether or not to clean up the tokenization spaces.
        output_char_offsets (`bool`, *optional*, defaults to `False`):
            Whether or not to output character offsets. Character offsets can be used in combination with the
            sampling rate and model downsampling rate to compute the time stamps of transcribed characters.
        kwargs (additional keyword arguments, *optional*):
            Forwarded to `decode` for every sequence.

    Returns:
        `List[str]` or [`Wav2Vec2PhonemeCTCTokenizerOutput`]: The decoded sentences. When
        `output_char_offsets == True`, a single [`Wav2Vec2PhonemeCTCTokenizerOutput`] whose fields are lists with
        one entry per sequence.
    """
    batch_decoded = [
        self.decode(
            seq,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            output_char_offsets=output_char_offsets,
            **kwargs,
        )
        for seq in sequences
    ]

    if not output_char_offsets:
        return batch_decoded

    # An empty batch has no first element to take the field names from.
    if not batch_decoded:
        return Wav2Vec2PhonemeCTCTokenizerOutput(text=[], char_offsets=[])

    # Transpose the list of per-sequence outputs into one output of lists.
    return Wav2Vec2PhonemeCTCTokenizerOutput({k: [d[k] for d in batch_decoded] for k in batch_decoded[0]})
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the vocabulary as sorted, indented JSON into `save_directory`.

    Args:
        save_directory (`str`): Existing directory to write into.
        filename_prefix (`str`, *optional*): When given, prepended to the file name followed by `-`.

    Returns:
        A one-element tuple with the written path, or `None` (after logging an error) when `save_directory` is
        not a directory.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    name_prefix = filename_prefix + "-" if filename_prefix else ""
    target_path = os.path.join(save_directory, name_prefix + VOCAB_FILES_NAMES["vocab_file"])

    with open(target_path, "w", encoding="utf-8") as out_file:
        payload = json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False)
        out_file.write(payload + "\n")

    return (target_path,)
.\models\wav2vec2_phoneme\__init__.py
from typing import TYPE_CHECKING
from ...utils import _LazyModule
# Maps the submodule name to the public names it provides; handed to `_LazyModule` below.
_import_structure = {"tokenization_wav2vec2_phoneme": ["Wav2Vec2PhonemeCTCTokenizer"]}

if TYPE_CHECKING:
    # Under static type checking the tokenizer is imported directly.
    from .tokenization_wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizer
else:
    import sys

    # At runtime this module's entry in `sys.modules` is replaced by a
    # `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\wav2vec2_with_lm\processing_wav2vec2_with_lm.py
"""
Speech processor class for Wav2Vec2
"""
import os
import warnings
from contextlib import contextmanager, nullcontext
from dataclasses import dataclass
from multiprocessing import Pool, get_context, get_start_method
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union
import numpy as np
from ...processing_utils import ProcessorMixin
from ...utils import ModelOutput, logging, requires_backends
# Module-level logger.
logger = logging.get_logger(__name__)

# These names are only needed for the string annotations used in this file,
# so they are imported for type checkers only.
if TYPE_CHECKING:
    from pyctcdecode import BeamSearchDecoderCTC
    from ...feature_extraction_utils import FeatureExtractionMixin
    from ...tokenization_utils import PreTrainedTokenizerBase

# Alias for a list of offset dictionaries (string keys, int or str values).
ListOfDict = List[Dict[str, Union[int, str]]]
@dataclass
class Wav2Vec2DecoderWithLMOutput(ModelOutput):
    """
    Output type of [`Wav2Vec2DecoderWithLM`], with transcription.

    Args:
        text (list of `str` or `str`):
            Decoded logits in text form. Usually the speech transcription.
        logit_score (list of `float` or `float`):
            Total logit score of the beams associated with produced text.
        lm_score (list of `float`):
            Fused lm_score of the beams associated with produced text.
        word_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
            Offsets of the decoded words. In combination with sampling rate and model downsampling rate word offsets
            can be used to compute time stamps for each word.
    """

    text: Union[List[List[str]], List[str], str]
    # The remaining fields default to `None`, hence `Optional`.
    logit_score: Optional[Union[List[List[float]], List[float], float]] = None
    lm_score: Optional[Union[List[List[float]], List[float], float]] = None
    word_offsets: Optional[Union[List[List[ListOfDict]], List[ListOfDict], ListOfDict]] = None
class Wav2Vec2ProcessorWithLM(ProcessorMixin):
    r"""
    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor, a Wav2Vec2 CTC tokenizer and a decoder
    with language model support into a single processor for language model boosted speech recognition decoding.

    Args:
        feature_extractor ([`Wav2Vec2FeatureExtractor`]):
            An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
        tokenizer ([`Wav2Vec2CTCTokenizer`]):
            An instance of [`Wav2Vec2CTCTokenizer`]. The tokenizer is a required input.
        decoder (`pyctcdecode.BeamSearchDecoderCTC`):
            An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input.
    """

    # Class names of the wrapped feature extractor and tokenizer, given as strings.
    feature_extractor_class = "Wav2Vec2FeatureExtractor"
    tokenizer_class = "Wav2Vec2CTCTokenizer"

    def __init__(
        self,
        feature_extractor: "FeatureExtractionMixin",
        tokenizer: "PreTrainedTokenizerBase",
        decoder: "BeamSearchDecoderCTC",
    ):
        """
        Validate the decoder against the tokenizer and store all three components.

        Raises:
            `ValueError`: If `decoder` is not a `BeamSearchDecoderCTC`, or if tokens of the tokenizer vocabulary
            are missing from the decoder's alphabet.
        """
        # Imported here so that pyctcdecode is only needed when a processor is built.
        from pyctcdecode import BeamSearchDecoderCTC

        super().__init__(feature_extractor, tokenizer)

        if not isinstance(decoder, BeamSearchDecoderCTC):
            # Report the expected class itself; `BeamSearchDecoderCTC.__class__`
            # would print the metaclass (`<class 'type'>`).
            raise ValueError(f"`decoder` has to be of type {BeamSearchDecoderCTC}, but is {type(decoder)}")

        # Make sure the decoder's alphabet covers the tokenizer's vocabulary.
        missing_decoder_tokens = self.get_missing_alphabet_tokens(decoder, tokenizer)
        if missing_decoder_tokens:
            raise ValueError(
                f"The tokens {missing_decoder_tokens} are defined in the tokenizer's "
                "vocabulary, but not in the decoder's alphabet. "
                f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet."
            )

        self.decoder = decoder
        # The feature extractor is the active processor outside of `as_target_processor`.
        self.current_processor = self.feature_extractor
        self._in_target_context_manager = False
# Save the processor and its decoder to the given directory.
def save_pretrained(self, save_directory):
    """Run the parent class's `save_pretrained`, then `self.decoder.save_to_dir`, on `save_directory`."""
    super().save_pretrained(save_directory)  # delegate to the parent class first
    self.decoder.save_to_dir(save_directory)  # then let the decoder write into the same directory
# Static helper that sets an attribute on the decoder's language model.
# The stray `@classmethod` that was stacked on top of `@staticmethod` has been
# removed: since Python 3.13 `classmethod` no longer wraps other descriptors,
# so the stacked form would pass the class as the `decoder` argument.
@staticmethod
def _set_language_model_attribute(decoder: "BeamSearchDecoderCTC", attribute: str, value: float):
    """
    Set `attribute` to `value` on the language model stored in `decoder.model_container`.

    Args:
        decoder (`BeamSearchDecoderCTC`): Decoder whose language model (looked up with `decoder._model_key`) is
            modified in place.
        attribute (`str`): Name of the attribute to set.
        value (`float`): New value.
    """
    setattr(decoder.model_container[decoder._model_key], attribute, value)
# Read-only access to the decoder's language model.
@property
def language_model(self):
    """The entry of `self.decoder.model_container` stored under `self.decoder._model_key`."""
    decoder = self.decoder
    return decoder.model_container[decoder._model_key]
@staticmethod
def get_missing_alphabet_tokens(decoder, tokenizer):
    """
    Return the set of tokenizer vocabulary entries that are not labels of the decoder's alphabet.

    Before the comparison, vocabulary entries are rewritten: blank-like tokens become `""`, the word delimiter
    becomes `" "` and unknown-like tokens become pyctcdecode's `UNK_TOKEN`.
    """
    from pyctcdecode.alphabet import BLANK_TOKEN_PTN, UNK_TOKEN, UNK_TOKEN_PTN

    def _as_alphabet_symbol(token):
        # The three checks are independent; when several apply, the last one wins.
        symbol = token
        if BLANK_TOKEN_PTN.match(token):
            symbol = ""
        if token == tokenizer.word_delimiter_token:
            symbol = " "
        if UNK_TOKEN_PTN.match(token):
            symbol = UNK_TOKEN
        return symbol

    vocab_symbols = {_as_alphabet_symbol(token) for token in tokenizer.get_vocab().keys()}
    return vocab_symbols - set(decoder._alphabet.labels)
def __call__(self, *args, **kwargs):
    """
    When used in normal mode, this method forwards the audio to the feature extractor and/or the text to the
    tokenizer and returns their output. If used inside [`~Wav2Vec2ProcessorWithLM.as_target_processor`], all
    arguments are forwarded to the current processor instead.

    The audio may be passed as the first positional argument or as `audio=` (`raw_speech=` is deprecated). When
    both audio and text are given, the tokenizer's `input_ids` are stored under `"labels"` in the feature
    extractor output.

    Raises:
        `ValueError`: If neither audio nor text is provided.
    """
    # For backward compatibility
    if self._in_target_context_manager:
        return self.current_processor(*args, **kwargs)

    if "raw_speech" not in kwargs:
        audio = kwargs.pop("audio", None)
    else:
        warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
        audio = kwargs.pop("raw_speech")
    sampling_rate = kwargs.pop("sampling_rate", None)
    text = kwargs.pop("text", None)

    # A positional audio argument takes precedence over the keyword form.
    if args:
        audio, args = args[0], args[1:]

    has_audio = audio is not None
    has_text = text is not None
    if not (has_audio or has_text):
        raise ValueError("You need to specify either an `audio` or `text` input to process.")

    if has_audio:
        inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
    if has_text:
        encodings = self.tokenizer(text, **kwargs)

    if not has_text:
        return inputs
    if not has_audio:
        return encodings

    inputs["labels"] = encodings["input_ids"]
    return inputs
# Padding entry point of the processor.
def pad(self, *args, **kwargs):
    """
    When used in normal mode, this method pads `input_features` with the feature extractor and/or `labels` with
    the tokenizer. If used inside [`~Wav2Vec2ProcessorWithLM.as_target_processor`], all arguments are forwarded
    to the current processor's `pad` instead.

    `input_features` may be passed as the first positional argument. When both inputs are given, the padded
    labels' `input_ids` are stored under `"labels"` in the padded features, which are returned.
    """
    if self._in_target_context_manager:
        return self.current_processor.pad(*args, **kwargs)

    features = kwargs.pop("input_features", None)
    label_batch = kwargs.pop("labels", None)

    # A positional first argument takes precedence over the keyword form.
    if args:
        features, args = args[0], args[1:]

    if features is not None:
        features = self.feature_extractor.pad(features, *args, **kwargs)
    if label_batch is not None:
        label_batch = self.tokenizer.pad(label_batch, **kwargs)

    if label_batch is None:
        return features
    if features is None:
        return label_batch

    features["labels"] = label_batch["input_ids"]
    return features
# Batch-decoding entry point for logits.
def batch_decode(
    self,
    logits: np.ndarray,
    pool: Optional[Pool] = None,
    num_processes: Optional[int] = None,
    beam_width: Optional[int] = None,
    beam_prune_logp: Optional[float] = None,
    token_min_logp: Optional[float] = None,
    hotwords: Optional[Iterable[str]] = None,
    hotword_weight: Optional[float] = None,
    alpha: Optional[float] = None,
    beta: Optional[float] = None,
    unk_score_offset: Optional[float] = None,
    lm_score_boundary: Optional[bool] = None,
    output_word_offsets: bool = False,
    n_best: int = 1,
):
    # Meant to batch-decode `logits` and return the result.
    # NOTE(review): the implementation is not present in this copy of the file;
    # as written the method ignores its arguments and returns `None`.
    pass
# Decoding entry point for the logits of a single example.
def decode(
    self,
    logits: np.ndarray,
    beam_width: Optional[int] = None,
    beam_prune_logp: Optional[float] = None,
    token_min_logp: Optional[float] = None,
    hotwords: Optional[Iterable[str]] = None,
    hotword_weight: Optional[float] = None,
    alpha: Optional[float] = None,
    beta: Optional[float] = None,
    unk_score_offset: Optional[float] = None,
    lm_score_boundary: Optional[bool] = None,
    output_word_offsets: bool = False,
    n_best: int = 1,
):
    # Meant to decode `logits` and return the result.
    # NOTE(review): the implementation is not present in this copy of the file;
    # as written the method ignores its arguments and returns `None`.
    pass
@contextmanager
def as_target_processor(self):
    """
    Temporarily sets the processor for processing the target. Useful for encoding the labels when fine-tuning
    Wav2Vec2.

    Inside the `with` block the tokenizer is the current processor; on exit, including when the block raises,
    the feature extractor is restored.
    """
    # Deprecated in favor of the `text` argument of `__call__`.
    warnings.warn(
        "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
        "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
        "your audio inputs, or in a separate call."
    )
    self._in_target_context_manager = True
    self.current_processor = self.tokenizer
    try:
        yield
    finally:
        # Restore normal mode even if the body of the `with` block raised;
        # otherwise the processor would stay stuck in target mode.
        self.current_processor = self.feature_extractor
        self._in_target_context_manager = False
.\models\wav2vec2_with_lm\__init__.py
from typing import TYPE_CHECKING
from ...utils import _LazyModule
# Maps the submodule name to the public names it provides; handed to `_LazyModule` below.
_import_structure = {"processing_wav2vec2_with_lm": ["Wav2Vec2ProcessorWithLM"]}

if TYPE_CHECKING:
    # Under static type checking the processor is imported directly.
    from .processing_wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
else:
    import sys

    # At runtime this module's entry in `sys.modules` is replaced by a
    # `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\wavlm\configuration_wavlm.py
"""
WavLM model configuration
This module contains the configuration class `WavLMConfig` which defines the model architecture
and inherits from `PretrainedConfig`.
"""
import functools
import operator
from ...configuration_utils import PretrainedConfig
from ...utils import logging
# Module-level logger.
logger = logging.get_logger(__name__)

# Maps a checkpoint name to the URL of its `config.json`.
WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/wavlm-base": "https://huggingface.co/microsoft/wavlm-base/resolve/main/config.json",
}
class WavLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`WavLMModel`]. It is used to instantiate an WavLM
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the WavLM
    [microsoft/wavlm-base](https://huggingface.co/microsoft/wavlm-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every keyword argument of `__init__` is stored on the instance under the same name. `conv_dim`, `conv_stride`
    and `conv_kernel` must have the same length, which is exposed as `num_feat_extract_layers`.

    Example:

    ```python
    >>> from transformers import WavLMConfig, WavLMModel

    >>> # Initializing a WavLM microsoft/wavlm-base style configuration
    >>> configuration = WavLMConfig()

    >>> # Initializing a model (with random weights) from the microsoft/wavlm-base style configuration
    >>> model = WavLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "wavlm"

    def __init__(
        self,
        vocab_size=32,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout=0.1,
        activation_dropout=0.1,
        attention_dropout=0.1,
        feat_proj_dropout=0.0,
        final_dropout=0.1,
        layerdrop=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        feat_extract_norm="group",
        feat_extract_activation="gelu",
        conv_dim=(512, 512, 512, 512, 512, 512, 512),
        conv_stride=(5, 2, 2, 2, 2, 2, 2),
        conv_kernel=(10, 3, 3, 3, 3, 2, 2),
        conv_bias=False,
        num_conv_pos_embeddings=128,
        num_conv_pos_embedding_groups=16,
        num_buckets=320,
        max_bucket_distance=800,
        do_stable_layer_norm=False,
        apply_spec_augment=True,
        mask_time_prob=0.05,
        mask_time_length=10,
        mask_time_min_masks=2,
        mask_feature_prob=0.0,
        mask_feature_length=10,
        num_codevectors_per_group=320,
        num_codevector_groups=2,
        contrastive_logits_temperature=0.1,
        num_negatives=100,
        codevector_dim=256,
        proj_codevector_dim=256,
        diversity_loss_weight=0.1,
        ctc_loss_reduction="mean",
        ctc_zero_infinity=False,
        use_weighted_layer_sum=False,
        classifier_proj_size=256,
        tdnn_dim=(512, 512, 512, 512, 1500),
        tdnn_kernel=(5, 3, 3, 1, 1),
        tdnn_dilation=(1, 2, 3, 1, 1),
        xvector_output_dim=512,
        num_ctc_classes=80,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        add_adapter=False,
        adapter_kernel_size=3,
        adapter_stride=2,
        num_adapter_layers=3,
        output_hidden_size=None,
        **kwargs,
    ):
        # NOTE(review): the constructor body was missing from this copy of the
        # file (the signature was followed directly by `@property`). It has
        # been restored by storing every argument under its own name; confirm
        # the details against the released library.
        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)

        # Transformer encoder
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout = hidden_dropout
        self.activation_dropout = activation_dropout
        self.attention_dropout = attention_dropout
        self.feat_proj_dropout = feat_proj_dropout
        self.final_dropout = final_dropout
        self.layerdrop = layerdrop
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.do_stable_layer_norm = do_stable_layer_norm
        self.num_buckets = num_buckets
        self.max_bucket_distance = max_bucket_distance

        # Convolutional feature extractor. The tuples are copied into lists so
        # that the defaults are never shared between instances.
        self.feat_extract_norm = feat_extract_norm
        self.feat_extract_activation = feat_extract_activation
        self.conv_dim = list(conv_dim)
        self.conv_stride = list(conv_stride)
        self.conv_kernel = list(conv_kernel)
        self.conv_bias = conv_bias
        self.num_conv_pos_embeddings = num_conv_pos_embeddings
        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
        self.num_feat_extract_layers = len(self.conv_dim)

        if not (len(self.conv_dim) == len(self.conv_stride) == len(self.conv_kernel)):
            raise ValueError(
                "Configuration for convolutional layers is incorrect. `conv_dim`, `conv_stride` and `conv_kernel`"
                f" must have the same length, but got `len(conv_dim) = {len(self.conv_dim)}`,"
                f" `len(conv_stride) = {len(self.conv_stride)}`, `len(conv_kernel) = {len(self.conv_kernel)}`."
            )

        # SpecAugment masking
        self.apply_spec_augment = apply_spec_augment
        self.mask_time_prob = mask_time_prob
        self.mask_time_length = mask_time_length
        self.mask_time_min_masks = mask_time_min_masks
        self.mask_feature_prob = mask_feature_prob
        self.mask_feature_length = mask_feature_length

        # Quantizer / contrastive pre-training
        self.num_codevectors_per_group = num_codevectors_per_group
        self.num_codevector_groups = num_codevector_groups
        self.contrastive_logits_temperature = contrastive_logits_temperature
        self.num_negatives = num_negatives
        self.codevector_dim = codevector_dim
        self.proj_codevector_dim = proj_codevector_dim
        self.diversity_loss_weight = diversity_loss_weight

        # CTC head
        self.ctc_loss_reduction = ctc_loss_reduction
        self.ctc_zero_infinity = ctc_zero_infinity
        self.num_ctc_classes = num_ctc_classes

        # Adapter
        self.add_adapter = add_adapter
        self.adapter_kernel_size = adapter_kernel_size
        self.adapter_stride = adapter_stride
        self.num_adapter_layers = num_adapter_layers
        # Falls back to `hidden_size` when no explicit output size is given.
        self.output_hidden_size = output_hidden_size or hidden_size

        # Classification / x-vector heads
        self.use_weighted_layer_sum = use_weighted_layer_sum
        self.classifier_proj_size = classifier_proj_size
        self.tdnn_dim = list(tdnn_dim)
        self.tdnn_kernel = list(tdnn_kernel)
        self.tdnn_dilation = list(tdnn_dilation)
        self.xvector_output_dim = xvector_output_dim

    @property
    def inputs_to_logits_ratio(self):
        """Product of all entries of `conv_stride` (1 when `conv_stride` is empty)."""
        return functools.reduce(operator.mul, self.conv_stride, 1)
.\models\wavlm\convert_wavlm_original_pytorch_checkpoint_to_pytorch.py
"""Convert WavLM checkpoint."""
import argparse
import torch
from unilm.wavlm.WavLM import WavLM as WavLMOrig
from unilm.wavlm.WavLM import WavLMConfig as WavLMConfigOrig
from transformers import WavLMConfig, WavLMModel, logging
# Emit INFO-level log records so every converted weight is reported.
logging.set_verbosity_info()
logger = logging.get_logger(__name__)

# Maps a fragment of an original-checkpoint parameter name (key) to the dotted attribute path
# of the corresponding parameter on the Hugging Face model (value). A "*" in the value is a
# placeholder that `recursively_load_weights` replaces with the encoder layer index parsed
# from the original parameter name.
MAPPING = {
    "post_extract_proj": "feature_projection.projection",
    "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
    "self_attn.k_proj": "encoder.layers.*.attention.k_proj",
    "self_attn.v_proj": "encoder.layers.*.attention.v_proj",
    "self_attn.q_proj": "encoder.layers.*.attention.q_proj",
    "self_attn.out_proj": "encoder.layers.*.attention.out_proj",
    "self_attn.grep_linear": "encoder.layers.*.attention.gru_rel_pos_linear",
    "self_attn.relative_attention_bias": "encoder.layers.*.attention.rel_attn_embed",
    "self_attn.grep_a": "encoder.layers.*.attention.gru_rel_pos_const",
    "self_attn_layer_norm": "encoder.layers.*.layer_norm",
    "fc1": "encoder.layers.*.feed_forward.intermediate_dense",
    "fc2": "encoder.layers.*.feed_forward.output_dense",
    "final_layer_norm": "encoder.layers.*.final_layer_norm",
    "encoder.layer_norm": "encoder.layer_norm",
    "w2v_model.layer_norm": "feature_projection.layer_norm",
    "quantizer.weight_proj": "quantizer.weight_proj",
    "quantizer.vars": "quantizer.codevectors",
    "project_q": "project_q",
    "final_proj": "project_hid",
    "w2v_encoder.proj": "ctc_proj",
    "mask_emb": "masked_spec_embed",
}
# NOTE(review): not referenced by any function visible in this script; presumably kept for
# parity with the wav2vec2 conversion script -- confirm before removing.
TOP_LEVEL_KEYS = [
    "ctc_proj",
    "quantizer.weight_proj",
    "quantizer.codevectors",
    "project_q",
    "project_hid",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
    """Copy `value` into the parameter reached by following the dotted path `key`.

    Args:
        hf_pointer: Root object (the Hugging Face model) the attribute walk starts from.
        key: Dotted attribute path, e.g. ``"encoder.layers.0.attention.k_proj"``.
        value: Tensor-like object exposing ``.shape`` that becomes the new ``.data``.
        full_name: Name of the weight in the original checkpoint (used in messages only).
        weight_type: One of ``"weight"``, ``"weight_g"``, ``"weight_v"``, ``"bias"`` to address
            that attribute of the resolved object, or ``None`` when the resolved object is
            itself the parameter.

    Raises:
        AssertionError: If the shape of the target differs from ``value.shape``.
    """
    for attribute in key.split("."):
        hf_pointer = getattr(hf_pointer, attribute)

    # Build the target's display name once. The previous inline expression
    # `key + '.' + weight_type if weight_type is not None else ''` collapsed to an empty
    # string whenever `weight_type` was None, so messages did not name the parameter.
    target_name = key if weight_type is None else f"{key}.{weight_type}"

    if weight_type is not None:
        hf_shape = getattr(hf_pointer, weight_type).shape
    else:
        hf_shape = hf_pointer.shape

    assert hf_shape == value.shape, (
        f"Shape of hf {target_name} is {hf_shape}, but should be"
        f" {value.shape} for {full_name}"
    )

    if weight_type in ("weight", "weight_g", "weight_v", "bias"):
        getattr(hf_pointer, weight_type).data = value
    else:
        # No (or an unrecognised) weight type: the resolved object itself is the parameter.
        hf_pointer.data = value

    logger.info(f"{target_name} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model):
    """Copy every entry of `fairseq_model.state_dict()` into `hf_model`.

    Entries whose name contains ``"conv_layers"`` are handed to `load_conv_layer`; all other
    entries are matched against `MAPPING` and written with `set_recursively`. Names that
    match nothing are collected and reported in a single warning at the end.
    """
    unused_weights = []
    fairseq_dict = fairseq_model.state_dict()

    feature_extractor = hf_model.feature_extractor

    for name, value in fairseq_dict.items():
        is_used = False
        if "conv_layers" in name:
            load_conv_layer(
                name,
                value,
                feature_extractor,
                unused_weights,
                hf_model.config.feat_extract_norm == "group",
            )
            # `load_conv_layer` records weights it cannot place in `unused_weights` itself.
            is_used = True
        else:
            for key, mapped_key in MAPPING.items():
                # Match when the key occurs anywhere in the name, or when the key (with any
                # "w2v_model." prefix dropped) equals the first dotted component of the name.
                if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
                    is_used = True
                    if "*" in mapped_key:
                        # The layer index is the dotted component right before `key` in `name`.
                        layer_index = name.split(key)[0].split(".")[-2]
                        mapped_key = mapped_key.replace("*", layer_index)
                    # Order matters: "weight_g"/"weight_v" also contain "weight", and
                    # "relative_attention_bias" contains "bias" without being a bias term.
                    if "weight_g" in name:
                        weight_type = "weight_g"
                    elif "weight_v" in name:
                        weight_type = "weight_v"
                    elif "bias" in name and "relative_attention_bias" not in name:
                        weight_type = "bias"
                    elif "weight" in name:
                        weight_type = "weight"
                    else:
                        weight_type = None

                    set_recursively(hf_model, mapped_key, value, name, weight_type)
                continue
        if not is_used:
            unused_weights.append(name)

    logger.warning(f"Unused weights: {unused_weights}")
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
    """Copy one feature-extractor weight from the original checkpoint into `feature_extractor`.

    The part of `full_name` after ``"conv_layers."`` is parsed as
    ``"<layer_id>.<type_id>.<...>"``. A ``type_id`` of 0 addresses the layer's ``conv``
    module; a ``type_id`` of 2 addresses its ``layer_norm`` module, but with group norm only
    for layer 0. Anything else is appended to `unused_weights`.

    Args:
        full_name: Parameter name in the original checkpoint.
        value: Tensor-like object exposing ``.shape`` that becomes the new ``.data``.
        feature_extractor: Object exposing an indexable ``conv_layers`` attribute.
        unused_weights: List that receives `full_name` when the weight has no destination.
        use_group_norm: Whether the target feature extractor uses group normalization.

    Raises:
        AssertionError: If the destination shape differs from ``value.shape``.
    """
    name = full_name.split("conv_layers.")[-1]
    items = name.split(".")
    layer_id = int(items[0])
    type_id = int(items[1])

    if type_id == 0:
        module_attr = "conv"
    elif type_id == 2 and (not use_group_norm or layer_id == 0):
        module_attr = "layer_norm"
    else:
        unused_weights.append(full_name)
        return

    if "bias" in name:
        param_name = "bias"
    elif "weight" in name:
        param_name = "weight"
    else:
        # Neither a bias nor a weight: nothing to copy (and not reported as unused).
        return

    # Always go through `conv_layers[...]`: the layer-norm assertion messages used to index
    # `feature_extractor[layer_id]` directly, which raised a TypeError instead of reporting
    # the shape mismatch.
    module = getattr(feature_extractor.conv_layers[layer_id], module_attr)
    param = getattr(module, param_name)
    assert value.shape == param.data.shape, (
        f"{full_name} has size {value.shape}, but {param.data.shape} was found."
    )
    param.data = value

    if module_attr == "conv":
        logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
    else:
        logger.info(f"Feat extract layer norm {param_name} of layer {layer_id} was initialized from {full_name}.")
@torch.no_grad()
def convert_wavlm_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    """Convert an original WavLM checkpoint into a Hugging Face `WavLMModel` and save it.

    Args:
        checkpoint_path: Path to the original checkpoint; it must contain the keys
            ``"cfg"`` and ``"model"``.
        pytorch_dump_folder_path: Directory passed to `save_pretrained`.
        config_path: Optional path to a Hugging Face config to load with
            `WavLMConfig.from_pretrained`; a default `WavLMConfig()` is used when omitted.
    """
    # Load onto the CPU so checkpoints saved from a GPU also convert on CPU-only machines
    # (same as the S3PRL conversion script).
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    cfg = WavLMConfigOrig(checkpoint["cfg"])
    model = WavLMOrig(cfg)
    model.load_state_dict(checkpoint["model"])
    model.eval()

    if config_path is not None:
        config = WavLMConfig.from_pretrained(config_path)
    else:
        config = WavLMConfig()

    hf_wavlm = WavLMModel(config)

    recursively_load_weights(model, hf_wavlm)

    hf_wavlm.save_pretrained(pytorch_dump_folder_path)
# Command-line entry point: parse the three paths and run the conversion.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    args = parser.parse_args()
    # Note the argument order: the checkpoint path comes first, then the output folder.
    convert_wavlm_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)
.\models\wavlm\convert_wavlm_original_s3prl_checkpoint_to_pytorch.py
import argparse
import torch
from transformers import (
Wav2Vec2FeatureExtractor,
WavLMConfig,
WavLMForAudioFrameClassification,
WavLMForSequenceClassification,
WavLMForXVector,
logging,
)
# Emit INFO-level log records while converting, and create this module's logger.
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def convert_classification(base_model_name, hf_config, downstream_dict):
    """Build a `WavLMForSequenceClassification` model and load the S3PRL head weights into it.

    The projector is filled from the ``projector.*`` entries of `downstream_dict` and the
    classifier from the ``model.post_net.linear.*`` entries. Returns the model.
    """
    model = WavLMForSequenceClassification.from_pretrained(base_model_name, config=hf_config)

    # (attribute on the HF model, key prefix in the S3PRL downstream dict)
    head_sources = (
        ("projector", "projector"),
        ("classifier", "model.post_net.linear"),
    )
    for attr_name, source_prefix in head_sources:
        head = getattr(model, attr_name)
        head.weight.data = downstream_dict[f"{source_prefix}.weight"]
        head.bias.data = downstream_dict[f"{source_prefix}.bias"]

    return model
def convert_diarization(base_model_name, hf_config, downstream_dict):
    """Build a `WavLMForAudioFrameClassification` model and load the S3PRL classifier into it.

    The classifier weight and bias come from the ``model.linear.*`` entries of
    `downstream_dict`. Returns the model.
    """
    model = WavLMForAudioFrameClassification.from_pretrained(base_model_name, config=hf_config)

    classifier = model.classifier
    classifier.weight.data = downstream_dict["model.linear.weight"]
    classifier.bias.data = downstream_dict["model.linear.bias"]

    return model
def convert_xvector(base_model_name, hf_config, downstream_dict):
    """Build a `WavLMForXVector` model and load the S3PRL x-vector head weights into it.

    Fills, in order: the projector (``connector.*``), one TDNN kernel per entry of
    `hf_config.tdnn_kernel`, the two utterance-level linear layers, and the objective
    weight (``objective.W``). Returns the model.
    """

    def _load_linear(module, source_prefix):
        # Copy a weight/bias pair stored under `source_prefix` into `module`.
        module.weight.data = downstream_dict[f"{source_prefix}.weight"]
        module.bias.data = downstream_dict[f"{source_prefix}.bias"]

    model = WavLMForXVector.from_pretrained(base_model_name, config=hf_config)
    _load_linear(model.projector, "connector")

    # Only the number of TDNN layers matters here, not the kernel sizes themselves.
    for layer_idx, _ in enumerate(hf_config.tdnn_kernel):
        _load_linear(model.tdnn[layer_idx].kernel, f"model.framelevel_feature_extractor.module.{layer_idx}.kernel")

    _load_linear(model.feature_extractor, "model.utterancelevel_feature_extractor.linear1")
    _load_linear(model.classifier, "model.utterancelevel_feature_extractor.linear2")
    model.objective.weight.data = downstream_dict["objective.W"]

    return model
@torch.no_grad()
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path):
    """
    Copy/paste/tweak model's weights to transformers design.

    Args:
        base_model_name: Name of the Hugging Face pretrained base model; used to load both
            the feature extractor and the model that receives the downstream head weights.
        config_path: Path to the Hugging Face config of the downstream model. Its first
            entry in `architectures` selects which conversion routine is used.
        checkpoint_path: Path to the S3PRL checkpoint; it must contain a ``"Downstream"``
            entry (and ``"Featurizer"`` when `use_weighted_layer_sum` is enabled).
        model_dump_path: Directory where the feature extractor and model are saved.

    Raises:
        NotImplementedError: If the configured architecture is not a sequence
            classification, audio frame classification or x-vector model.
    """
    # Load on the CPU so the conversion also works without a GPU.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    downstream_dict = checkpoint["Downstream"]

    hf_config = WavLMConfig.from_pretrained(config_path)
    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        base_model_name, return_attention_mask=True, do_normalize=False
    )

    # Dispatch on the suffix of the first declared architecture name.
    arch = hf_config.architectures[0]
    if arch.endswith("ForSequenceClassification"):
        hf_model = convert_classification(base_model_name, hf_config, downstream_dict)
    elif arch.endswith("ForAudioFrameClassification"):
        hf_model = convert_diarization(base_model_name, hf_config, downstream_dict)
    elif arch.endswith("ForXVector"):
        hf_model = convert_xvector(base_model_name, hf_config, downstream_dict)
    else:
        raise NotImplementedError(f"S3PRL weights conversion is not supported for {arch}")

    # The learned per-layer mixing weights live in the checkpoint's "Featurizer" entry.
    if hf_config.use_weighted_layer_sum:
        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]

    hf_feature_extractor.save_pretrained(model_dump_path)
    hf_model.save_pretrained(model_dump_path)
# Command-line entry point: only runs when the script is executed directly.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Name of the Hugging Face pretrained base model.
    parser.add_argument(
        "--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model."
    )
    # Path to the Hugging Face config of the downstream (classifier) model.
    parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.")
    # Path to the S3PRL checkpoint to convert.
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.")
    # Output directory for the converted model.
    parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.")
    args = parser.parse_args()
    convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path)
.\models\wavlm\modeling_wavlm.py
""" PyTorch WavLM模型。"""
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
BaseModelOutput,
CausalLMOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
Wav2Vec2BaseModelOutput,
XVectorOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_peft_available,
logging,
)
from .configuration_wavlm import WavLMConfig
logger = logging.get_logger(__name__)

# NOTE(review): presumably the index at which hidden states start in the model's output
# tuple -- confirm at the use sites, which are outside this excerpt.
_HIDDEN_STATES_START_POSITION = 2

# NOTE(review): the constants below look like checkpoint names and expected values used by
# the docstring code samples (`add_code_sample_docstrings`) -- confirm at the use sites.
_CONFIG_FOR_DOC = "WavLMConfig"

_CHECKPOINT_FOR_DOC = "patrickvonplaten/wavlm-libri-clean-100h-base-plus"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]

_CTC_EXPECTED_OUTPUT = "'mister quilter is the aposle of the middle classes and we are glad to welcome his gospel'"
_CTC_EXPECTED_LOSS = 12.51

_FRAME_CLASS_CHECKPOINT = "microsoft/wavlm-base-plus-sd"
_FRAME_EXPECTED_OUTPUT = [0, 0]

_XVECTOR_CHECKPOINT = "microsoft/wavlm-base-plus-sv"
_XVECTOR_EXPECTED_OUTPUT = 0.97

# Hub identifiers of pretrained WavLM checkpoints.
WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/wavlm-base",
    "microsoft/wavlm-base-plus",
    "microsoft/wavlm-large",
]
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
计算给定形状的随机掩码跨度。用于实现《SpecAugment: 一种用于ASR的简单数据增强方法》。
请注意,此方法未经优化以在TPU上运行,应作为训练期间的预处理步骤在CPU上运行。
"""
pass
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
# 解包 shape 参数
batch_size, sequence_length = shape
# 检查 mask_length 是否合法
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
# 检查 mask_length 是否小于 sequence_length
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
# epsilon 用于概率性取整
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
# 计算应该遮罩的 span 数量
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
# 确保遮罩的 span 数量不小于 min_masks
num_masked_span = max(num_masked_span, min_masks)
# 确保遮罩的 span 数量不超过 sequence_length
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
# 确保遮罩的 span 数量不超过 input_length - (mask_length - 1)
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
# 计算每个 batch 中的实际长度列表
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
# 创建一个全零的布尔掩码数组
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
# 计算最大允许的遮罩 span 数量
max_num_masked_span = compute_num_masked_span(sequence_length)
# 如果最大允许的遮罩 span 数量为 0,则直接返回全零的掩码数组
if max_num_masked_span == 0:
return spec_aug_mask
# 遍历输入长度列表中的每个长度
for input_length in input_lengths:
# 计算当前输入长度下要生成的被遮盖区间数量
num_masked_span = compute_num_masked_span(input_length)
# 随机选择要遮盖的起始索引
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
# 选择第一个被抽样的索引作为填充向量的虚拟索引,确保所有批次具有相同的维度
# 这是由于概率舍入而产生的
if len(spec_aug_mask_idx) == 0:
# 如果没有选择到任何索引,说明输入长度严格小于序列长度,此时最后一个标记应该是填充标记
# 我们可以使用它作为虚拟遮盖标识符的索引
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
# 将虚拟索引添加到遮盖索引数组中,以确保所有批次的数组长度相同
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
# 将遮盖索引列表转换为 NumPy 数组
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
# 将遮盖索引扩展为遮盖区间
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
# 将数组形状重新调整为批次大小乘以最大遮盖区间数乘以遮盖长度
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# 对起始索引添加偏移量,以便索引现在表示一个区间
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
# 确保索引不会超过序列长度
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
# 使用散点方法将索引应用到遮盖向量中
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
# 返回生成的遮盖向量
return spec_aug_mask
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->WavLM
class WavLMNoLayerNormConvLayer(nn.Module):
    """One feature-encoder stage: a 1D convolution followed by an activation (no normalization)."""

    def __init__(self, config, layer_id=0):
        super().__init__()
        # Layer 0 has a single input channel; later layers take the previous layer's width.
        self.in_conv_dim = 1 if layer_id <= 0 else config.conv_dim[layer_id - 1]
        self.out_conv_dim = config.conv_dim[layer_id]

        conv_kwargs = {
            "kernel_size": config.conv_kernel[layer_id],
            "stride": config.conv_stride[layer_id],
            "bias": config.conv_bias,
        }
        self.conv = nn.Conv1d(self.in_conv_dim, self.out_conv_dim, **conv_kwargs)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        """Apply the convolution, then the activation, and return the result."""
        return self.activation(self.conv(hidden_states))
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->WavLM
class WavLMLayerNormConvLayer(nn.Module):
    """One feature-encoder stage: 1D convolution, layer normalization, then an activation."""

    def __init__(self, config, layer_id=0):
        super().__init__()
        # Layer 0 has a single input channel; later layers take the previous layer's width.
        self.in_conv_dim = 1 if layer_id <= 0 else config.conv_dim[layer_id - 1]
        self.out_conv_dim = config.conv_dim[layer_id]

        conv_kwargs = {
            "kernel_size": config.conv_kernel[layer_id],
            "stride": config.conv_stride[layer_id],
            "bias": config.conv_bias,
        }
        self.conv = nn.Conv1d(self.in_conv_dim, self.out_conv_dim, **conv_kwargs)
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        """Convolve, normalize over the channel axis, activate, and return the result."""
        convolved = self.conv(hidden_states)
        # LayerNorm normalizes the last axis, so swap the last two axes around the call.
        normalized = self.layer_norm(convolved.transpose(-2, -1)).transpose(-2, -1)
        return self.activation(normalized)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->WavLM
class WavLMGroupNormConvLayer(nn.Module):
    """One feature-encoder stage: 1D convolution, group normalization, then an activation."""

    def __init__(self, config, layer_id=0):
        super().__init__()
        # Layer 0 has a single input channel; later layers take the previous layer's width.
        self.in_conv_dim = 1 if layer_id <= 0 else config.conv_dim[layer_id - 1]
        self.out_conv_dim = config.conv_dim[layer_id]

        conv_kwargs = {
            "kernel_size": config.conv_kernel[layer_id],
            "stride": config.conv_stride[layer_id],
            "bias": config.conv_bias,
        }
        self.conv = nn.Conv1d(self.in_conv_dim, self.out_conv_dim, **conv_kwargs)
        self.activation = ACT2FN[config.feat_extract_activation]

        # One group per output channel.
        channels = self.out_conv_dim
        self.layer_norm = nn.GroupNorm(num_groups=channels, num_channels=channels, affine=True)

    def forward(self, hidden_states):
        """Convolve, group-normalize, activate, and return the result."""
        return self.activation(self.layer_norm(self.conv(hidden_states)))
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->WavLM
class WavLMPositionalConvEmbedding(nn.Module):
    """Convolutional positional embedding: a weight-normalized grouped 1D convolution,
    followed by trimming of the surplus padding frame and an activation."""

    def __init__(self, config):
        super().__init__()
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
        )

        # Prefer the parametrization-based weight norm when this torch version provides it.
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        if is_deepspeed_zero3_enabled():
            import deepspeed

            # Under ZeRO-3 the weight is partitioned; gather it while applying weight norm.
            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)

            # The parametrization-based weight norm stores the magnitude and direction as
            # `parametrizations.weight.original0` / `original1`; only the legacy API creates
            # `weight_g` / `weight_v`. Reading the legacy names unconditionally raised
            # AttributeError with the new API.
            if hasattr(self.conv, "parametrizations"):
                weight_g = self.conv.parametrizations.weight.original0
                weight_v = self.conv.parametrizations.weight.original1
            else:
                weight_g = self.conv.weight_g
                weight_v = self.conv.weight_v
            deepspeed.zero.register_external_parameter(self, weight_v)
            deepspeed.zero.register_external_parameter(self, weight_g)
        else:
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        self.padding = WavLMSamePadLayer(config.num_conv_pos_embeddings)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        """Return the positional embedding computed from `hidden_states`.

        Axes 1 and 2 are swapped before the convolution and swapped back afterwards.
        """
        hidden_states = hidden_states.transpose(1, 2)

        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->WavLM
class WavLMSamePadLayer(nn.Module):
    """Drop the trailing frame of the last axis when the kernel size is even."""

    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        # An even kernel size means one trailing frame has to be removed; an odd one, none.
        kernel_is_even = num_conv_pos_embeddings % 2 == 0
        self.num_pad_remove = int(kernel_is_even)

    def forward(self, hidden_states):
        """Return `hidden_states` with the last `num_pad_remove` frames sliced off."""
        if self.num_pad_remove <= 0:
            return hidden_states
        return hidden_states[:, :, : -self.num_pad_remove]
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->WavLM
class WavLMFeatureEncoder(nn.Module):
    """Stack of convolutional layers that turns a raw waveform into feature frames."""

    def __init__(self, config):
        super().__init__()

        num_layers = config.num_feat_extract_layers
        if config.feat_extract_norm == "group":
            # Only the first layer is group-normalized; the rest have no normalization.
            layers = [WavLMGroupNormConvLayer(config, layer_id=0)]
            layers.extend(WavLMNoLayerNormConvLayer(config, layer_id=idx) for idx in range(1, num_layers))
        elif config.feat_extract_norm == "layer":
            layers = [WavLMLayerNormConvLayer(config, layer_id=idx) for idx in range(num_layers)]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )

        self.conv_layers = nn.ModuleList(layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        """Disable gradients for every parameter and remember that the encoder is frozen."""
        for parameter in self.parameters():
            parameter.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        """Run `input_values` through all convolutional layers and return the result."""
        # Insert a channel axis after the batch axis.
        hidden_states = input_values[:, None]

        trainable = self._requires_grad and self.training
        if trainable:
            # make sure hidden_states require grad for gradient_checkpointing
            hidden_states.requires_grad = True

        use_checkpointing = trainable and self.gradient_checkpointing
        for layer in self.conv_layers:
            if use_checkpointing:
                hidden_states = self._gradient_checkpointing_func(layer.__call__, hidden_states)
            else:
                hidden_states = layer(hidden_states)

        return hidden_states
class WavLMFeatureExtractor(WavLMFeatureEncoder):
    """Deprecated alias of its parent class; constructing it emits a `FutureWarning`."""

    def __init__(self, config):
        super().__init__(config)
        # Both class names are looked up dynamically so subclasses report their own names.
        deprecation_message = (
            f"The class `{self.__class__.__name__}` has been depreciated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->WavLM
class WavLMFeatureProjection(nn.Module):
    """Layer-normalize the encoder features and project them to the model's hidden size."""

    def __init__(self, config):
        super().__init__()
        feature_dim = config.conv_dim[-1]
        self.layer_norm = nn.LayerNorm(feature_dim, eps=config.layer_norm_eps)
        self.projection = nn.Linear(feature_dim, config.hidden_size)
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        """Return `(projected, normalized)`.

        `normalized` is the layer-normalized input (before projection); `projected` is that
        tensor after the linear projection and dropout.
        """
        normalized = self.layer_norm(hidden_states)
        projected = self.dropout(self.projection(normalized))
        return projected, normalized
class WavLMAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper, extended with a
    bucketed relative position bias that is gated per head and per query position."""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        num_buckets: int = 320,
        max_distance: int = 800,
        has_relative_position_bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5

        # Linear projections for keys, values, queries and the attention output.
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        self.num_buckets = num_buckets
        self.max_distance = max_distance

        # Parameters of the gate applied to the relative position bias (see `forward`).
        self.gru_rel_pos_const = nn.Parameter(torch.ones(1, self.num_heads, 1, 1))
        self.gru_rel_pos_linear = nn.Linear(self.head_dim, 8)

        # The embedding table only exists on layers that own the relative position bias;
        # `compute_bias` requires it.
        if has_relative_position_bias:
            self.rel_attn_embed = nn.Embedding(self.num_buckets, self.num_heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        index=0,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Attention layer with relative attention

        Returns `(attn_output, attn_weights, position_bias)`. The returned `position_bias`
        is the ungated bias (either the one passed in or the one computed here), so a caller
        can hand it to another layer. `index` is accepted but not used in this method.
        """
        bsz, tgt_len, _ = hidden_states.size()

        # first pass of attention layer creates position bias
        if position_bias is None:
            position_bias = self.compute_bias(tgt_len, tgt_len)
            # Repeat the per-head bias for every batch element and flatten batch and heads.
            position_bias = (
                position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, tgt_len)
            )

        # Compute relative position bias:
        # 1) get reshape hidden_states: split the last axis into (num_heads, head_dim) and
        #    move the head axis in front of the sequence axis
        gated_hidden_states = hidden_states.view(hidden_states.shape[:-1] + (self.num_heads, -1))
        gated_hidden_states = gated_hidden_states.permute(0, 2, 1, 3)

        # 2) project hidden states: 8 outputs, viewed as 2 groups of 4 and summed per group
        relative_position_proj = self.gru_rel_pos_linear(gated_hidden_states)
        relative_position_proj = relative_position_proj.view(gated_hidden_states.shape[:-1] + (2, 4)).sum(-1)

        # 3) compute gate for position bias from projected hidden states
        gate_a, gate_b = torch.sigmoid(relative_position_proj).chunk(2, dim=-1)
        gate_output = gate_a * (gate_b * self.gru_rel_pos_const - 1.0) + 2.0

        # 4) apply gate to position bias to compute gated position_bias
        gated_position_bias = gate_output.view(bsz * self.num_heads, -1, 1) * position_bias
        gated_position_bias = gated_position_bias.view((-1, tgt_len, tgt_len))

        attn_output, attn_weights = self.torch_multi_head_self_attention(
            hidden_states, attention_mask, gated_position_bias, output_attentions
        )

        return attn_output, attn_weights, position_bias

    def torch_multi_head_self_attention(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Union[torch.LongTensor, torch.BoolTensor],
        gated_position_bias: torch.FloatTensor,
        output_attentions: bool,
    ) -> (torch.FloatTensor, torch.FloatTensor):
        """simple wrapper around torch's multi_head_attention_forward function"""
        # self-attention assumes q = k = v
        query = key = value = hidden_states.transpose(0, 1)
        # True wherever the attention mask is not 1; None when no mask was given.
        key_padding_mask = attention_mask.ne(1) if attention_mask is not None else None

        # disable bias and add_zero_attn
        bias_k = bias_v = None
        add_zero_attn = False

        # PyTorch 1.3.0 has F.multi_head_attention_forward defined
        # so no problem with backwards compatibility
        # The arguments below are positional: `output_attentions` lands in the `need_weights`
        # slot and `gated_position_bias` in the `attn_mask` slot.
        attn_output, attn_weights = F.multi_head_attention_forward(
            query,
            key,
            value,
            self.embed_dim,
            self.num_heads,
            # Empty placeholder for the fused in-projection weight; separate q/k/v weights
            # are supplied below via `use_separate_proj_weight=True`.
            torch.empty([0]),
            # The three projection biases concatenated in q, k, v order.
            torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
            bias_k,
            bias_v,
            add_zero_attn,
            self.dropout,
            self.out_proj.weight,
            self.out_proj.bias,
            self.training,
            key_padding_mask,
            output_attentions,
            gated_position_bias,
            use_separate_proj_weight=True,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
        )

        # [Seq_Len, Batch Size, ...] -> [Batch Size, Seq_Len, ...]
        attn_output = attn_output.transpose(0, 1)

        if attn_weights is not None:
            # IMPORTANT: Attention weights are averaged weights
            # here which should not be the case. This is an open issue
            # on PyTorch: https://github.com/pytorch/pytorch/issues/32590
            # Insert a head axis and broadcast the same weights to every head.
            attn_weights = attn_weights[:, None].broadcast_to(
                attn_weights.shape[:1] + (self.num_heads,) + attn_weights.shape[1:]
            )

        return attn_output, attn_weights

    def compute_bias(self, query_length: int, key_length: int) -> torch.FloatTensor:
        """Return the relative position bias with axes ordered (num_heads, query, key).

        Requires `self.rel_attn_embed`, which is only created when the layer was built with
        `has_relative_position_bias=True`.
        """
        context_position = torch.arange(query_length, dtype=torch.long)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
        # Entry [i, j] is the signed offset j - i from query position i to key position j.
        relative_position = memory_position - context_position
        relative_position_bucket = self._relative_positions_bucket(relative_position)
        # Move the bucket indices to the device that holds the embedding table.
        relative_position_bucket = relative_position_bucket.to(self.rel_attn_embed.weight.device)
        values = self.rel_attn_embed(relative_position_bucket)
        # (query, key, num_heads) -> (num_heads, query, key)
        values = values.permute([2, 0, 1])
        return values

    def _relative_positions_bucket(self, relative_positions: torch.FloatTensor) -> torch.FloatTensor:
        """Map signed relative positions to bucket indices in `[0, self.num_buckets)`.

        Half of the buckets is used for positive offsets and half for the rest. Within each
        half, absolute offsets below `max_exact` get their own bucket and larger offsets are
        binned logarithmically, saturating at the last bucket of the half.
        """
        num_buckets = self.num_buckets // 2

        # Positive offsets use the upper half of the bucket range.
        relative_buckets = (relative_positions > 0).to(torch.long) * num_buckets
        relative_positions = torch.abs(relative_positions)

        max_exact = num_buckets // 2
        is_small = relative_positions < max_exact

        # Logarithmic binning for large offsets. For an offset of 0 the log is -inf, but that
        # entry is never selected because `is_small` is True there.
        relative_positions_if_large = torch.log(relative_positions.float() / max_exact)
        relative_positions_if_large = relative_positions_if_large / math.log(self.max_distance / max_exact)
        relative_positions_if_large = relative_positions_if_large * (num_buckets - max_exact)
        relative_position_if_large = (max_exact + relative_positions_if_large).to(torch.long)
        # Clamp to the last bucket of the half.
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_positions, relative_position_if_large)
        return relative_buckets
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->WavLM
class WavLMFeedForward(nn.Module):
    """Position-wise feed-forward block: expand, activate, drop out, project back, drop out."""

    def __init__(self, config):
        super().__init__()
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)
        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)

        # `hidden_act` may be the name of a registered activation or a callable.
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        """Apply the block to `hidden_states` and return the result."""
        expanded = self.intermediate_act_fn(self.intermediate_dense(hidden_states))
        expanded = self.intermediate_dropout(expanded)
        return self.output_dropout(self.output_dense(expanded))
class WavLMEncoderLayer(nn.Module):
    """Encoder layer that applies layer normalization after each residual sum."""

    def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True):
        super().__init__()
        attention_kwargs = {
            "embed_dim": config.hidden_size,
            "num_heads": config.num_attention_heads,
            "dropout": config.attention_dropout,
            "num_buckets": config.num_buckets,
            "max_distance": config.max_bucket_distance,
            "has_relative_position_bias": has_relative_position_bias,
        }
        self.attention = WavLMAttention(**attention_kwargs)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = WavLMFeedForward(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
        """Run attention and feed-forward sub-blocks.

        Returns `(hidden_states, position_bias)`, with the attention weights appended as a
        third element when `output_attentions` is truthy.
        """
        residual = hidden_states
        attended, attn_weights, position_bias = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
            index=index,
        )

        # Attention sub-block: dropout, residual sum, then layer norm.
        hidden_states = self.layer_norm(residual + self.dropout(attended))

        # Feed-forward sub-block: residual sum, then the final layer norm.
        hidden_states = self.final_layer_norm(hidden_states + self.feed_forward(hidden_states))

        if output_attentions:
            return (hidden_states, position_bias, attn_weights)
        return (hidden_states, position_bias)
class WavLMEncoderLayerStableLayerNorm(nn.Module):
    """Pre-norm encoder layer: LayerNorm is applied before attention and before the feed-forward block."""

    def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True):
        super().__init__()
        width, norm_eps = config.hidden_size, config.layer_norm_eps

        # Attention module, configured with the relative-position bucket settings from the config.
        self.attention = WavLMAttention(
            embed_dim=width,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            num_buckets=config.num_buckets,
            max_distance=config.max_bucket_distance,
            has_relative_position_bias=has_relative_position_bias,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        # Applied to the input of the attention module.
        self.layer_norm = nn.LayerNorm(width, eps=norm_eps)
        self.feed_forward = WavLMFeedForward(config)
        # Applied to the input of the feed-forward block.
        self.final_layer_norm = nn.LayerNorm(width, eps=norm_eps)

    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
        """Run one layer.

        Returns `(hidden_states, position_bias)`, with the attention weights appended as a third
        element when `output_attentions` is truthy.
        """
        normed = self.layer_norm(hidden_states)
        attended, attn_weights, position_bias = self.attention(
            normed,
            attention_mask=attention_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
        )

        # The residual branches carry the un-normalised activations.
        hidden_states = hidden_states + self.dropout(attended)
        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        if output_attentions:
            return (hidden_states, position_bias, attn_weights)
        return (hidden_states, position_bias)
# Encoder stack built from `WavLMEncoderLayer` (post-norm layers).
class WavLMEncoder(nn.Module):
    """Convolutional positional embedding followed by a stack of `WavLMEncoderLayer`s.

    Only the first layer is created with `has_relative_position_bias=True`; the `position_bias`
    each layer returns is passed on to the next layer.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.pos_conv_embed = WavLMPositionalConvEmbedding(config)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layers = nn.ModuleList(
            [WavLMEncoderLayer(config, has_relative_position_bias=(i == 0)) for i in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Encode `hidden_states`.

        Args:
            hidden_states: input features; modified in place when `attention_mask` is given.
            attention_mask: optional boolean mask; positions where it is False are zeroed.
            output_attentions: collect one attention entry per layer (None for dropped layers).
            output_hidden_states: collect the hidden states before every layer and after the last.
            return_dict: return a `BaseModelOutput` instead of a plain tuple.

        Returns:
            A `BaseModelOutput`, or a tuple of the non-None values among
            `(hidden_states, all_hidden_states, all_self_attentions)`.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            # Zero the positions the mask marks as padding (mask is False there).
            hidden_states[~attention_mask] = 0.0

        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
        position_bias = None

        for i, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # LayerDrop: during training every layer except the first may be dropped at random.
            dropout_probability = torch.rand([])
            skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)

            # With DeepSpeed ZeRO-3 enabled the layer is executed even when selected for dropping.
            if not skip_the_layer or deepspeed_zero3_is_enabled:
                if self.gradient_checkpointing and self.training:
                    # NOTE(review): `_gradient_checkpointing_func` is not defined in this class;
                    # presumably it is attached when gradient checkpointing is enabled - confirm.
                    layer_outputs = self._gradient_checkpointing_func(
                        layer.__call__,
                        hidden_states,
                        attention_mask,
                        position_bias,
                        output_attentions,
                    )
                else:
                    layer_outputs = layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        position_bias=position_bias,
                        output_attentions=output_attentions,
                        index=i,
                    )
                hidden_states, position_bias = layer_outputs[:2]

            if skip_the_layer:
                # Three slots so the `layer_outputs[2]` lookup below cannot raise IndexError
                # when a layer is dropped while attentions are being collected.
                layer_outputs = (None, None, None)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
# Encoder stack built from `WavLMEncoderLayerStableLayerNorm` (pre-norm layers).
class WavLMEncoderStableLayerNorm(nn.Module):
    """Convolutional positional embedding, a stack of pre-norm layers, and a final LayerNorm.

    Only the first layer is created with `has_relative_position_bias=True`; the `position_bias`
    each layer returns is passed on to the next layer.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.pos_conv_embed = WavLMPositionalConvEmbedding(config)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layers = nn.ModuleList(
            [
                WavLMEncoderLayerStableLayerNorm(config, has_relative_position_bias=(i == 0))
                for i in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Encode `hidden_states`.

        Args:
            hidden_states: input features; modified in place when `attention_mask` is given.
            attention_mask: optional boolean mask; positions where it is False are zeroed.
            output_attentions: collect one attention entry per layer (None for dropped layers).
            output_hidden_states: collect the hidden states before every layer, plus the
                final normalised output.
            return_dict: return a `BaseModelOutput` instead of a plain tuple.

        Returns:
            A `BaseModelOutput`, or a tuple of the non-None values among
            `(hidden_states, all_hidden_states, all_self_attentions)`.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            # Zero the positions the mask marks as padding (mask is False there).
            hidden_states[~attention_mask] = 0

        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        hidden_states = self.dropout(hidden_states)

        deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
        position_bias = None

        for i, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # LayerDrop (see https://arxiv.org/abs/1909.11556): during training every layer
            # except the first may be dropped at random.
            dropout_probability = torch.rand([])
            skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)

            # Under DeepSpeed ZeRO-3 all GPUs must run in sync, so the layer is executed even
            # when it was selected for dropping.
            if not skip_the_layer or deepspeed_zero3_is_enabled:
                if self.gradient_checkpointing and self.training:
                    # NOTE(review): `_gradient_checkpointing_func` is not defined in this class;
                    # presumably it is attached when gradient checkpointing is enabled - confirm.
                    layer_outputs = self._gradient_checkpointing_func(
                        layer.__call__,
                        hidden_states,
                        attention_mask,
                        position_bias,
                        output_attentions,
                    )
                else:
                    layer_outputs = layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        output_attentions=output_attentions,
                        position_bias=position_bias,
                    )
                hidden_states, position_bias = layer_outputs[:2]

            if skip_the_layer:
                # Three slots so the `layer_outputs[2]` lookup below cannot raise IndexError
                # when a layer is dropped while attentions are being collected.
                layer_outputs = (None, None, None)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[2],)

        # Pre-norm layers leave the residual stream un-normalised; normalise it once at the end.
        hidden_states = self.layer_norm(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
        )
"""
使用 Gumbel softmax 进行向量量化。参见[CATEGORICAL REPARAMETERIZATION WITH
GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf)获取更多信息。
"""
def __init__(self, config):
super().__init__()
self.num_groups = config.num_codevector_groups # 设置编码向量组数
self.num_vars = config.num_codevectors_per_group # 每组编码向量的数量
if config.codevector_dim % self.num_groups != 0:
raise ValueError(
f"`config.codevector_dim {config.codevector_dim} must be divisible"
f" by `config.num_codevector_groups` {self.num_groups} "
"for concatenation."
)
# 存储码本变量(码字)
self.codevectors = nn.Parameter(
torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
)
self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) # 权重投影层
# 可以在训练中进行衰减
self.temperature = 2
@staticmethod
def _compute_perplexity(probs):
"""
计算困惑度函数。
Args:
probs (torch.Tensor): 概率分布张量
Returns:
torch.Tensor: 计算得到的困惑度值
"""
marginal_probs = probs.mean(dim=0) # 计算边际概率
perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum() # 计算困惑度
return perplexity
def forward(self, hidden_states):
# 获取输入张量的批大小、序列长度和隐藏单元大小
batch_size, sequence_length, hidden_size = hidden_states.shape
# 将隐藏状态投影到代码向量维度
hidden_states = self.weight_proj(hidden_states)
# 将张量形状重新视图为(batch_size * sequence_length * num_groups, -1)
hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
if self.training:
# 使用Gumbel Softmax采样代码向量的概率,以可区分的方式
codevector_probs = nn.functional.gumbel_softmax(hidden_states.float(), tau=self.temperature, hard=True)
codevector_probs = codevector_probs.type_as(hidden_states)
# 计算困惑度
codevector_soft_dist = torch.softmax(
hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
)
perplexity = self._compute_perplexity(codevector_soft_dist)
else:
# 在非可区分的方式下取argmax
# 计算硬代码向量分布(one hot)
codevector_idx = hidden_states.argmax(dim=-1)
codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_(
-1, codevector_idx.view(-1, 1), 1.0
)
codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
# 计算困惑度
perplexity = self._compute_perplexity(codevector_probs)
# 将codevector_probs形状重新视图为(batch_size * sequence_length, -1)
codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
# 使用概率检索代码向量
codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
# 返回最终的codevectors和困惑度
return codevectors, perplexity
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Adapter with Wav2Vec2->WavLM
class WavLMAdapter(nn.Module):
    """Optional projection + LayerNorm followed by a stack of `WavLMAdapterLayer`s with LayerDrop."""

    def __init__(self, config):
        super().__init__()

        # A projection is only needed when the adapter width differs from the encoder width.
        needs_projection = config.output_hidden_size != config.hidden_size
        self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) if needs_projection else None
        self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) if needs_projection else None

        self.layers = nn.ModuleList([WavLMAdapterLayer(config) for _ in range(config.num_adapter_layers)])
        self.layerdrop = config.layerdrop

    def forward(self, hidden_states):
        """Project (if configured), then run the adapter layers on the dim-1/dim-2 transposed input."""
        if not (self.proj is None or self.proj_layer_norm is None):
            hidden_states = self.proj_layer_norm(self.proj(hidden_states))

        # The adapter layers operate on the tensor with dims 1 and 2 swapped.
        hidden_states = hidden_states.transpose(1, 2)

        for adapter_layer in self.layers:
            # One random draw per layer, taken in both training and evaluation mode.
            draw = np.random.random()
            # In training mode a layer is dropped unless its draw exceeds `layerdrop`.
            if self.training and not (draw > self.layerdrop):
                continue
            hidden_states = adapter_layer(hidden_states)

        # Swap dims 1 and 2 back before returning.
        return hidden_states.transpose(1, 2)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AdapterLayer with Wav2Vec2->WavLM
class WavLMAdapterLayer(nn.Module):
    """Single adapter layer: a strided 1-D convolution followed by a GLU over the channel dim."""

    def __init__(self, config):
        super().__init__()
        channels = config.output_hidden_size
        # The convolution doubles the channel count; the GLU in `forward` halves it again.
        self.conv = nn.Conv1d(
            channels,
            2 * channels,
            config.adapter_kernel_size,
            stride=config.adapter_stride,
            padding=1,
        )

    def forward(self, hidden_states):
        """Return `glu(conv(hidden_states))`, gating along dim 1."""
        return nn.functional.glu(self.conv(hidden_states), dim=1)
class WavLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = WavLMConfig
    base_model_prefix = "wavlm"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        # The checks run most-specific first; the first match wins.
        if isinstance(module, WavLMGumbelVectorQuantizer):
            # Quantizer: unit-normal projection weights, zero bias, uniform codebook.
            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
            module.weight_proj.bias.data.zero_()
            nn.init.uniform_(module.codevectors)
            return

        if isinstance(module, WavLMPositionalConvEmbedding):
            # Positional conv: normal weights scaled by kernel size and input channels, zero bias.
            conv = module.conv
            nn.init.normal_(
                conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (conv.kernel_size[0] * conv.in_channels)),
            )
            nn.init.constant_(conv.bias, 0)
            return

        if isinstance(module, WavLMFeatureProjection):
            # Feature projection: uniform in [-k, k] with k = sqrt(1 / in_features).
            bound = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-bound, b=bound)
            nn.init.uniform_(module.projection.bias, a=-bound, b=bound)
            return

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                bound = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-bound, b=bound)

    def _get_feat_extract_output_lengths(
        self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
    ):
        """
        Computes the output length of the convolutional layers
        """
        if add_adapter is None:
            add_adapter = self.config.add_adapter

        # (kernel_size, stride) of every convolution the lengths pass through, in order.
        conv_specs = list(zip(self.config.conv_kernel, self.config.conv_stride))
        if add_adapter:
            # Adapter layers are accounted for with kernel size 1 and the adapter stride.
            conv_specs.extend((1, self.config.adapter_stride) for _ in range(self.config.num_adapter_layers))

        for kernel_size, stride in conv_specs:
            # 1-D convolution output-length formula from the PyTorch documentation.
            input_lengths = torch.div(input_lengths - kernel_size, stride, rounding_mode="floor") + 1

        return input_lengths

    def _get_feature_vector_attention_mask(
        self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
    ):
        """Compute attention mask for feature vectors"""
        # Equivalent to attention_mask.sum(-1), but not an in-place operation, so it can run
        # in inference mode.
        sample_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        frame_lengths = self._get_feat_extract_output_lengths(sample_lengths, add_adapter=add_adapter)
        frame_lengths = frame_lengths.to(torch.long)

        num_rows = attention_mask.shape[0]
        frame_mask = torch.zeros(
            (num_rows, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )

        # Mark the last valid frame of every row ...
        row_index = torch.arange(num_rows, device=frame_mask.device)
        frame_mask[(row_index, frame_lengths - 1)] = 1
        # ... then a reversed cumulative sum fills every position up to and including it.
        return frame_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
# WAVLM_START_DOCSTRING: general description of the WavLM model, including the paper that introduced it.
WAVLM_START_DOCSTRING = r"""
WavLM was proposed in [WavLM: Unified Speech Representation Learning with Labeled and Unlabeled
Data](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo
Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian,
Jian Wu, Michael Zeng, Xiangzhan Yu, Furu Wei.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving etc.).
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`WavLMConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# WAVLM_INPUTS_DOCSTRING: description of the shared `forward` arguments. It is attached to the
# models' `forward` methods through `add_start_docstrings_to_model_forward`.
WAVLM_INPUTS_DOCSTRING = r"""
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of the input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav`
            audio file into an array of type `List[float]` or a `numpy.ndarray`. Use [`AutoProcessor`] for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for
            details.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
            `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            <Tip warning={true}>

            `attention_mask` should only be passed if the corresponding processor has
            `config.return_attention_mask == True`. For all models whose processor has
            `config.return_attention_mask == False`, `attention_mask` should **not** be passed to avoid degraded
            performance when doing batched inference. For such models `input_values` should simply be padded with 0
            and passed without `attention_mask`. Be aware that these models also yield slightly different results
            depending on whether `input_values` is padded or not.

            </Tip>

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare WavLM Model transformer outputting raw hidden-states without any specific head on top.",
    WAVLM_START_DOCSTRING,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model with Wav2Vec2Model->WavLMModel, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM, WavLMBaseModelOutput->Wav2Vec2BaseModelOutput
class WavLMModel(WavLMPreTrainedModel):
    def __init__(self, config: WavLMConfig):
        super().__init__(config)
        self.config = config
        self.feature_extractor = WavLMFeatureEncoder(config)
        self.feature_projection = WavLMFeatureProjection(config)

        # The mask embedding is only created when time or feature masking is configured.
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())

        # Pre-norm ("stable layer norm") encoder or the default post-norm encoder.
        if config.do_stable_layer_norm:
            self.encoder = WavLMEncoderStableLayerNorm(config)
        else:
            self.encoder = WavLMEncoder(config)

        self.adapter = WavLMAdapter(config) if config.add_adapter else None

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters
        will not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters
        will not be updated during training.
        """
        self.feature_extractor._freeze_parameters()

    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            # No indices were given: sample them from the configured probability (training only).
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            # The same feature columns are masked at every time step of a sample.
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states

    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Wav2Vec2BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
        # Fall back to the config defaults for any output flag that was not given.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        extract_features = self.feature_extractor(input_values)
        # Swap dims 1 and 2 of the extracted features before projecting them.
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(
                extract_features.shape[1], attention_mask, add_adapter=False
            )

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = encoder_outputs[0]

        if self.adapter is not None:
            hidden_states = self.adapter(hidden_states)

        if not return_dict:
            return (hidden_states, extract_features) + encoder_outputs[1:]

        return Wav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
@add_start_docstrings(
    """WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
    WAVLM_START_DOCSTRING,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with the class and symbol names
# replaced by their WavLM counterparts.
class WavLMForCTC(WavLMPreTrainedModel):
    def __init__(self, config, target_lang: Optional[str] = None):
        super().__init__(config)

        self.wavlm = WavLMModel(config)
        self.dropout = nn.Dropout(config.final_dropout)

        # Read later by `tie_weights` to decide whether an adapter has to be loaded.
        self.target_lang = target_lang

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that "
                "does not define the vocabulary size of the language model head. Please "
                "instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
                "or define `vocab_size` of your model's configuration."
            )
        # With an adapter the base model emits `output_hidden_size` features, otherwise `hidden_size`.
        output_hidden_size = (
            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
        )
        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def tie_weights(self):
        """
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        """

        # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is
        # re-purposed here to correctly load adapter layers so that no new API has to be introduced on
        # `PreTrainedModel`. While slightly hacky, this model never has to tie input and output embeddings, so
        # re-purposing the function here is acceptable.
        target_lang = self.target_lang

        if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
            raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
        elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
            logger.info("By default `target_lang` is set to 'eng'.")
        elif target_lang is not None:
            self.load_adapter(target_lang, force_load=True)

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters
        will not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters
        will not be updated during training.
        """
        self.wavlm.feature_extractor._freeze_parameters()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will
        not be updated during training. Only the classification head will be updated.
        """
        for param in self.wavlm.parameters():
            param.requires_grad = False

    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_CTC_EXPECTED_OUTPUT,
        expected_loss=_CTC_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, CausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.wavlm(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)

        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Valid label ids are 0 .. vocab_size - 1, so anything >= vocab_size is rejected.
            if labels.max() >= self.config.vocab_size:
                raise ValueError(f"Label values must be < vocab_size: {self.config.vocab_size}")

            # retrieve loss input_lengths from attention_mask
            attention_mask = (
                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
            )
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

            # assuming that padded tokens are filled with -100
            # when not being attended to
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            # The log-probabilities are computed in float32; dims 0 and 1 are swapped for `ctc_loss`.
            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            # cuDNN is switched off for the CTC loss call.
            with torch.backends.cudnn.flags(enabled=False):
                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
# WavLM model with a sequence classification head on top, for tasks such as SUPERB Keyword Spotting.
@add_start_docstrings(
    """
WavLM Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
SUPERB Keyword Spotting.
""",
    WAVLM_START_DOCSTRING,
)
class WavLMForSequenceClassification(WavLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # This head refuses configurations that enable adapters.
        if getattr(config, "add_adapter", False):
            raise ValueError(
                "Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)"
            )

        self.wavlm = WavLMModel(config)

        # One entry per transformer layer plus one for the input embeddings.
        num_layers = config.num_hidden_layers + 1
        if config.use_weighted_layer_sum:
            # Learnable per-layer mixing weights, initialised uniformly.
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)

        # hidden_size -> classifier_proj_size -> num_labels
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

        # Initialize weights and apply final processing.
        self.post_init()

    # Adapted from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_extractor
    def freeze_feature_extractor(self):
        """
        Deprecated alias of `freeze_feature_encoder`: emits a `FutureWarning` and then delegates to it.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    # Adapted from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_encoder
    def freeze_feature_encoder(self):
        """
        Disable gradient computation for the feature encoder so that its parameters are not updated during
        training.
        """
        self.wavlm.feature_extractor._freeze_parameters()

    # Adapted from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_base_model
    def freeze_base_model(self):
        """
        Disable gradient computation for every parameter of the base model (`self.wavlm`). Parameters outside of
        `self.wavlm` are left untouched.
        """
        for base_param in self.wavlm.parameters():
            base_param.requires_grad = False

    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
    )
    # Adapted from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->WavLM, wav2vec2->wavlm
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. When provided, a cross-entropy loss over the logits is computed.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        use_layer_sum = self.config.use_weighted_layer_sum
        if use_layer_sum:
            # The weighted sum needs every layer's hidden states, whatever the caller asked for.
            output_hidden_states = True

        outputs = self.wavlm(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if use_layer_sum:
            # Stack the per-layer states on a new axis 1 and mix them with softmax-normalised weights.
            stacked_states = torch.stack(outputs[_HIDDEN_STATES_START_POSITION], dim=1)
            mixing = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (stacked_states * mixing.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)

        if attention_mask is None:
            # No mask: plain mean over axis 1.
            pooled_output = hidden_states.mean(dim=1)
        else:
            # Masked mean: zero the positions the feature-level mask marks as padding, then divide the sum
            # by the number of kept positions of each sample.
            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            hidden_states[~padding_mask] = 0.0
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            criterion = CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.config.num_labels), labels.view(-1))

        if not return_dict:
            # Tuple output: logits followed by the extra base-model outputs, with the loss prepended if any.
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return output if loss is None else (loss,) + output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# WavLM model with a frame classification head on top, for tasks such as Speaker Diarization.
@add_start_docstrings(
    """
WavLM Model with a frame classification head on top for tasks like Speaker Diarization.
""",
    WAVLM_START_DOCSTRING,
)
# Adapted from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification with Wav2Vec2->WavLM, wav2vec2->wavlm, WAV_2_VEC_2->WAVLM
class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # This head refuses configurations that enable adapters.
        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)"
            )

        self.wavlm = WavLMModel(config)

        # One entry per transformer layer plus one for the input embeddings.
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.use_weighted_layer_sum:
            # Learnable per-layer mixing weights, initialised uniformly.
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)

        # Classifier applied directly to the hidden states: hidden_size -> num_labels.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels

        self.init_weights()

    def freeze_feature_extractor(self):
        """
        Deprecated alias of `freeze_feature_encoder`: emits a `FutureWarning` and then delegates to it.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.wavlm.feature_extractor._freeze_parameters()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wavlm.parameters():
            param.requires_grad = False

    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_FRAME_CLASS_CHECKPOINT,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_FRAME_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.Tensor`, *optional*):
            Frame-level targets used to compute a cross-entropy loss. They are reshaped to
            `(-1, config.num_labels)` and reduced with `argmax` over the last dimension to obtain one class index
            per row, so one score (e.g. one-hot) vector of size `config.num_labels` per frame is expected.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted layer sum needs every layer's hidden states, whatever the caller asked for.
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        outputs = self.wavlm(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            # Stack the per-layer states on a new axis 1 and mix them with softmax-normalised weights.
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Turn the per-frame label vectors into class indices before computing the loss.
            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            # Prepend the loss when it was computed, as the other heads in this file do; returning only
            # `output` would silently discard it.
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Adapted from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss
class AMSoftmaxLoss(nn.Module):
    """
    Cross-entropy over scaled cosine similarities, with a margin subtracted from the target-class similarity.

    Args:
        input_dim: Size of each input vector (number of rows of the weight matrix).
        num_labels: Number of classes (number of columns of the weight matrix).
        scale: Factor applied to the similarities before the cross-entropy.
        margin: Value subtracted from the similarity of the target class.
    """

    def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
        super().__init__()
        self.scale = scale
        self.margin = margin
        self.num_labels = num_labels
        # One randomly initialised column per class.
        self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, hidden_states, labels):
        """Return the loss for `hidden_states` (2-D, one row per sample) and `labels` (flattened to 1-D)."""
        targets = labels.flatten()

        # L2-normalise the class columns and the sample rows so their product is a cosine similarity.
        unit_weight = nn.functional.normalize(self.weight, dim=0)
        unit_states = nn.functional.normalize(hidden_states, dim=1)
        cosine = torch.mm(unit_states, unit_weight)

        # Subtract the margin only at each sample's target class.
        is_target = nn.functional.one_hot(targets, self.num_labels).bool()
        logits = self.scale * torch.where(is_target, cosine - self.margin, cosine)

        return self.loss(logits, targets)
# Adapted from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer
class TDNNLayer(nn.Module):
    """
    One TDNN layer: a dilated 1-D convolution followed by ReLU, with its parameters stored in an `nn.Linear`.

    Args:
        config: Object exposing the sequences `tdnn_dim`, `tdnn_kernel` and `tdnn_dilation`.
        layer_id: Index of this layer in those sequences.
    """

    def __init__(self, config, layer_id=0):
        super().__init__()
        # The input width is the previous layer's output width; layer 0 uses its own width.
        source_id = layer_id - 1 if layer_id > 0 else layer_id
        self.in_conv_dim = config.tdnn_dim[source_id]
        self.out_conv_dim = config.tdnn_dim[layer_id]
        self.kernel_size = config.tdnn_kernel[layer_id]
        self.dilation = config.tdnn_dilation[layer_id]

        # Parameters are held by a linear layer over a flattened window of `kernel_size` input frames.
        self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
        self.activation = nn.ReLU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if is_peft_available():
            from peft.tuners.lora import LoraLayer

            # The convolution below reads `kernel.weight` / `kernel.bias` directly instead of calling
            # `self.kernel`, so a LoRA wrapper around the linear layer is bypassed.
            if isinstance(self.kernel, LoraLayer):
                warnings.warn(
                    "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
                    "You should exclude TDNNLayer from LoRA's target modules.",
                )

        # For backward compatibility the parameters stay in nn.Linear, but the computation uses conv1d for speed.
        conv_weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
        conv_input = hidden_states.transpose(1, 2)
        conv_output = nn.functional.conv1d(conv_input, conv_weight, self.kernel.bias, dilation=self.dilation)
        return self.activation(conv_output.transpose(1, 2))
@add_start_docstrings(
    """
WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
""",
    WAVLM_START_DOCSTRING,
)
# Adapted from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector with Wav2Vec2->WavLM, wav2vec2->wavlm
class WavLMForXVector(WavLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.wavlm = WavLMModel(config)

        # One entry per transformer layer plus one for the input embeddings.
        num_layers = config.num_hidden_layers + 1
        if config.use_weighted_layer_sum:
            # Learnable per-layer mixing weights, initialised uniformly.
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)

        # hidden_size -> tdnn_dim[0], followed by one TDNN layer per entry of `tdnn_dim`.
        self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
        self.tdnn = nn.ModuleList([TDNNLayer(config, layer_id) for layer_id in range(len(config.tdnn_dim))])

        # The pooled vector concatenates mean and std, hence twice the last TDNN width.
        self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
        self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)

        self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)

        self.init_weights()

    def freeze_feature_extractor(self):
        """
        Deprecated alias of `freeze_feature_encoder`: emits a `FutureWarning` and then delegates to it.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Disable gradient computation for the feature encoder so that its parameters are not updated during
        training.
        """
        self.wavlm.feature_extractor._freeze_parameters()

    def freeze_base_model(self):
        """
        Disable gradient computation for every parameter of the base model (`self.wavlm`). Parameters outside of
        `self.wavlm` are left untouched.
        """
        for base_param in self.wavlm.parameters():
            base_param.requires_grad = False

    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the TDNN layers
        """
        # 1D convolutional layer output length formula taken
        # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html, applied with stride 1.
        # NOTE(review): the formula uses only the kernel size, while TDNNLayer convolves with
        # `config.tdnn_dilation`; confirm whether dilation should be accounted for here.
        stride = 1
        for kernel_size in self.config.tdnn_kernel:
            input_lengths = (input_lengths - kernel_size) // stride + 1
        return input_lengths

    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_XVECTOR_CHECKPOINT,
        output_type=XVectorOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_XVECTOR_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, XVectorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Class indices in `[0, ..., config.num_labels - 1]`. When provided, the loss is computed by the
            `AMSoftmaxLoss` objective from the logits and these labels.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        use_layer_sum = self.config.use_weighted_layer_sum
        if use_layer_sum:
            # The weighted sum needs every layer's hidden states, whatever the caller asked for.
            output_hidden_states = True

        outputs = self.wavlm(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if use_layer_sum:
            # Stack the per-layer states on a new axis 1 and mix them with softmax-normalised weights.
            stacked_states = torch.stack(outputs[_HIDDEN_STATES_START_POSITION], dim=1)
            mixing = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (stacked_states * mixing.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)
        for tdnn_layer in self.tdnn:
            hidden_states = tdnn_layer(hidden_states)

        # Statistic pooling: mean and standard deviation over axis 1 of each sample.
        if attention_mask is None:
            mean_features = hidden_states.mean(dim=1)
            std_features = hidden_states.std(dim=1)
        else:
            # Pool each sample only over its first `length` frames, as derived from the attention mask.
            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
            per_sample_means = []
            per_sample_stds = []
            for sample_idx, length in enumerate(tdnn_output_lengths):
                frames = hidden_states[sample_idx, :length]
                per_sample_means.append(frames.mean(dim=0))
                per_sample_stds.append(frames.std(dim=0))
            mean_features = torch.stack(per_sample_means)
            std_features = torch.stack(per_sample_stds)
        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)

        output_embeddings = self.feature_extractor(statistic_pooling)
        logits = self.classifier(output_embeddings)

        loss = None if labels is None else self.objective(logits, labels)

        if not return_dict:
            # Tuple output: logits and embeddings, then the extra base-model outputs; loss prepended if any.
            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
            return output if loss is None else (loss,) + output

        return XVectorOutput(
            loss=loss,
            logits=logits,
            embeddings=output_embeddings,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
.\models\wavlm\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
# Submodule name -> public names it provides. Handed to `_LazyModule` at the bottom of the file.
_import_structure = {
    "configuration_wavlm": ["WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "WavLMConfig"],
}

# The modeling names are only registered when torch is available.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_wavlm"] = [
        "WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "WavLMForAudioFrameClassification",
        "WavLMForCTC",
        "WavLMForSequenceClassification",
        "WavLMForXVector",
        "WavLMModel",
        "WavLMPreTrainedModel",
    ]

if TYPE_CHECKING:
    # Real imports so that static type checkers see the names.
    from .configuration_wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_wavlm import (
            WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST,
            WavLMForAudioFrameClassification,
            WavLMForCTC,
            WavLMForSequenceClassification,
            WavLMForXVector,
            WavLMModel,
            WavLMPreTrainedModel,
        )
else:
    import sys

    # At runtime, replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\whisper\configuration_whisper.py
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
from ...utils import logging
if TYPE_CHECKING:
from ...feature_extraction_utils import FeatureExtractionMixin
from ...tokenization_utils_base import PreTrainedTokenizerBase
from ...utils import TensorType
logger = logging.get_logger(__name__)
# Maps a pretrained checkpoint name to the URL of its `config.json`.
WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/config.json",
}
# Sorted list of 88 token ids.
# NOTE(review): presumably the ids to suppress as "non speech" for the English-only vocabulary; this chunk
# does not show where the list is consumed, so confirm against the callers.
NON_SPEECH_TOKENS = [
1, 2, 7, 8, 9, 10, 14, 25,
26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
63, 90, 91, 92, 93, 357, 366, 438, 532, 685,
705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377,
1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211,
4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
34949, 40283, 40493, 40549, 47282, 49146, 50257, 50359, 50360, 50361
]
# Sorted list of 86 token ids; shares its first 23 entries with NON_SPEECH_TOKENS.
# NOTE(review): presumably the counterpart of NON_SPEECH_TOKENS for the multilingual vocabulary; confirm.
NON_SPEECH_TOKENS_MULTI = [
1, 2, 7, 8, 9, 10, 14, 25,
26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627,
3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647,
7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793,
14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675,
22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865,
42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362
]
class WhisperConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate a
    Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Whisper
    [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    rest of this class docstring for more information.

    How the constructor arguments are handled:

    - `pad_token_id`, `bos_token_id`, `eos_token_id`, `is_encoder_decoder`, `decoder_start_token_id`,
      `suppress_tokens`, `begin_suppress_tokens` and any extra `**kwargs` are forwarded to
      `PretrainedConfig.__init__`.
    - Every other named argument is stored unchanged as an attribute of the same name.
    - `num_hidden_layers` is additionally set to the value of `encoder_layers`.
    - Through `attribute_map`, `num_attention_heads` and `hidden_size` are aliases of
      `encoder_attention_heads` and `d_model`.
    """

    model_type = "whisper"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=51865,
        num_mel_bins=80,
        encoder_layers=4,
        encoder_attention_heads=6,
        decoder_layers=4,
        decoder_attention_heads=6,
        decoder_ffn_dim=1536,
        encoder_ffn_dim=1536,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        decoder_start_token_id=50257,
        use_cache=True,
        is_encoder_decoder=True,
        activation_function="gelu",
        d_model=384,
        dropout=0.0,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        scale_embedding=False,
        max_source_positions=1500,
        max_target_positions=448,
        pad_token_id=50256,
        bos_token_id=50256,
        eos_token_id=50256,
        suppress_tokens=None,
        begin_suppress_tokens=[220, 50256],
        use_weighted_layer_sum=False,
        classifier_proj_size=256,
        apply_spec_augment=False,
        mask_time_prob=0.05,
        mask_time_length=10,
        mask_time_min_masks=2,
        mask_feature_prob=0.0,
        mask_feature_length=10,
        mask_feature_min_masks=0,
        median_filter_width=7,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.num_mel_bins = num_mel_bins
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.encoder_ffn_dim = encoder_ffn_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.use_cache = use_cache
        # The generic "number of hidden layers" is the encoder depth.
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding
        self.max_source_positions = max_source_positions
        self.max_target_positions = max_target_positions

        self.classifier_proj_size = classifier_proj_size
        self.use_weighted_layer_sum = use_weighted_layer_sum

        self.apply_spec_augment = apply_spec_augment
        self.mask_time_prob = mask_time_prob
        self.mask_time_length = mask_time_length
        self.mask_time_min_masks = mask_time_min_masks
        self.mask_feature_prob = mask_feature_prob
        self.mask_feature_length = mask_feature_length
        self.mask_feature_min_masks = mask_feature_min_masks

        self.median_filter_width = median_filter_width

        # The signature default is a single list object shared by every call. Hand a copy to the base class
        # so that mutating one config's `begin_suppress_tokens` cannot leak into the default (or into a list
        # owned by the caller). `None` and non-list values are passed through untouched.
        if isinstance(begin_suppress_tokens, list):
            begin_suppress_tokens = list(begin_suppress_tokens)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            suppress_tokens=suppress_tokens,
            begin_suppress_tokens=begin_suppress_tokens,
            **kwargs,
        )
class WhisperOnnxConfig(OnnxSeq2SeqConfigWithPast):
    """ONNX export configuration for Whisper: input axis names, dummy inputs and validation tolerance."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from input name to its `{axis index: axis name}` description."""
        axes = OrderedDict()
        axes["input_features"] = {0: "batch", 1: "feature_size", 2: "encoder_sequence"}

        if self.use_past:
            # With a cache, the decoder ids only declare the batch axis and the past key/values are appended.
            axes["decoder_input_ids"] = {0: "batch"}
            self.fill_with_past_key_values_(axes, direction="inputs")
        else:
            axes["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}

        return axes

    def generate_dummy_inputs(
        self,
        preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional["TensorType"] = None,
        sampling_rate: int = 22050,
        time_duration: float = 5.0,
        frequency: int = 220,
    ) -> Mapping[str, Any]:
        """
        Build dummy export inputs from the preprocessor's `feature_extractor` and `tokenizer`.

        Returns an ordered mapping with `input_features`, `decoder_input_ids` and, when the decoder dummy
        inputs contain them, `past_key_values`.
        """
        # Encoder side: generated through the base `OnnxConfig` implementation with the feature extractor.
        encoder_inputs = OnnxConfig.generate_dummy_inputs(
            self,
            preprocessor=preprocessor.feature_extractor,
            batch_size=batch_size,
            framework=framework,
            sampling_rate=sampling_rate,
            time_duration=time_duration,
            frequency=frequency,
        )
        encoder_sequence_length = encoder_inputs["input_features"].shape[2]
        if self.use_past:
            # With a cache, the decoder length is tied to half of the encoder input length.
            seq_length = encoder_sequence_length // 2

        # Decoder side: generated by the parent class with the tokenizer.
        decoder_inputs = super().generate_dummy_inputs(
            preprocessor.tokenizer, batch_size, seq_length, is_pair, framework
        )

        dummy_inputs = OrderedDict()
        dummy_inputs["input_features"] = encoder_inputs.pop("input_features")
        dummy_inputs["decoder_input_ids"] = decoder_inputs.pop("decoder_input_ids")
        if "past_key_values" in decoder_inputs:
            dummy_inputs["past_key_values"] = decoder_inputs.pop("past_key_values")

        return dummy_inputs

    @property
    def atol_for_validation(self) -> float:
        """Absolute tolerance reported for validating the exported model."""
        return 1e-3
.\models\whisper\convert_openai_to_hf.py
"""Converts a Whisper model in OpenAI format to Hugging Face format."""
import argparse
import io
import json
import os
import tempfile
import urllib
import warnings
from typing import Any, Optional, Tuple
import torch
from huggingface_hub.utils import insecure_hashlib
from torch import nn
from tqdm import tqdm
from transformers import (
GenerationConfig,
WhisperConfig,
WhisperFeatureExtractor,
WhisperForConditionalGeneration,
WhisperProcessor,
WhisperTokenizer,
WhisperTokenizerFast,
)
from transformers.models.whisper.tokenization_whisper import LANGUAGES, bytes_to_unicode
from transformers.utils.import_utils import _is_package_available
# Checkpoint name -> download URL of the original OpenAI weights.
# `_download` reads the second-to-last path segment of each URL as the expected SHA256 of the file,
# so the hash embedded in the URL must match the hosted checkpoint.
_MODELS = {
"tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
"tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
"base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
"base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
"small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
"small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
"medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
"medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
"large": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large.pt",
"large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
"large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
}
# Tokenizer flavour -> URL of the corresponding `.tiktoken` vocabulary file in the openai/whisper repository.
_TOKENIZERS = {
"multilingual": "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken",
"english": "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/gpt2.tiktoken",
}
def _get_generation_config(
    is_multilingual: bool,
    num_languages: int = 100,
    openai_version: Optional[str] = None,
) -> GenerationConfig:
    """
    Loads the appropriate generation config from HF repo based on provided parameters.

    Args:
        is_multilingual (bool): Whether the model is multilingual. Ignored when `openai_version` is given.
        num_languages (int, optional): Number of languages of the model (default is 100). Only used for
            multilingual models without an `openai_version`: fewer than 100 selects the large-v2 config,
            otherwise large-v3.
        openai_version (Optional[str], optional): Name of the original OpenAI checkpoint; when given, the config
            is loaded from `openai/whisper-{openai_version}` (default is None).

    Returns:
        GenerationConfig: Config object for generation. When `openai_version` is None its `alignment_heads`
        attribute is reset to None and a warning is emitted.
    """
    if openai_version is not None:
        repo = f"openai/whisper-{openai_version}"
    elif not is_multilingual:
        repo = "openai/whisper-medium.en"
    elif num_languages < 100:
        repo = "openai/whisper-large-v2"
    else:
        repo = "openai/whisper-large-v3"

    gen_cfg = GenerationConfig.from_pretrained(repo)
    if openai_version is None:
        # The config was borrowed from a stock repo, so its alignment heads are dropped for this model.
        gen_cfg.alignment_heads = None
        # Adjacent string literals are concatenated verbatim: each fragment carries its own trailing space
        # so that sentences and the URL are not fused together in the emitted message.
        warnings.warn(
            "Alignment heads have not been included in the generation config, since they are available "
            "only for the original OpenAI checkpoints. "
            "If you want to use word-level timestamps with a custom version of Whisper, "
            "see https://github.com/openai/whisper/blob/main/notebooks/Multilingual_ASR.ipynb "
            "for the example of how to produce word-level timestamps manually."
        )

    return gen_cfg
def remove_ignore_keys_(state_dict):
    """
    Drop the top-level entries named "layers" and "blocks" from `state_dict`, in place.

    Args:
        state_dict (dict): Dictionary containing the model's state. Missing keys are ignored.

    Returns:
        None
    """
    for unwanted in ("layers", "blocks"):
        state_dict.pop(unwanted, None)
# Substring of an original parameter name -> replacement used in the converted model.
WHISPER_MAPPING = {
    "blocks": "layers",
    "mlp.0": "fc1",
    "mlp.2": "fc2",
    "mlp_ln": "final_layer_norm",
    ".attn.query": ".self_attn.q_proj",
    ".attn.key": ".self_attn.k_proj",
    ".attn.value": ".self_attn.v_proj",
    ".attn_ln": ".self_attn_layer_norm",
    ".attn.out": ".self_attn.out_proj",
    ".cross_attn.query": ".encoder_attn.q_proj",
    ".cross_attn.key": ".encoder_attn.k_proj",
    ".cross_attn.value": ".encoder_attn.v_proj",
    ".cross_attn_ln": ".encoder_attn_layer_norm",
    ".cross_attn.out": ".encoder_attn.out_proj",
    "decoder.ln.": "decoder.layer_norm.",
    "encoder.ln.": "encoder.layer_norm.",
    "token_embedding": "embed_tokens",
    "encoder.positional_embedding": "encoder.embed_positions.weight",
    "decoder.positional_embedding": "decoder.embed_positions.weight",
    "ln_post": "layer_norm",
}


def rename_keys(s_dict):
    """
    Rename the keys of `s_dict` in place according to `WHISPER_MAPPING` and print every `old -> new` pair.

    Args:
        s_dict (dict): Dictionary whose keys need to be renamed.

    Returns:
        dict: The same dictionary object, with renamed keys.
    """
    # Snapshot the keys first: the dictionary is modified while walking over them.
    for old_name in list(s_dict):
        renamed = old_name
        for fragment, replacement in WHISPER_MAPPING.items():
            # Membership is tested against the original name; the replacement accumulates on the new one.
            if fragment in old_name:
                renamed = renamed.replace(fragment, replacement)

        print(f"{old_name} -> {renamed}")

        s_dict[renamed] = s_dict.pop(old_name)
    return s_dict
def make_linear_from_emb(emb):
    """
    Create a bias-free linear layer that shares its weight data with an embedding layer.

    The returned layer maps vectors of size `emb_size` to `vocab_size` scores, i.e. it multiplies by the
    transpose of the embedding matrix.

    Args:
        emb (nn.Embedding): Embedding layer whose weight has shape `(vocab_size, emb_size)`.

    Returns:
        nn.Linear: Linear layer with `in_features == emb_size`, `out_features == vocab_size` and a weight
        tensor backed by the same data as `emb.weight`.
    """
    vocab_size, emb_size = emb.weight.shape
    # nn.Linear stores its weight as (out_features, in_features), which is exactly the embedding layout
    # (vocab_size, emb_size). Building the layer as Linear(emb_size, vocab_size) keeps the declared
    # in/out feature counts consistent with the weight installed below.
    lin_layer = nn.Linear(emb_size, vocab_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer
def _download(url: str, root: str) -> Any:
    """
    Fetch a checkpoint from `url` into `root` (reusing a valid cached copy) and load it.

    The second-to-last path component of `url` is used as the expected SHA256 hex digest
    of the file.

    Args:
        url (str): URL of the file to download.
        root (str): Directory where the file is stored; created if missing.

    Returns:
        Any: Whatever `torch.load` returns for the verified file contents.

    Raises:
        RuntimeError: If the target path exists but is not a regular file, or if the
            downloaded file does not match the expected checksum.
    """
    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)
    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        # Read through a context manager so the handle is closed deterministically.
        with open(download_target, "rb") as cached:
            model_bytes = cached.read()
        if insecure_hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
            return torch.load(io.BytesIO(model_bytes))
        warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        # The header may be absent; tqdm accepts total=None and then shows no percentage.
        content_length = source.info().get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with tqdm(total=total, ncols=80, unit="iB", unit_scale=True, unit_divisor=1024) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break
                output.write(buffer)
                loop.update(len(buffer))

    with open(download_target, "rb") as downloaded:
        model_bytes = downloaded.read()
    if insecure_hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
        raise RuntimeError(
            "Model has been downloaded but the SHA256 checksum does not match. Please retry loading the model."
        )
    return torch.load(io.BytesIO(model_bytes))
def convert_openai_whisper_to_tfms(
    checkpoint_path, pytorch_dump_folder_path
) -> Tuple[WhisperForConditionalGeneration, bool, int]:
    """
    Convert an OpenAI Whisper checkpoint into a `WhisperForConditionalGeneration` model.

    Args:
        checkpoint_path: If it contains ".pt" it is loaded with `torch.load`; otherwise it is
            used as a key into `_MODELS` and the checkpoint is fetched with `_download`.
        pytorch_dump_folder_path: Output folder; only its parent directory is used here, as
            the download location.

    Returns:
        Tuple of `(model, is_multilingual, num_languages)`.

    Raises:
        ValueError: If weights other than the two tolerated `embed_positions.weights`
            entries are missing after loading the renamed state dict.
    """
    if ".pt" in checkpoint_path:
        original_checkpoint = torch.load(checkpoint_path, map_location="cpu")
        openai_version = None
    else:
        download_root = os.path.dirname(pytorch_dump_folder_path) or "."
        original_checkpoint = _download(_MODELS[checkpoint_path], download_root)
        openai_version = checkpoint_path

    dims = original_checkpoint["dims"]
    state_dict = original_checkpoint["model_state_dict"]
    # Captured before the keys are renamed; only used when embeddings are not tied.
    proj_out_weights = state_dict["decoder.token_embedding.weight"]
    remove_ignore_keys_(state_dict)
    rename_keys(state_dict)
    tie_embeds = True
    ffn_dim = state_dict["decoder.layers.0.fc1.weight"].shape[0]

    # The end-of-text id (reused for bos/eos/pad) depends on the vocabulary size.
    if dims["n_vocab"] > 51865:
        endoftext_id = 50257
    else:
        endoftext_id = 50256

    config = WhisperConfig(
        vocab_size=dims["n_vocab"],
        encoder_ffn_dim=ffn_dim,
        decoder_ffn_dim=ffn_dim,
        num_mel_bins=dims["n_mels"],
        d_model=dims["n_audio_state"],
        max_target_positions=dims["n_text_ctx"],
        encoder_layers=dims["n_audio_layer"],
        encoder_attention_heads=dims["n_audio_head"],
        decoder_layers=dims["n_text_layer"],
        decoder_attention_heads=dims["n_text_head"],
        max_source_positions=dims["n_audio_ctx"],
        eos_token_id=endoftext_id,
        bos_token_id=endoftext_id,
        pad_token_id=endoftext_id,
        decoder_start_token_id=endoftext_id + 1,
    )

    model = WhisperForConditionalGeneration(config)
    missing, unexpected = model.model.load_state_dict(state_dict, strict=False)
    tolerated_missing = {
        "encoder.embed_positions.weights",
        "decoder.embed_positions.weights",
    }
    # An empty `missing` list is trivially a subset, so no separate length check is needed.
    if not set(missing).issubset(tolerated_missing):
        raise ValueError(
            "Only `encoder.embed_positions.weights` and `decoder.embed_positions.weights` are allowed to be missing,"
            f" but all the following weights are missing {missing}"
        )

    if tie_embeds:
        model.proj_out = make_linear_from_emb(model.model.decoder.embed_tokens)
    else:
        model.proj_out.weight.data = proj_out_weights

    vocab_size = model.config.vocab_size
    is_multilingual = vocab_size >= 51865
    num_languages = vocab_size - 51765 - int(is_multilingual)

    model.generation_config = _get_generation_config(
        is_multilingual,
        num_languages,
        openai_version,
    )
    return model, is_multilingual, num_languages
def _bpe(mergeable_ranks, token: bytes, max_rank=None) -> list[bytes]:
    """
    Split `token` into byte-pair pieces by repeatedly merging the lowest-ranked adjacent pair.

    Args:
        mergeable_ranks: Mapping from byte strings to their merge rank.
        token (bytes): Token to decompose.
        max_rank: If given, merging stops as soon as the best available pair has a rank
            greater than or equal to this value.

    Returns:
        list[bytes]: The pieces left once no further merge is allowed.
    """
    # Start from single bytes; the loop below merges them pair by pair.
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        # Find the adjacent pair with the smallest rank.
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        assert min_idx is not None
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :]
    return parts
def convert_tiktoken_bpe_to_hf(tiktoken_url: str):
    """
    Turn a tiktoken BPE rank table into a `(vocab, merges)` pair.

    Args:
        tiktoken_url (str): Location passed to `load_tiktoken_bpe`.

    Returns:
        tuple: `vocab` maps the string form of each token to its rank; `merges` lists
        "left right" strings for every multi-byte token that `_bpe` splits into exactly
        two pieces.
    """
    ranks = load_tiktoken_bpe(tiktoken_url)
    byte_to_char = bytes_to_unicode()

    def as_text(raw):
        # latin-1 decoding yields one character per byte, with the same ordinal value.
        return "".join(byte_to_char[ord(ch)] for ch in raw.decode("latin-1"))

    vocab = {}
    merges = []
    for token, rank in ranks.items():
        vocab[as_text(token)] = rank
        if len(token) != 1:
            pieces = tuple(_bpe(ranks, token, max_rank=rank))
            if len(pieces) == 2:
                merges.append(" ".join(as_text(piece) for piece in pieces))
    return vocab, merges
def convert_tiktoken_to_hf(
    multilingual: bool = True, num_languages: int = 100, time_precision=0.02
) -> WhisperTokenizer:
    """
    Build a `WhisperTokenizer` from the tiktoken vocabulary selected by `multilingual`.

    Args:
        multilingual (bool): Selects the "multilingual" or "english" entry of `_TOKENIZERS`.
        num_languages (int): Number of leading entries of `LANGUAGES` to add as language tokens.
        time_precision: Step between consecutive timestamp tokens.

    Returns:
        WhisperTokenizer: Tokenizer with the special tokens and 1501 timestamp tokens added.
    """
    variant = "multilingual" if multilingual else "english"
    tiktoken_tokenizer_path = _TOKENIZERS[variant]

    start_of_transcript = ["<|endoftext|>", "<|startoftranscript|>"]
    language_tokens = [f"<|{code}|>" for code in list(LANGUAGES)[:num_languages]]
    control_tokens = [
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
    ]
    timestamp_tokens = [("<|%.2f|>" % (step * time_precision)) for step in range(1500 + 1)]

    vocab, merges = convert_tiktoken_bpe_to_hf(tiktoken_tokenizer_path)

    # The tokenizer is constructed from files, so write them to a throwaway directory.
    with tempfile.TemporaryDirectory() as tmpdirname:
        vocab_file = f"{tmpdirname}/vocab.json"
        merge_file = f"{tmpdirname}/merges.txt"

        serialized_vocab = json.dumps(vocab, indent=2, sort_keys=True, ensure_ascii=False)
        with open(vocab_file, "w", encoding="utf-8") as vocab_handle:
            vocab_handle.write(serialized_vocab + "\n")

        with open(merge_file, "w", encoding="utf-8") as merge_handle:
            merge_handle.write("#version: 0.2\n")
            merge_handle.writelines(entry + "\n" for entry in merges)

        hf_tokenizer = WhisperTokenizer(vocab_file, merge_file)

    special_tokens = start_of_transcript + language_tokens + control_tokens
    hf_tokenizer.add_tokens(special_tokens, special_tokens=True)
    hf_tokenizer.add_tokens(timestamp_tokens, special_tokens=False)
    return hf_tokenizer
if __name__ == "__main__":
    # Command-line entry point: convert a checkpoint and optionally its preprocessor,
    # then save everything to --pytorch_dump_folder_path.

    def _str_to_bool(value: str) -> bool:
        """Parse a flag value; the empty string and common "false" spellings give False."""
        return value.lower() not in {"", "0", "false", "f", "no", "n", "off"}

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", type=str, help="下载的检查点的路径")
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="输出 PyTorch 模型的路径.")
    parser.add_argument(
        "--convert_preprocessor",
        # `type=bool` would turn every non-empty string, including "False", into True.
        type=_str_to_bool,
        default=False,
        help="是否将预处理器(分词器 + 特征提取器)与模型一起转换.",
    )
    args = parser.parse_args()

    model, is_multilingual, num_languages = convert_openai_whisper_to_tfms(
        args.checkpoint_path, args.pytorch_dump_folder_path
    )

    if args.convert_preprocessor:
        # Fail with a clear message: raising a plain string is itself a TypeError, and
        # swallowing it only postponed the failure to a NameError on `load_tiktoken_bpe`.
        if not _is_package_available("tiktoken"):
            raise ModuleNotFoundError(
                """`tiktoken` is not installed, use `pip install tiktoken` to convert the tokenizer"""
            )
        # Imported at module scope of the script so convert_tiktoken_bpe_to_hf can use it.
        from tiktoken.load import load_tiktoken_bpe

        tokenizer = convert_tiktoken_to_hf(is_multilingual, num_languages)
        feature_extractor = WhisperFeatureExtractor(
            feature_size=model.config.num_mel_bins,
        )
        processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
        processor.save_pretrained(args.pytorch_dump_folder_path)

        # Reload as a fast tokenizer and save it in the non-legacy format as well.
        fast_tokenizer = WhisperTokenizerFast.from_pretrained(args.pytorch_dump_folder_path)
        fast_tokenizer.save_pretrained(args.pytorch_dump_folder_path, legacy_format=False)

    model.save_pretrained(args.pytorch_dump_folder_path)
.\models\whisper\english_normalizer.py
import re
import unicodedata
from fractions import Fraction
from typing import Iterator, List, Match, Optional, Union
import regex
# Manual single-character replacements consulted by remove_symbols_and_diacritics()
# before it falls back to Unicode-category checks.
ADDITIONAL_DIACRITICS = {
"œ": "oe",
"Œ": "OE",
"ø": "o",
"Ø": "O",
"æ": "ae",
"Æ": "AE",
"ß": "ss",
"ẞ": "SS",
"đ": "d",
"Đ": "D",
"ð": "d",
"Ð": "D",
"þ": "th",
"Þ": "th",
"ł": "l",
"Ł": "L",
}
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Replace marks, symbols and punctuation with spaces and strip diacritics.

    The text is NFKD-normalized first. Characters listed in `keep` pass through unchanged,
    characters in `ADDITIONAL_DIACRITICS` use their manual replacement, non-spacing marks
    (category "Mn") are dropped, and any other character whose category starts with
    "M", "S" or "P" becomes a space.
    """

    def _map(ch):
        if ch in keep:
            return ch
        if ch in ADDITIONAL_DIACRITICS:
            return ADDITIONAL_DIACRITICS[ch]
        category = unicodedata.category(ch)
        if category == "Mn":
            return ""
        if category[0] in "MSP":
            return " "
        return ch

    decomposed = unicodedata.normalize("NFKD", s)
    return "".join(map(_map, decomposed))
def remove_symbols(s: str):
    """
    Replace marks, symbols and punctuation with spaces while keeping diacritics.

    The text is NFKC-normalized; every character whose Unicode category starts with
    "M", "S" or "P" is replaced by a single space.
    """
    cleaned = []
    for ch in unicodedata.normalize("NFKC", s):
        major_class = unicodedata.category(ch)[0]
        cleaned.append(" " if major_class in "MSP" else ch)
    return "".join(cleaned)
class BasicTextNormalizer:
    """
    Basic text clean-up: lower-casing, removal of bracketed/parenthesized spans and
    symbols, with optional diacritic removal and optional splitting into letters.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        # Pick the cleaning function once, up front.
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        text = s.lower()
        # Drop anything enclosed in <...> or [...].
        text = re.sub(r"[<\[][^>\]]*[>\]]", "", text)
        # Drop anything enclosed in (...).
        text = re.sub(r"\(([^)]+?)\)", "", text)
        text = self.clean(text).lower()
        if self.split_letters:
            # \X matches one extended grapheme cluster; join the clusters with spaces.
            graphemes = regex.findall(r"\X", text, regex.U)
            text = " ".join(graphemes)
        # Collapse every run of whitespace into one space.
        return re.sub(r"\s+", " ", text)
class EnglishNumberNormalizer:
"""
Intended to convert spelled-out English numbers into Arabic numerals while keeping suffixes such as
`1960s`, `274th`, `32nd`.

NOTE(review): as it appears here the class looks truncated. `__init__` sets no attributes, yet `preprocess`
reads `self.decimals` and `self.multipliers` and the final `__call__` calls `self.process_words`; none of
these are defined in this class. Confirm against the upstream implementation.
"""
def __init__(self):
pass
# NOTE(review): a second `__call__` is defined at the end of this class and replaces this one.
def __call__(self, s: str):
"""
Lower-case `s` and collapse every run of whitespace into a single space.
"""
s = s.lower()
s = re.sub(r"\s+", " ", s)
return s
"""
This class provides methods for preprocessing and postprocessing text transformations related to numbers and currencies.
"""
def preprocess(self, s: str):
# Split on "and a half"; each occurrence is re-inserted below either as "point five" or unchanged.
results = []
segments = re.split(r"\band\s+a\s+half\b", s)
for i, segment in enumerate(segments):
# Empty or whitespace-only pieces are dropped.
if len(segment.strip()) == 0:
continue
if i == len(segments) - 1:
# Nothing was split off after the final piece.
results.append(segment)
else:
results.append(segment)
# The last word before the removed "and a half" decides which replacement is used.
last_word = segment.rsplit(maxsplit=2)[-1]
if last_word in self.decimals or last_word in self.multipliers:
results.append("point five")
else:
results.append("and a half")
s = " ".join(results)
# Insert a space at every letter/digit boundary.
s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
# Re-attach the suffixes st/nd/rd/th/s to the digit they follow.
s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
return s
def postprocess(self, s: str):
# Merge "<currency><integer> [and ]¢<cents>" into "<currency><integer>.<cents>", cents zero-padded to 2 digits.
def combine_cents(m: Match):
try:
currency = m.group(1)
integer = m.group(2)
cents = int(m.group(3))
return f"{currency}{integer}.{cents:02d}"
except ValueError:
# NOTE(review): `m.string` is the whole searched string, not only the matched text; confirm intended.
return m.string
# Rewrite "<currency>0<any char><digits>" as "¢<digits>".
def extract_cents(m: Match):
try:
return f"¢{int(m.group(1))}"
except ValueError:
# NOTE(review): `m.string` is the whole searched string, not only the matched text; confirm intended.
return m.string
s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
# NOTE(review): the "." after 0 is unescaped, so it matches any character; confirm intended.
s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
# A standalone "1" (optionally followed by "s") is written out as "one" / "ones".
s = re.sub(r"\b1(s?)\b", r"one\1", s)
return s
def __call__(self, s: str):
# Pipeline: preprocess, word-level processing (dropping None results), postprocess.
s = self.preprocess(s)
s = " ".join(word for word in self.process_words(s.split()) if word is not None)
s = self.postprocess(s)
return s
class EnglishSpellingNormalizer:
    """
    Rewrites words using a British-to-American spelling table such as the one listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    """

    def __init__(self, english_spelling_mapping):
        self.mapping = english_spelling_mapping

    def __call__(self, s: str):
        respelled = []
        # Whitespace-split words are looked up one by one; unknown words are kept as they are.
        for word in s.split():
            respelled.append(self.mapping.get(word, word))
        return " ".join(respelled)
class EnglishTextNormalizer:
    """
    English text normalization pipeline: lower-casing, removal of bracketed spans and filler
    words, contraction/abbreviation expansion, symbol clean-up, then number and spelling
    standardization.
    """

    def __init__(self, english_spelling_mapping):
        # Filler words that are removed outright.
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
        # Regex -> replacement pairs, applied in insertion order.
        self.replacers = {
            # common contractions
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
            # abbreviated titles and prefixes
            r"\bmr\b": "mister ",
            r"\bmrs\b": "missus ",
            r"\bst\b": "saint ",
            r"\bdr\b": "doctor ",
            r"\bprof\b": "professor ",
            r"\bcapt\b": "captain ",
            r"\bgov\b": "governor ",
            r"\bald\b": "alderman ",
            r"\bgen\b": "general ",
            r"\bsen\b": "senator ",
            r"\brep\b": "representative ",
            r"\bpres\b": "president ",
            r"\brev\b": "reverend ",
            r"\bhon\b": "honorable ",
            r"\basst\b": "assistant ",
            r"\bassoc\b": "associate ",
            r"\blt\b": "lieutenant ",
            r"\bcol\b": "colonel ",
            r"\bjr\b": "junior ",
            r"\bsr\b": "senior ",
            r"\besq\b": "esquire ",
            # perfect-tense contractions
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",
            r"'s got\b": " has got",
            # general contractions
            r"n't\b": " not",
            r"'re\b": " are",
            r"'s\b": " is",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        }
        self.standardize_numbers = EnglishNumberNormalizer()
        self.standardize_spellings = EnglishSpellingNormalizer(english_spelling_mapping)

    def __call__(self, s: str):
        text = s.lower()

        # Substitutions that run before symbol removal, in order.
        leading_steps = (
            (r"[<\[][^>\]]*[>\]]", ""),  # spans in <...> or [...]
            (r"\(([^)]+?)\)", ""),  # spans in (...)
            (self.ignore_patterns, ""),  # filler words
            (r"\s+'", "'"),  # whitespace before an apostrophe
            *self.replacers.items(),  # contractions and abbreviations
            (r"(\d),(\d)", r"\1\2"),  # commas between digits
            (r"\.([^0-9]|$)", r" \1"),  # periods not followed by a digit
        )
        for pattern, replacement in leading_steps:
            text = re.sub(pattern, replacement, text)

        # Symbols needed for numbers and currencies are kept for the number normalizer.
        text = remove_symbols_and_diacritics(text, keep=".%$¢€£")
        text = self.standardize_numbers(text)
        text = self.standardize_spellings(text)

        # Remove leftover numeric symbols that are not attached to a digit.
        text = re.sub(r"[.$¢€£]([^0-9])", r" \1", text)
        text = re.sub(r"([^0-9])%", r"\1 ", text)

        # Collapse every run of whitespace into one space.
        return re.sub(r"\s+", " ", text)
.\models\whisper\feature_extraction_whisper.py
"""
Feature extractor class for Whisper
"""
from typing import List, Optional, Union
import numpy as np
from ... import is_torch_available
from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import TensorType, logging
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
class WhisperFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a Whisper feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.
This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
Fourier Transform` which should match pytorch's `torch.stft` equivalent.
Args:
feature_size (`int`, defaults to 80):
The feature dimension of the extracted features.
sampling_rate (`int`, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
hop_length (`int`, defaults to 160):
Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
chunk_length (`int`, defaults to 30):
The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
sequences.
n_fft (`int`, defaults to 400):
Size of the Fourier transform.
padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio. Should correspond to silences.
"""
model_input_names = ["input_features"]
def __init__(
    self,
    feature_size=80,
    sampling_rate=16000,
    hop_length=160,
    chunk_length=30,
    n_fft=400,
    padding_value=0.0,
    return_attention_mask=False,
    **kwargs,
):
    """
    Store the STFT/chunking settings and precompute the mel filter bank.

    `n_samples` is `chunk_length * sampling_rate` and `nb_max_frames` is
    `n_samples // hop_length`.
    """
    super().__init__(
        feature_size=feature_size,
        sampling_rate=sampling_rate,
        padding_value=padding_value,
        return_attention_mask=return_attention_mask,
        **kwargs,
    )
    self.sampling_rate = sampling_rate
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.chunk_length = chunk_length

    # Derived sizes: samples per chunk and the resulting number of frames.
    samples_per_chunk = chunk_length * sampling_rate
    self.n_samples = samples_per_chunk
    self.nb_max_frames = samples_per_chunk // hop_length

    filter_bank_options = {
        "num_frequency_bins": 1 + n_fft // 2,
        "num_mel_filters": feature_size,
        "min_frequency": 0.0,
        "max_frequency": 8000.0,
        "sampling_rate": sampling_rate,
        "norm": "slaney",
        "mel_scale": "slaney",
    }
    self.mel_filters = mel_filter_bank(**filter_bank_options)
def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
    """
    Compute the log-mel spectrogram of the provided audio with the numpy helpers; gives
    similar results to Whisper's original torch implementation within 1e-5 tolerance.
    """
    hann = window_function(self.n_fft, "hann")
    log_mel = spectrogram(
        waveform,
        hann,
        frame_length=self.n_fft,
        hop_length=self.hop_length,
        power=2.0,
        mel_filters=self.mel_filters,
        log_mel="log10",
    )
    # Drop the last frame along the second axis.
    trimmed = log_mel[:, :-1]
    # Clip values to within 8.0 below the maximum, then shift and scale.
    floor = trimmed.max() - 8.0
    return (np.maximum(trimmed, floor) + 4.0) / 4.0
def _torch_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
    """
    Compute the log-mel spectrogram of the provided audio with PyTorch's STFT and return
    it as a numpy array.
    """
    audio = torch.from_numpy(waveform).type(torch.float32)
    hann = torch.hann_window(self.n_fft)
    spectrum = torch.stft(audio, self.n_fft, self.hop_length, window=hann, return_complex=True)
    # Power spectrum with the last frame dropped.
    power = spectrum[..., :-1].abs() ** 2

    filters = torch.from_numpy(self.mel_filters).type(torch.float32)
    # Clamp before the logarithm so zero energy does not produce -inf.
    log_mel = torch.clamp(filters.T @ power, min=1e-10).log10()
    # Clip values to within 8.0 below the maximum, then shift and scale.
    log_mel = torch.maximum(log_mel, log_mel.max() - 8.0)
    return ((log_mel + 4.0) / 4.0).numpy()
@staticmethod
def zero_mean_unit_var_norm(
input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
) -> List[np.ndarray]:
"""
Every array in the list is normalized to have zero mean and unit variance
"""
if attention_mask is not None:
attention_mask = np.array(attention_mask, np.int32)
normed_input_values = []
for vector, length in zip(input_values, attention_mask.sum(-1)):
normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
if length < normed_slice.shape[0]:
normed_slice[length:] = padding_value
normed_input_values.append(normed_slice)
else:
normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
return normed_input_values
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
truncation: bool = True,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = None,
padding: Optional[str] = "max_length",
max_length: Optional[int] = None,
sampling_rate: Optional[int] = None,
do_normalize: Optional[bool] = None,
**kwargs,