Transformers 源码解析(十二)
.\models\auto\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_torch_available,
)
_import_structure = {
"auto_factory": ["get_values"],
"configuration_auto": ["ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CONFIG_MAPPING", "MODEL_NAMES_MAPPING", "AutoConfig"],
"feature_extraction_auto": ["FEATURE_EXTRACTOR_MAPPING", "AutoFeatureExtractor"],
"image_processing_auto": ["IMAGE_PROCESSOR_MAPPING", "AutoImageProcessor"],
"processing_auto": ["PROCESSOR_MAPPING", "AutoProcessor"],
"tokenization_auto": ["TOKENIZER_MAPPING", "AutoTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
pass
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
pass
_import_structure["modeling_tf_auto"] = [
"TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_MASK_GENERATION_MAPPING",
"TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING",
"TF_MODEL_FOR_MASKED_LM_MAPPING",
"TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
"TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"TF_MODEL_FOR_PRETRAINING_MAPPING",
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_TEXT_ENCODING_MAPPING",
"TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"TF_MODEL_FOR_VISION_2_SEQ_MAPPING",
"TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING",
"TF_MODEL_MAPPING",
"TF_MODEL_WITH_LM_HEAD_MAPPING",
"TFAutoModel",
"TFAutoModelForAudioClassification",
"TFAutoModelForCausalLM",
"TFAutoModelForImageClassification",
"TFAutoModelForMaskedImageModeling",
"TFAutoModelForMaskedLM",
"TFAutoModelForMaskGeneration",
"TFAutoModelForMultipleChoice",
"TFAutoModelForNextSentencePrediction",
"TFAutoModelForPreTraining",
"TFAutoModelForDocumentQuestionAnswering",
"TFAutoModelForQuestionAnswering",
"TFAutoModelForSemanticSegmentation",
"TFAutoModelForSeq2SeqLM",
"TFAutoModelForSequenceClassification",
"TFAutoModelForSpeechSeq2Seq",
"TFAutoModelForTableQuestionAnswering",
"TFAutoModelForTextEncoding",
"TFAutoModelForTokenClassification",
"TFAutoModelForVision2Seq",
"TFAutoModelForZeroShotImageClassification",
"TFAutoModelWithLMHead",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_auto"] = [
"FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
"FLAX_MODEL_FOR_CAUSAL_LM_MAPPING",
"FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"FLAX_MODEL_FOR_MASKED_LM_MAPPING",
"FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
"FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"FLAX_MODEL_FOR_PRETRAINING_MAPPING",
"FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
"FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING",
"FLAX_MODEL_MAPPING",
"FlaxAutoModel",
"FlaxAutoModelForCausalLM",
"FlaxAutoModelForImageClassification",
"FlaxAutoModelForMaskedLM",
"FlaxAutoModelForMultipleChoice",
"FlaxAutoModelForNextSentencePrediction",
"FlaxAutoModelForPreTraining",
"FlaxAutoModelForQuestionAnswering",
"FlaxAutoModelForSeq2SeqLM",
"FlaxAutoModelForSequenceClassification",
"FlaxAutoModelForSpeechSeq2Seq",
"FlaxAutoModelForTokenClassification",
"FlaxAutoModelForVision2Seq",
]
if TYPE_CHECKING:
from .auto_factory import get_values
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, MODEL_NAMES_MAPPING, AutoConfig
from .feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
from .image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor
from .processing_auto import PROCESSOR_MAPPING, AutoProcessor
from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_auto import (
TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_MASK_GENERATION_MAPPING,
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING,
TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
TF_MODEL_FOR_PRETRAINING_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_TEXT_ENCODING_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
TFAutoModel,
TFAutoModelForAudioClassification,
TFAutoModelForCausalLM,
TFAutoModelForDocumentQuestionAnswering,
TFAutoModelForImageClassification,
TFAutoModelForMaskedImageModeling,
TFAutoModelForMaskedLM,
TFAutoModelForMaskGeneration,
TFAutoModelForMultipleChoice,
TFAutoModelForNextSentencePrediction,
TFAutoModelForPreTraining,
TFAutoModelForQuestionAnswering,
TFAutoModelForSemanticSegmentation,
TFAutoModelForSeq2SeqLM,
TFAutoModelForSequenceClassification,
TFAutoModelForSpeechSeq2Seq,
TFAutoModelForTableQuestionAnswering,
TFAutoModelForTextEncoding,
TFAutoModelForTokenClassification,
TFAutoModelForVision2Seq,
TFAutoModelForZeroShotImageClassification,
TFAutoModelWithLMHead,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_auto import (
FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
FLAX_MODEL_FOR_MASKED_LM_MAPPING,
FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
FLAX_MODEL_FOR_PRETRAINING_MAPPING,
FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING,
FLAX_MODEL_MAPPING,
FlaxAutoModel,
FlaxAutoModelForCausalLM,
FlaxAutoModelForImageClassification,
FlaxAutoModelForMaskedLM,
FlaxAutoModelForMultipleChoice,
FlaxAutoModelForNextSentencePrediction,
FlaxAutoModelForPreTraining,
FlaxAutoModelForQuestionAnswering,
FlaxAutoModelForSeq2SeqLM,
FlaxAutoModelForSequenceClassification,
FlaxAutoModelForSpeechSeq2Seq,
FlaxAutoModelForTokenClassification,
FlaxAutoModelForVision2Seq,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\autoformer\configuration_autoformer.py
""" Autoformer model configuration"""
from typing import List, Optional
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
AUTOFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"huggingface/autoformer-tourism-monthly": "https://huggingface.co/huggingface/autoformer-tourism-monthly/resolve/main/config.json",
}
class AutoformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`AutoformerModel`]. It is used to instantiate an
Autoformer model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Autoformer
[huggingface/autoformer-tourism-monthly](https://huggingface.co/huggingface/autoformer-tourism-monthly)
architecture.
Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
```
>>> from transformers import AutoformerConfig, AutoformerModel
>>> # Initializing a default Autoformer configuration
>>> configuration = AutoformerConfig()
>>> # Randomly initializing a model (with random weights) from the configuration
>>> model = AutoformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "autoformer"
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
"num_hidden_layers": "encoder_layers",
}
def __init__(
self,
prediction_length: Optional[int] = None,
context_length: Optional[int] = None,
distribution_output: str = "student_t",
loss: str = "nll",
input_size: int = 1,
lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
scaling: bool = True,
num_time_features: int = 0,
num_dynamic_real_features: int = 0,
num_static_categorical_features: int = 0,
num_static_real_features: int = 0,
cardinality: Optional[List[int]] = None,
embedding_dimension: Optional[List[int]] = None,
d_model: int = 64,
encoder_attention_heads: int = 2,
decoder_attention_heads: int = 2,
encoder_layers: int = 2,
decoder_layers: int = 2,
encoder_ffn_dim: int = 32,
decoder_ffn_dim: int = 32,
activation_function: str = "gelu",
dropout: float = 0.1,
encoder_layerdrop: float = 0.1,
decoder_layerdrop: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
num_parallel_samples: int = 100,
init_std: float = 0.02,
use_cache: bool = True,
is_encoder_decoder=True,
label_length: int = 10,
moving_average: int = 25,
autocorrelation_factor: int = 3,
**kwargs,
self.prediction_length = prediction_length
self.context_length = context_length if context_length is not None else prediction_length
self.distribution_output = distribution_output
self.loss = loss
self.input_size = input_size
self.num_time_features = num_time_features
self.lags_sequence = lags_sequence
self.scaling = scaling
self.num_dynamic_real_features = num_dynamic_real_features
self.num_static_real_features = num_static_real_features
self.num_static_categorical_features = num_static_categorical_features
if cardinality is not None and num_static_categorical_features > 0:
if len(cardinality) != num_static_categorical_features:
raise ValueError(
"The cardinality should be a list of the same length as `num_static_categorical_features`"
)
self.cardinality = cardinality
else:
self.cardinality = [0]
if embedding_dimension is not None and num_static_categorical_features > 0:
if len(embedding_dimension) != num_static_categorical_features:
raise ValueError(
"The embedding dimension should be a list of the same length as `num_static_categorical_features`"
)
self.embedding_dimension = embedding_dimension
else:
self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
self.num_parallel_samples = num_parallel_samples
self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features
self.d_model = d_model
self.encoder_attention_heads = encoder_attention_heads
self.decoder_attention_heads = decoder_attention_heads
self.encoder_ffn_dim = encoder_ffn_dim
self.decoder_ffn_dim = decoder_ffn_dim
self.encoder_layers = encoder_layers
self.decoder_layers = decoder_layers
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.activation_function = activation_function
self.init_std = init_std
self.use_cache = use_cache
self.label_length = label_length
self.moving_average = moving_average
self.autocorrelation_factor = autocorrelation_factor
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
def _number_of_features(self) -> int:
return (
sum(self.embedding_dimension)
+ self.num_dynamic_real_features
+ self.num_time_features
+ self.num_static_real_features
+ self.input_size * 2
)
.\models\autoformer\modeling_autoformer.py
""" PyTorch Autoformer model. """
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
ModelOutput,
SampleTSPredictionOutput,
Seq2SeqTSPredictionOutput,
)
from ...modeling_utils import PreTrainedModel
from ...time_series_utils import (
NegativeBinomialOutput,
NormalOutput,
StudentTOutput,
)
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_autoformer import AutoformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "AutoformerConfig"
@dataclass
class AutoFormerDecoderOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
"""
pass
last_hidden_state: torch.FloatTensor = None
trend: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class AutoformerModelOutput(ModelOutput):
"""
Autoformer model output that contains the additional trend output.
"""
last_hidden_state: torch.FloatTensor = None
trend: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
loc: Optional[torch.FloatTensor] = None
scale: Optional[torch.FloatTensor] = None
static_features: Optional[torch.FloatTensor] = None
AUTOFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"huggingface/autoformer-tourism-monthly",
]
class AutoformerFeatureEmbedder(nn.Module):
"""
Embed a sequence of categorical features.
Args:
cardinalities (`list[int]`):
List of cardinalities of the categorical features.
embedding_dims (`list[int]`):
List of embedding dimensions of the categorical features.
"""
def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
super().__init__()
self.num_features = len(cardinalities)
self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
def forward(self, features: torch.Tensor) -> torch.Tensor:
if self.num_features > 1:
cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
else:
cat_feature_slices = [features]
return torch.cat(
[
embed(cat_feature_slice.squeeze(-1))
for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
],
dim=-1,
)
class AutoformerStdScaler(nn.Module):
"""
Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
subtracting from the mean and dividing by the standard deviation.
"""
def __init__(self, config: AutoformerConfig):
super().__init__()
self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5
def forward(
self, data: torch.Tensor, observed_indicator: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Parameters:
data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
input for Batch norm calculation
observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
Calculating the scale on the observed indicator.
Returns:
tuple of `torch.Tensor` of shapes
(`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
`(batch_size, 1, num_input_channels)`)
"""
denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
denominator = denominator.clamp_min(1.0)
loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator
variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
scale = torch.sqrt(variance + self.minimum_scale)
return (data - loc) / scale, loc, scale
class AutoformerMeanScaler(nn.Module):
"""
Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
accordingly.
"""
def __init__(self, config: AutoformerConfig):
super().__init__()
self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
self.default_scale = config.default_scale if hasattr(config, "default_scale") else None
def forward(
self, data: torch.Tensor, observed_indicator: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Parameters:
data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
input for Batch norm calculation
observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
Calculating the scale on the observed indicator.
Returns:
tuple of `torch.Tensor` of shapes
(`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
`(batch_size, 1, num_input_channels)`)
"""
ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
num_observed = observed_indicator.sum(self.dim, keepdim=True)
scale = ts_sum / torch.clamp(num_observed, min=1)
if self.default_scale is None:
batch_sum = ts_sum.sum(dim=0)
batch_observations = torch.clamp(num_observed.sum(0), min=1)
default_scale = torch.squeeze(batch_sum / batch_observations)
else:
default_scale = self.default_scale * torch.ones_like(scale)
scale = torch.where(num_observed > 0, scale, default_scale)
scale = torch.clamp(scale, min=self.minimum_scale)
scaled_data = data / scale
if not self.keepdim:
scale = scale.squeeze(dim=self.dim)
return scaled_data, torch.zeros_like(scale), scale
class AutoformerNOPScaler(nn.Module):
"""
Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
"""
def __init__(self, config: AutoformerConfig):
super().__init__()
self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
def forward(
self, data: torch.Tensor, observed_indicator: torch.Tensor = None
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
return data, loc, scale
def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
"""
计算给定维度上张量的加权平均值,并对与权重为零相关的值进行掩码处理,
这意味着你将得到`nan * 0 = nan`的替代值`0 * 0 = 0`。
Args:
input_tensor (`torch.FloatTensor`):
输入张量,需要计算平均值。
weights (`torch.FloatTensor`, *可选*):
权重张量,与`input_tensor`形状相同。
dim (`int`, *可选*):
沿着哪个维度对`input_tensor`进行平均。
Returns:
`torch.FloatTensor`: 沿指定`dim`平均值的张量。
"""
if weights is not None:
weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
else:
return input_tensor.mean(dim=dim)
def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
"""
从输入分布计算与目标相关的负对数似然损失。
"""
return -input.log_prob(target)
class AutoformerSinusoidalPositionalEmbedding(nn.Embedding):
"""该模块产生任意长度的正弦位置嵌入。"""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
super().__init__(num_positions, embedding_dim)
self.weight = self._init_weight(self.weight)
@staticmethod
def _init_weight(out: nn.Parameter) -> nn.Parameter:
"""
与XLM create_sinusoidal_embeddings相同,除了特征不是交错的。余弦特征在向量的第二半部分。[dim // 2:]
"""
n_pos, dim = out.shape
position_enc = np.array(
[[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
)
out.requires_grad = False
sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
return out
@torch.no_grad()
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids_shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
)
return super().forward(positions)
class AutoformerValueEmbedding(nn.Module):
def __init__(self, feature_size, d_model):
super().__init__()
self.value_projection = nn.Linear(in_features=feature_size, out_features=d_model, bias=False)
def forward(self, x):
return self.value_projection(x)
class AutoformerSeriesDecompositionLayer(nn.Module):
"""
返回时间序列的趋势和季节部分。计算方式为:
x_trend = AvgPool(Padding(X)) and x_seasonal = X - x_trend
"""
def __init__(self, config: AutoformerConfig):
super().__init__()
self.kernel_size = config.moving_average
self.avg = nn.AvgPool1d(kernel_size=self.kernel_size, stride=1, padding=0)
def forward(self, x):
"""输入形状: Batch x Time x EMBED_DIM"""
num_of_pads = (self.kernel_size - 1) // 2
front = x[:, 0:1, :].repeat(1, num_of_pads, 1)
end = x[:, -1:, :].repeat(1, num_of_pads, 1)
x_padded = torch.cat([front, x, end], dim=1)
x_trend = self.avg(x_padded.permute(0, 2, 1)).permute(0, 2, 1)
x_seasonal = x - x_trend
return x_seasonal, x_trend
class AutoformerLayernorm(nn.Module):
"""
为季节部分设计的特殊层归一化,计算方式为: AutoformerLayernorm(x) = nn.LayerNorm(x)
- torch.mean(nn.LayerNorm(x))
"""
def __init__(self, config: AutoformerConfig):
super().__init__()
self.layernorm = nn.LayerNorm(config.d_model)
def forward(self, x):
x_hat = self.layernorm(x)
bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1)
return x_hat - bias
class AutoformerAttention(nn.Module):
"""
自相关机制,包含以下两个阶段:
(1) 基于周期的依赖发现 (2) 时间延迟聚合
该模块替代了传统的自注意力机制。
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
autocorrelation_factor: int = 3,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.autocorrelation_factor = autocorrelation_factor
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
class AutoformerEncoderLayer(nn.Module):
def __init__(self, config: AutoformerConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = AutoformerAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
autocorrelation_factor=config.autocorrelation_factor,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = AutoformerLayernorm(config)
self.decomp1 = AutoformerSeriesDecompositionLayer(config)
self.decomp2 = AutoformerSeriesDecompositionLayer(config)
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: torch.FloatTensor,
layer_head_mask: torch.FloatTensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, _ = self.decomp1(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states, _ = self.decomp2(hidden_states)
hidden_states = self.final_layer_norm(hidden_states)
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class AutoformerDecoderLayer(nn.Module):
def __init__(self, config: AutoformerConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = AutoformerAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
autocorrelation_factor=config.autocorrelation_factor,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = AutoformerAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
autocorrelation_factor=config.autocorrelation_factor,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = AutoformerLayernorm(config)
self.decomp1 = AutoformerSeriesDecompositionLayer(config)
self.decomp2 = AutoformerSeriesDecompositionLayer(config)
self.decomp3 = AutoformerSeriesDecompositionLayer(config)
self.trend_projection = nn.Conv1d(
in_channels=self.embed_dim,
out_channels=config.feature_size,
kernel_size=3,
stride=1,
padding=1,
padding_mode="circular",
bias=False,
)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class AutoformerPreTrainedModel(PreTrainedModel):
config_class = AutoformerConfig
base_model_prefix = "model"
main_input_name = "past_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, AutoformerSinusoidalPositionalEmbedding):
pass
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
AUTOFORMER_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`AutoformerConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
AUTOFORMER_INPUTS_DOCSTRING = r"""
"""
class AutoformerEncoder(AutoformerPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`AutoformerEncoderLayer`].
Args:
config: AutoformerConfig
"""
def __init__(self, config: AutoformerConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
if config.prediction_length is None:
raise ValueError("The `prediction_length` config needs to be specified.")
self.value_embedding = AutoformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
self.embed_positions = AutoformerSinusoidalPositionalEmbedding(
config.context_length + config.prediction_length, config.d_model
)
self.layers = nn.ModuleList([AutoformerEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
def __init__(self, config: AutoformerConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
if config.prediction_length is None:
raise ValueError("The `prediction_length` config needs to be specified.")
self.value_embedding = AutoformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
self.embed_positions = AutoformerSinusoidalPositionalEmbedding(
config.context_length + config.prediction_length, config.d_model
)
self.layers = nn.ModuleList([AutoformerDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.seasonality_projection = nn.Linear(config.d_model, config.feature_size)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
trend: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class AutoformerModel(AutoformerPreTrainedModel):
def __init__(self, config: AutoformerConfig):
super().__init__(config)
if config.scaling == "mean" or config.scaling is True:
self.scaler = AutoformerMeanScaler(config)
elif config.scaling == "std":
self.scaler = AutoformerStdScaler(config)
else:
self.scaler = AutoformerNOPScaler(config)
if config.num_static_categorical_features > 0:
self.embedder = AutoformerFeatureEmbedder(
cardinalities=config.cardinality, embedding_dims=config.embedding_dimension
)
self.encoder = AutoformerEncoder(config)
self.decoder = AutoformerDecoder(config)
self.decomposition_layer = AutoformerSeriesDecompositionLayer(config)
self.post_init()
@property
def _past_length(self) -> int:
return self.config.context_length + max(self.config.lags_sequence)
def get_lagged_subsequences(
self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
) -> torch.Tensor:
"""
Returns lagged subsequences of a given sequence. Returns a tensor of shape (batch_size, subsequences_length,
feature_size, indices_length), containing lagged subsequences. Specifically, lagged[i, j, :, k] = sequence[i,
-indices[k]-subsequences_length+j, :].
Args:
sequence (`torch.Tensor` or shape `(batch_size, context_length,
feature_size)`): The sequence from which lagged subsequences should be extracted.
subsequences_length (`int`):
Length of the subsequences to be extracted.
shift (`int`, *optional* defaults to 0):
Shift the lags by this amount back in the time index.
"""
indices = [lag - shift for lag in self.config.lags_sequence]
sequence_length = sequence.shape[1]
if max(indices) + subsequences_length > sequence_length:
raise ValueError(
f"lags cannot go further than history length, found lag {max(indices)} "
f"while history length is only {sequence_length}"
)
lagged_values = []
for lag_index in indices:
begin_index = -lag_index - subsequences_length
end_index = -lag_index if lag_index > 0 else None
lagged_values.append(sequence[:, begin_index:end_index, ...])
return torch.stack(lagged_values, dim=-1)
def create_network_inputs(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
past_observed_mask: Optional[torch.Tensor] = None,
future_values: Optional[torch.Tensor] = None,
future_time_features: Optional[torch.Tensor] = None,
):
"""
Creates inputs for the network by combining past values, time features, and optional static features.
Args:
past_values (`torch.Tensor`): Tensor containing past values of shape (batch_size, context_length, feature_size).
past_time_features (`torch.Tensor`): Tensor containing time features for the past values.
static_categorical_features (`Optional[torch.Tensor]`, *optional*):
Tensor containing static categorical features.
static_real_features (`Optional[torch.Tensor]`, *optional*):
Tensor containing static real-valued features.
past_observed_mask (`Optional[torch.Tensor]`, *optional*):
Mask indicating which past values are observed.
future_values (`Optional[torch.Tensor]`, *optional*):
Tensor containing future values if available.
future_time_features (`Optional[torch.Tensor]`, *optional*):
Tensor containing time features for the future values.
"""
return NotImplementedError
def get_encoder(self):
"""
Returns the encoder object associated with this model.
"""
return self.encoder
def get_decoder(self):
"""
Returns the decoder object associated with this model.
"""
return self.decoder
@add_start_docstrings_to_model_forward(AUTOFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=AutoformerModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
past_observed_mask: torch.Tensor,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
future_values: Optional[torch.Tensor] = None,
future_time_features: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
use_cache: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"The Autoformer Model with a distribution head on top for time-series forecasting.",
AUTOFORMER_START_DOCSTRING,
)
class AutoformerForPrediction(AutoformerPreTrainedModel):
def __init__(self, config: AutoformerConfig):
super().__init__(config)
self.model = AutoformerModel(config)
if config.distribution_output == "student_t":
self.distribution_output = StudentTOutput(dim=config.input_size)
elif config.distribution_output == "normal":
self.distribution_output = NormalOutput(dim=config.input_size)
elif config.distribution_output == "negative_binomial":
self.distribution_output = NegativeBinomialOutput(dim=config.input_size)
else:
raise ValueError(f"Unknown distribution output {config.distribution_output}")
self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.feature_size)
self.target_shape = self.distribution_output.event_shape
if config.loss == "nll":
self.loss = nll
else:
raise ValueError(f"Unknown loss function {config.loss}")
self.post_init()
def output_params(self, decoder_output):
return self.parameter_projection(decoder_output[:, -self.config.prediction_length :, :])
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
@torch.jit.ignore
def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution:
sliced_params = params
if trailing_n is not None:
sliced_params = [p[:, -trailing_n:] for p in params]
return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)
@add_start_docstrings_to_model_forward(AUTOFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqTSPredictionOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
past_observed_mask: torch.Tensor,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
future_values: Optional[torch.Tensor] = None,
future_time_features: Optional[torch.Tensor] = None,
future_observed_mask: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
use_cache: Optional[bool] = None,
return_dict: Optional[bool] = None,
@torch.no_grad()
def generate(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
future_time_features: torch.Tensor,
past_observed_mask: Optional[torch.Tensor] = None,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
.\models\autoformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_autoformer": [
"AUTOFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"AutoformerConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_autoformer"] = [
"AUTOFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"AutoformerForPrediction",
"AutoformerModel",
"AutoformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_autoformer import (
AUTOFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
AutoformerConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_autoformer import (
AUTOFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
AutoformerForPrediction,
AutoformerModel,
AutoformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\bark\configuration_bark.py
""" BARK model configuration"""
import os
from typing import Dict, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...utils import add_start_docstrings, logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
BARK_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"suno/bark-small": "https://huggingface.co/suno/bark-small/resolve/main/config.json",
"suno/bark": "https://huggingface.co/suno/bark/resolve/main/config.json",
}
BARK_SUBMODELCONFIG_START_DOCSTRING = """
This is the configuration class to store the configuration of a [`{model}`]. It is used to instantiate the model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Bark [suno/bark](https://huggingface.co/suno/bark)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
# 设置块大小,定义模型可能使用的最大序列长度,默认为 1024。通常设置为较大值(例如 512、1024 或 2048),以防万一。
block_size (`int`, *optional*, defaults to 1024):
# 输入词汇表大小,用于 Bark 子模型。定义在调用 `{model}` 时可以表示的不同 token 数量。默认为 10,048,但应根据所选子模型仔细考虑。
input_vocab_size (`int`, *optional*, defaults to 10_048):
# 输出词汇表大小,用于 Bark 子模型。定义在向前传递 `{model}` 时可以表示的不同 token 数量。默认为 10,048,但应根据所选子模型仔细考虑。
output_vocab_size (`int`, *optional*, defaults to 10_048):
# 给定子模型中的隐藏层数量。默认为 12。
num_layers (`int`, *optional*, defaults to 12):
# Transformer 架构中每个注意力层的注意力头数量。默认为 12。
num_heads (`int`, *optional*, defaults to 12):
# 架构中“中间”(通常称为前馈)层的维度大小。默认为 768。
hidden_size (`int`, *optional*, defaults to 768):
# 嵌入层、编码器和池化器中所有全连接层的 dropout 概率。默认为 0.0,即不使用 dropout。
dropout (`float`, *optional*, defaults to 0.0):
# 是否在线性层和层归一化层中使用偏置。默认为 `True`。
bias (`bool`, *optional*, defaults to `True`):
# 初始化所有权重矩阵的截断正态初始化器的标准差。默认为 0.02。
initializer_range (`float`, *optional*, defaults to 0.02):
# 模型是否应返回最后的键/值注意力。并非所有模型都使用此功能。默认为 `True`。
use_cache (`bool`, *optional*, defaults to `True`):
"""
定义了一个名为 BarkSubModelConfig 的类,继承自 PretrainedConfig。
model_type 属性指定为 "bark_module",用于标识模型类型为 Bark 模块。
keys_to_ignore_at_inference 属性指定在推断时要忽略的键,这里包括 "past_key_values"。
attribute_map 属性是一个映射字典,将类内部属性名映射到外部使用的名称,例如将 num_attention_heads 映射为 num_heads。
__init__ 方法用于初始化类的实例,接受多个参数来设置模型配置的各个属性,如 block_size、input_vocab_size 等。
from_pretrained 方法是一个类方法,用于从预训练模型加载配置。它接受预训练模型的名称或路径,并支持设置缓存目录、强制下载等参数。
在方法内部,通过调用 cls.get_config_dict 方法获取预训练模型的配置字典。如果配置字典中的 model_type 为 "bark",则从中提取对应的 Bark 配置。
警告日志用于提示用户,如果加载的预训练模型类型与当前类定义的模型类型不匹配,可能会导致错误。
"""
# 获取模型的配置信息
configuration = model.config
# 在 `BarkSubModelConfig` 的基础上定义了一个名为 `BarkSemanticConfig` 的类
class BarkSemanticConfig(BarkSubModelConfig):
# 设定模型类型为 "semantic"
model_type = "semantic"
# 在 `BarkSubModelConfig` 的基础上定义了一个名为 `BarkCoarseConfig` 的类
@add_start_docstrings(
# 添加起始文档字符串,使用 `BARK_SUBMODELCONFIG_START_DOCSTRING` 格式化字符串
BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkCoarseConfig", model="BarkCoarseModel"),
"""
Example:
```
>>> from transformers import BarkCoarseConfig, BarkCoarseModel
>>>
>>> configuration = BarkCoarseConfig()
>>>
>>> model = BarkCoarseModel(configuration)
>>>
>>> configuration = model.config
```""",
)
class BarkCoarseConfig(BarkSubModelConfig):
# 设定模型类型为 "coarse_acoustics"
model_type = "coarse_acoustics"
# 在 `BarkSubModelConfig` 的基础上定义了一个名为 `BarkFineConfig` 的类
@add_start_docstrings(
# 添加起始文档字符串,使用 `BARK_SUBMODELCONFIG_START_DOCSTRING` 格式化字符串
BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkFineConfig", model="BarkFineModel"),
"""
n_codes_total (`int`, *optional*, defaults to 8):
The total number of audio codebooks predicted. Used in the fine acoustics sub-model.
n_codes_given (`int`, *optional*, defaults to 1):
The number of audio codebooks predicted in the coarse acoustics sub-model. Used in the acoustics
sub-models.
Example:
```
>>> from transformers import BarkFineConfig, BarkFineModel
>>>
>>> configuration = BarkFineConfig()
>>>
>>> model = BarkFineModel(configuration)
>>>
>>> configuration = model.config
```""",
)
class BarkFineConfig(BarkSubModelConfig):
# 设定模型类型为 "fine_acoustics"
model_type = "fine_acoustics"
def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs):
# 初始化方法,设定了一些参数和默认值
self.n_codes_total = n_codes_total # 总音频码书预测数量,默认为8
self.n_codes_given = n_codes_given # 粗声学子模型中音频码书预测数量,默认为1
# 调用父类的初始化方法
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
# 继承自 `PretrainedConfig` 的 `BarkConfig` 类
class BarkConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`BarkModel`]. It is used to instantiate a Bark
model according to the specified sub-models configurations, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Bark
[suno/bark](https://huggingface.co/suno/bark) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
semantic_config ([`BarkSemanticConfig`], *optional*):
Configuration of the underlying semantic sub-model.
coarse_acoustics_config ([`BarkCoarseConfig`], *optional*):
Configuration of the underlying coarse acoustics sub-model.
fine_acoustics_config ([`BarkFineConfig`], *optional*):
Configuration of the underlying fine acoustics sub-model.
"""
codec_config ([`AutoConfig`], *optional*):
Configuration of the underlying codec sub-model.
model_type = "bark"
def __init__(
self,
semantic_config: Dict = None,
coarse_acoustics_config: Dict = None,
fine_acoustics_config: Dict = None,
codec_config: Dict = None,
initializer_range=0.02,
**kwargs,
):
# 如果semantic_config为None,则使用默认空字典并记录日志
if semantic_config is None:
semantic_config = {}
logger.info("semantic_config is None. initializing the semantic model with default values.")
# 如果coarse_acoustics_config为None,则使用默认空字典并记录日志
if coarse_acoustics_config is None:
coarse_acoustics_config = {}
logger.info("coarse_acoustics_config is None. initializing the coarse model with default values.")
# 如果fine_acoustics_config为None,则使用默认空字典并记录日志
if fine_acoustics_config is None:
fine_acoustics_config = {}
logger.info("fine_acoustics_config is None. initializing the fine model with default values.")
# 如果codec_config为None,则使用默认空字典并记录日志
if codec_config is None:
codec_config = {}
logger.info("codec_config is None. initializing the codec model with default values.")
# 初始化各个配置对象,如果给定配置为空,则创建默认配置对象
self.semantic_config = BarkSemanticConfig(**semantic_config)
self.coarse_acoustics_config = BarkCoarseConfig(**coarse_acoustics_config)
self.fine_acoustics_config = BarkFineConfig(**fine_acoustics_config)
# 确定codec_model_type,如果未指定则默认为"encodec"
codec_model_type = codec_config["model_type"] if "model_type" in codec_config else "encodec"
self.codec_config = CONFIG_MAPPING[codec_model_type](**codec_config)
# 设置初始化范围
self.initializer_range = initializer_range
super().__init__(**kwargs)
@classmethod
def from_sub_model_configs(
cls,
semantic_config: BarkSemanticConfig,
coarse_acoustics_config: BarkCoarseConfig,
fine_acoustics_config: BarkFineConfig,
codec_config: PretrainedConfig,
**kwargs,
):
):
r"""
从bark子模型配置中实例化一个[`BarkConfig`](或派生类)。
Returns:
[`BarkConfig`]: 配置对象的一个实例
"""
return cls(
semantic_config=semantic_config.to_dict(), # 将语义配置转换为字典形式
coarse_acoustics_config=coarse_acoustics_config.to_dict(), # 将粗略声学配置转换为字典形式
fine_acoustics_config=fine_acoustics_config.to_dict(), # 将精细声学配置转换为字典形式
codec_config=codec_config.to_dict(), # 将编解码器配置转换为字典形式
**kwargs, # 传递额外的关键字参数
)
.\models\bark\convert_suno_to_hf.py
"""Convert Bark checkpoint."""
import argparse
import os
from pathlib import Path
import torch
from bark.generation import _load_model as _bark_load_model
from huggingface_hub import hf_hub_download
from transformers import EncodecConfig, EncodecModel, set_seed
from transformers.models.bark.configuration_bark import (
BarkCoarseConfig,
BarkConfig,
BarkFineConfig,
BarkSemanticConfig,
)
from transformers.models.bark.generation_configuration_bark import (
BarkCoarseGenerationConfig,
BarkFineGenerationConfig,
BarkGenerationConfig,
BarkSemanticGenerationConfig,
)
from transformers.models.bark.modeling_bark import BarkCoarseModel, BarkFineModel, BarkModel, BarkSemanticModel
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
set_seed(770)
new_layer_name_dict = {
"c_attn": "att_proj",
"c_proj": "out_proj",
"c_fc": "in_proj",
"transformer.": "",
"h.": "layers.",
"ln_1": "layernorm_1",
"ln_2": "layernorm_2",
"ln_f": "layernorm_final",
"wpe": "position_embeds_layer",
"wte": "input_embeds_layer",
}
REMOTE_MODEL_PATHS = {
"text_small": {
"repo_id": "suno/bark",
"file_name": "text.pt",
},
"coarse_small": {
"repo_id": "suno/bark",
"file_name": "coarse.pt",
},
"fine_small": {
"repo_id": "suno/bark",
"file_name": "fine.pt",
},
"text": {
"repo_id": "suno/bark",
"file_name": "text_2.pt",
},
"coarse": {
"repo_id": "suno/bark",
"file_name": "coarse_2.pt",
},
"fine": {
"repo_id": "suno/bark",
"file_name": "fine_2.pt",
},
}
CUR_PATH = os.path.dirname(os.path.abspath(__file__))
default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0")
def _get_ckpt_path(model_type, use_small=False):
key = model_type
if use_small:
key += "_small"
return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"])
def _download(from_hf_path, file_name):
os.makedirs(CACHE_DIR, exist_ok=True)
hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR)
def _load_model(ckpt_path, device, use_small=False, model_type="text"):
if model_type == "text":
ModelClass = BarkSemanticModel
ConfigClass = BarkSemanticConfig
GenerationConfigClass = BarkSemanticGenerationConfig
elif model_type == "coarse":
ModelClass = BarkCoarseModel
ConfigClass = BarkCoarseConfig
GenerationConfigClass = BarkCoarseGenerationConfig
elif model_type == "fine":
ModelClass = BarkFineModel
ConfigClass = BarkFineConfig
GenerationConfigClass = BarkFineGenerationConfig
else:
raise NotImplementedError()
model_key = f"{model_type}_small" if use_small else model_type
model_info = REMOTE_MODEL_PATHS[model_key]
if not os.path.exists(ckpt_path):
logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
_download(model_info["repo_id"], model_info["file_name"])
checkpoint = torch.load(ckpt_path, map_location=device)
model_args = checkpoint["model_args"]
if "input_vocab_size" not in model_args:
model_args["input_vocab_size"] = model_args["vocab_size"]
model_args["output_vocab_size"] = model_args["vocab_size"]
del model_args["vocab_size"]
model_args["num_heads"] = model_args.pop("n_head")
model_args["hidden_size"] = model_args.pop("n_embd")
model_args["num_layers"] = model_args.pop("n_layer")
model_config = ConfigClass(**checkpoint["model_args"])
model = ModelClass(config=model_config)
model_generation_config = GenerationConfigClass()
model.generation_config = model_generation_config
state_dict = checkpoint["model"]
unwanted_prefix = "_orig_mod."
for k, v in list(state_dict.items()):
if k.startswith(unwanted_prefix):
new_k = k[len(unwanted_prefix):]
for old_layer_name in new_layer_name_dict:
new_k = new_k.replace(old_layer_name, new_layer_name_dict[old_layer_name])
state_dict[new_k] = state_dict.pop(k)
extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
extra_keys = {k for k in extra_keys if not k.endswith(".attn.bias")}
missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
missing_keys = {k for k in missing_keys if not k.endswith(".attn.bias")}
if len(extra_keys) != 0:
raise ValueError(f"extra keys found: {extra_keys}")
if len(missing_keys) != 0:
raise ValueError(f"missing keys: {missing_keys}")
model.load_state_dict(state_dict, strict=False)
n_params = model.num_parameters(exclude_embeddings=True)
val_loss = checkpoint["best_val_loss"].item()
logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
model.eval()
model.to(device)
del checkpoint, state_dict
return model
def load_model(pytorch_dump_folder_path, use_small=False, model_type="text"):
if model_type not in ("text", "coarse", "fine"):
raise NotImplementedError()
device = "cpu"
ckpt_path = _get_ckpt_path(model_type, use_small=use_small)
model = _load_model(ckpt_path, device, model_type=model_type, use_small=use_small)
bark_model = _bark_load_model(ckpt_path, "cpu", model_type=model_type, use_small=use_small)
if model_type == "text":
bark_model = bark_model["model"]
if model.num_parameters(exclude_embeddings=True) != bark_model.get_num_params():
raise ValueError("initial and new models don't have the same number of parameters")
batch_size = 5
sequence_length = 10
if model_type in ["text", "coarse"]:
vec = torch.randint(256, (batch_size, sequence_length), dtype=torch.int)
output_old_model = bark_model(vec)[0]
output_new_model_total = model(vec)
output_new_model = output_new_model_total.logits[:, [-1], :]
else:
prediction_codeboook_channel = 3
n_codes_total = 8
vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int)
output_new_model_total = model(prediction_codeboook_channel, vec)
output_old_model = bark_model(prediction_codeboook_channel, vec)
output_new_model = output_new_model_total.logits
if output_new_model.shape != output_old_model.shape:
raise ValueError("initial and new outputs don't have the same shape")
if (output_new_model - output_old_model).abs().max().item() > 1e-3:
raise ValueError("initial and new outputs are not equal")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
def load_whole_bark_model(
semantic_path,
coarse_path,
fine_path,
append_text,
hub_path,
folder_path,
):
pytorch_dump_folder_path = os.path.join(folder_path, append_text)
semanticConfig = BarkSemanticConfig.from_pretrained(os.path.join(semantic_path, "config.json"))
coarseAcousticConfig = BarkCoarseConfig.from_pretrained(os.path.join(coarse_path, "config.json"))
fineAcousticConfig = BarkFineConfig.from_pretrained(os.path.join(fine_path, "config.json"))
codecConfig = EncodecConfig.from_pretrained("facebook/encodec_24khz")
semantic = BarkSemanticModel.from_pretrained(semantic_path)
coarseAcoustic = BarkCoarseModel.from_pretrained(coarse_path)
fineAcoustic = BarkFineModel.from_pretrained(fine_path)
codec = EncodecModel.from_pretrained("facebook/encodec_24khz")
bark_config = BarkConfig.from_sub_model_configs(
semanticConfig, coarseAcousticConfig, fineAcousticConfig, codecConfig
)
bark_generation_config = BarkGenerationConfig.from_sub_model_configs(
semantic.generation_config, coarseAcoustic.generation_config, fineAcoustic.generation_config
)
bark = BarkModel(bark_config)
bark.semantic = semantic
bark.coarse_acoustics = coarseAcoustic
bark.fine_acoustics = fineAcoustic
bark.codec_model = codec
bark.generation_config = bark_generation_config
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
bark.save_pretrained(pytorch_dump_folder_path, repo_id=hub_path, push_to_hub=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("model_type", type=str, help="text, coarse or fine.")
parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--is_small", action="store_true", help="convert the small version instead of the large.")
args = parser.parse_args()
load_model(args.pytorch_dump_folder_path, model_type=args.model_type, use_small=args.is_small)
.\models\bark\generation_configuration_bark.py
""" BARK model generation configuration"""
import copy
from typing import Dict
from ...generation.configuration_utils import GenerationConfig
from ...utils import logging
logger = logging.get_logger(__name__)
class BarkSemanticGenerationConfig(GenerationConfig):
model_type = "semantic"
def __init__(
self,
eos_token_id=10_000,
renormalize_logits=True,
max_new_tokens=768,
output_scores=False,
return_dict_in_generate=False,
output_hidden_states=False,
output_attentions=False,
temperature=1.0,
do_sample=False,
text_encoding_offset=10_048,
text_pad_token=129_595,
semantic_infer_token=129_599,
semantic_vocab_size=10_000,
max_input_semantic_length=256,
semantic_rate_hz=49.9,
min_eos_p=None,
**kwargs,
):
super().__init__(
eos_token_id=eos_token_id,
renormalize_logits=renormalize_logits,
max_new_tokens=max_new_tokens,
output_scores=output_scores,
return_dict_in_generate=return_dict_in_generate,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
temperature=temperature,
do_sample=do_sample,
**kwargs,
)
self.text_encoding_offset = text_encoding_offset
self.text_pad_token = text_pad_token
self.semantic_infer_token = semantic_infer_token
self.semantic_vocab_size = semantic_vocab_size
self.max_input_semantic_length = max_input_semantic_length
self.semantic_rate_hz = semantic_rate_hz
self.min_eos_p = min_eos_p
class BarkCoarseGenerationConfig(GenerationConfig):
model_type = "coarse_acoustics"
def __init__(
self,
renormalize_logits=True,
output_scores=False,
return_dict_in_generate=False,
output_hidden_states=False,
output_attentions=False,
temperature=1.0,
do_sample=False,
coarse_semantic_pad_token=12_048,
coarse_rate_hz=75,
n_coarse_codebooks=2,
coarse_infer_token=12_050,
max_coarse_input_length=256,
max_coarse_history: int = 630,
sliding_window_len: int = 60,
**kwargs,
):
super().__init__(
renormalize_logits=renormalize_logits,
output_scores=output_scores,
return_dict_in_generate=return_dict_in_generate,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
temperature=temperature,
do_sample=do_sample,
**kwargs,
)
self.coarse_semantic_pad_token = coarse_semantic_pad_token
self.coarse_rate_hz = coarse_rate_hz
self.n_coarse_codebooks = n_coarse_codebooks
self.coarse_infer_token = coarse_infer_token
self.max_coarse_input_length = max_coarse_input_length
self.max_coarse_history = max_coarse_history
self.sliding_window_len = sliding_window_len
class BarkFineGenerationConfig(GenerationConfig):
model_type = "fine_acoustics"
def __init__(
self,
temperature=1.0,
max_fine_history_length=512,
max_fine_input_length=1024,
n_fine_codebooks=8,
**kwargs,
):
super().__init__(
temperature=temperature,
**kwargs,
)
self.max_fine_history_length = max_fine_history_length
self.max_fine_input_length = max_fine_input_length
self.n_fine_codebooks = n_fine_codebooks
):
"""
Class that holds a generation configuration for `BarkFineModel`.
`BarkFineModel` is an autoencoder model, so should not usually be used for generation. However, under the
hood, it uses `temperature` when used by `BarkModel`.
This configuration inherits from `GenerationConfig` and can be used to control the model generation. Read the
documentation from `GenerationConfig` for more information.
Args:
temperature (`float`, *optional*):
The value used to modulate the next token probabilities.
max_fine_history_length (`int`, *optional*, defaults to 512):
Max length of the fine history vector.
max_fine_input_length (`int`, *optional*, defaults to 1024):
Max length of fine input vector.
n_fine_codebooks (`int`, *optional*, defaults to 8):
Number of codebooks used.
"""
super().__init__(temperature=temperature)
self.max_fine_history_length = max_fine_history_length
self.max_fine_input_length = max_fine_input_length
self.n_fine_codebooks = n_fine_codebooks
def validate(self, **kwargs):
"""
Overrides GenerationConfig.validate because BarkFineGenerationConfig don't use any parameters outside
temperature.
"""
pass
class BarkGenerationConfig(GenerationConfig):
model_type = "bark"
is_composition = True
def __init__(
self,
semantic_config: Dict = None,
coarse_acoustics_config: Dict = None,
fine_acoustics_config: Dict = None,
):
r"""
Instantiate a [`BarkGenerationConfig`] (or a derived class) from bark sub-models generation configuration.
Returns:
[`BarkGenerationConfig`]: An instance of a configuration object
"""
return cls(
semantic_config=semantic_config.to_dict(),
coarse_acoustics_config=coarse_acoustics_config.to_dict(),
fine_acoustics_config=fine_acoustics_config.to_dict(),
**kwargs,
)
def to_dict(self):
"""
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
Returns:
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
"""
output = copy.deepcopy(self.__dict__)
output["semantic_config"] = self.semantic_config.to_dict()
output["coarse_acoustics_config"] = self.coarse_acoustics_config.to_dict()
output["fine_acoustics_config"] = self.fine_acoustics_config.to_dict()
output["model_type"] = self.__class__.model_type
return output
.\models\bark\modeling_bark.py
class BarkSelfAttention(nn.Module):
def __init__(self, config, is_causal=False):
super().__init__()
self.dropout = config.dropout
self.attn_dropout = nn.Dropout(config.dropout)
self.resid_dropout = nn.Dropout(config.dropout)
self.embed_dim = config.hidden_size
self.num_heads = config.num_heads
self.head_dim = self.embed_dim // self.num_heads
if config.hidden_size % config.num_heads != 0:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.att_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.bias)
self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.bias)
self.is_causal = is_causal
if is_causal:
block_size = config.block_size
bias = torch.tril(torch.ones((block_size, block_size), dtype=bool)).view(1, 1, block_size, block_size)
self.register_buffer("bias", bias)
def _split_heads(self, tensor, num_heads, attn_head_size):
"""
Splits hidden_size dim into attn_head_size and num_heads
"""
new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
tensor = tensor.view(new_shape)
return tensor.permute(0, 2, 1, 3)
def _merge_heads(self, tensor, num_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden_size
"""
tensor = tensor.transpose(1, 2).contiguous()
tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,))
return tensor
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
attn_weights = torch.matmul(query, key.transpose(-1, -2)) * (1.0 / math.sqrt(self.head_dim))
if self.is_causal:
query_length, key_length = query.size(-2), key.size(-2)
attn_weights = attn_weights.masked_fill(
self.bias[:, :, key_length - query_length : key_length, :key_length] == 0,
torch.finfo(attn_weights.dtype).min,
)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
attn_weights = attn_weights.to(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
def forward(
self,
hidden_states,
attention_mask=None,
past_key_values=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2)
query = self._split_heads(query, self.num_heads, self.head_dim)
key = self._split_heads(key, self.num_heads, self.head_dim)
value = self._split_heads(value, self.num_heads, self.head_dim)
if past_key_values is not None:
past_key = past_key_values[0]
past_value = past_key_values[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
if use_cache is True:
present = (key, value)
else:
present = None
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
attn_output = self.out_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs
class BarkSelfFlashAttention2(BarkSelfAttention):
"""
Bark flash attention module. This module inherits from `BarkSelfAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def _split_heads(self, tensor, num_heads, attn_head_size):
"""
Splits hidden_size dim into attn_head_size and num_heads
"""
new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
tensor = tensor.view(new_shape)
return tensor
def _merge_heads(self, tensor, num_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden_size
"""
tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,))
return tensor
def forward(
self,
hidden_states,
attention_mask=None,
past_key_values=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
batch_size, query_len, _ = hidden_states.size()
query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2)
query = self._split_heads(query, self.num_heads, self.head_dim)
key = self._split_heads(key, self.num_heads, self.head_dim)
value = self._split_heads(value, self.num_heads, self.head_dim)
if past_key_values is not None:
past_key = past_key_values[0].transpose(1, 2)
past_value = past_key_values[1].transpose(1, 2)
key = torch.cat((past_key, key), dim=1)
value = torch.cat((past_value, value), dim=1)
if use_cache is True:
present = (key.transpose(1, 2), value.transpose(1, 2))
else:
present = None
attn_output = self._flash_attention_forward(query, key, value, attention_mask, query_len, dropout=self.dropout)
attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
attn_output = self.out_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
attn_weights = None
outputs += (attn_weights,)
return outputs
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
BARK_ATTENTION_CLASSES = {
"eager": BarkSelfAttention,
"flash_attention_2": BarkSelfFlashAttention2,
}
class BarkLayerNorm(nn.Module):
"""LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False."""
def __init__(self, hidden_size, bias=True):
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size)) if bias else None
def forward(self, input):
return F.layer_norm(input, self.weight.shape, self.weight, self.bias, eps=1e-5)
class BarkMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.in_proj = nn.Linear(config.hidden_size, 4 * config.hidden_size, bias=config.bias)
self.out_proj = nn.Linear(4 * config.hidden_size, config.hidden_size, bias=config.bias)
self.dropout = nn.Dropout(config.dropout)
self.gelu = nn.GELU()
def forward(self, hidden_states):
hidden_states = self.in_proj(hidden_states)
hidden_states = self.gelu(hidden_states)
hidden_states = self.out_proj(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class BarkBlock(nn.Module):
def __init__(self, config, is_causal=False):
super().__init__()
if is_causal:
self.layernorm_1 = BarkLayerNorm(config.hidden_size, bias=config.bias)
self.layernorm_2 = BarkLayerNorm(config.hidden_size, bias=config.bias)
else:
self.layernorm_1 = nn.LayerNorm(config.hidden_size)
self.layernorm_2 = nn.LayerNorm(config.hidden_size)
self.attn = BARK_ATTENTION_CLASSES[config._attn_implementation](config, is_causal=is_causal)
self.mlp = BarkMLP(config)
def forward(
self,
hidden_states,
past_key_values=None,
attention_mask=None,
head_mask=None,
use_cache=False,
output_attentions=False,
):
):
intermediary_hidden_states = self.layernorm_1(hidden_states)
attn_outputs = self.attn(
intermediary_hidden_states,
past_key_values=past_key_values,
attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
intermediary_hidden_states = hidden_states + attn_output
intermediary_hidden_states = intermediary_hidden_states + self.mlp(
self.layernorm_2(intermediary_hidden_states)
)
if use_cache:
outputs = (intermediary_hidden_states,) + outputs
else:
outputs = (intermediary_hidden_states,) + outputs[1:]
return outputs
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
class BarkPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BarkConfig
supports_gradient_checkpointing = False
_supports_flash_attn_2 = True
def _init_weights(self, module):
"""Initialize the weights of the module based on its type."""
if isinstance(module, (nn.Linear,)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
@property
def device(self) -> torch.device:
"""
`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
device).
"""
if not hasattr(self, "_hf_hook"):
return get_parameter_device(self)
for module in self.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return get_parameter_device(self)
"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`{config}`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BARK_MODEL_START_DOCSTRING = BARK_MODEL_START_DOCSTRING.strip()
"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
"""
BARK_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
"""
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`BarkConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BARK_FINE_INPUTS_DOCSTRING = r"""
Args:
codebook_idx (`int`):
Index of the codebook that will be predicted.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, number_of_codebooks)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it. Initially, indices of the first two codebooks are obtained from the `coarse` sub-model. The rest is
predicted recursively by attending the previously predicted channels. The model predicts on windows of
length 1024.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): NOT IMPLEMENTED YET.
input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
`past_key_values` is used, optionally only the last `input_embeds` have to be input (see
`past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
associated vectors than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BARK_CAUSAL_MODEL_INPUTS_DOCSTRING = r"""
"""
# GPT2-like autoregressive model
class BarkCausalModel(BarkPreTrainedModel):
# 使用BarkSubModelConfig类来配置模型
config_class = BarkSubModelConfig
# 初始化方法,接受一个配置参数 config
def __init__(self, config):
# 调用父类的初始化方法,传入配置参数 config
super().__init__(config)
# 将配置参数 config 保存在实例变量中
self.config = config
# 初始化输入词嵌入层,根据输入词汇大小和隐藏层大小创建 Embedding 层
self.input_embeds_layer = nn.Embedding(config.input_vocab_size, config.hidden_size)
# 初始化位置嵌入层,根据块大小和隐藏层大小创建 Embedding 层
self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)
# 初始化 Dropout 层,使用配置中的 dropout 比率
self.drop = nn.Dropout(config.dropout)
# 使用列表推导式创建多层 BarkBlock 模块组成的模块列表,每层使用相同的配置和是因果的标志
self.layers = nn.ModuleList([BarkBlock(config, is_causal=True) for _ in range(config.num_layers)])
# 根据配置中的 _attn_implementation 判断是否使用 Flash Attention 2
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
# 初始化最终的 LayerNorm 层,使用隐藏层大小和偏置配置进行初始化
self.layernorm_final = BarkLayerNorm(config.hidden_size, bias=config.bias)
# 初始化语言模型的线性层,将隐藏层映射到输出词汇大小的空间,没有偏置
self.lm_head = nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
# 禁用渐变检查点,用于后续的优化和训练过程
self.gradient_checkpointing = False
# 执行后期初始化,可能包括权重初始化和其他配置
self.post_init()
# 返回输入词嵌入层
def get_input_embeddings(self):
return self.input_embeds_layer
# 设置新的输入词嵌入层
def set_input_embeddings(self, new_embeddings):
self.input_embeds_layer = new_embeddings
# 准备用于生成的输入,根据传入的参数进行处理
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
# 从 kwargs 中获取输入的嵌入向量(如果有)
input_embeds = kwargs.get("input_embeds", None)
# 从 kwargs 中获取注意力遮罩(如果有)
attention_mask = kwargs.get("attention_mask", None)
# 从 kwargs 中获取位置编码(如果有)
position_ids = kwargs.get("position_ids", None)
# 如果 past_key_values 不为 None,则执行以下操作
if past_key_values is not None:
# 忽略已经被 past_key_values 覆盖的 token
seq_len = input_ids.shape[1] # 获取输入序列的长度
past_length = past_key_values[0][0].shape[2] # 获取过去键值的长度
# 如果输入序列的长度大于过去键值的长度,则移除前缀长度为 past_length
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# 否则,默认行为:仅保留最后一个 token
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:] # 更新 input_ids 为移除前缀后的序列
# input_embeds 已经被使用过,不再需要
input_embeds = None
else:
# 如果 past_key_values 为 None,则执行以下操作
if input_embeds is not None and kwargs.get("use_cache"):
seq_len = input_embeds.shape[1] # 获取嵌入向量的序列长度
else:
seq_len = input_ids.shape[1] # 获取输入序列的长度
# 确保 attention_mask 和 position_ids 的形状与奇怪的 Bark hack 对序列长度的减少对齐
if attention_mask is not None:
attention_mask = attention_mask[:, :seq_len] # 调整 attention_mask 的长度为 seq_len
if position_ids is not None:
position_ids = position_ids[:, :seq_len] # 调整 position_ids 的长度为 seq_len
# 如果 attention_mask 存在且 position_ids 不存在,则执行以下操作
if attention_mask is not None and position_ids is None:
# 在批次生成时动态创建 position_ids
position_ids = attention_mask.long().cumsum(-1) - 1 # 根据 attention_mask 创建 position_ids
position_ids.masked_fill_(attention_mask == 0, 1) # 将 position_ids 中 attention_mask 为 0 的位置填充为 1
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :] # 如果 past_key_values 存在,截取最后 input_ids 长度的 position_ids
else:
position_ids = None # 否则置为 None
# 如果 input_embeds 存在且 use_cache 为 True,则返回以下结果字典
if input_embeds is not None and kwargs.get("use_cache"):
return {
"input_ids": None,
"input_embeds": input_embeds,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
}
# 否则,返回以下结果字典
return {
"input_ids": input_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
}
# 将 Bark_causal_model 的输入文档字符串添加到模型前向方法中
@add_start_docstrings_to_model_forward(BARK_CAUSAL_MODEL_INPUTS_DOCSTRING)
# 定义一个类方法 `forward`,用于模型的前向传播。
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的token IDs张量,可选
past_key_values: Optional[Tuple[torch.FloatTensor]] = None, # 用于存储过去的键值,可选
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码张量,可选
position_ids: Optional[torch.Tensor] = None, # 位置ID张量,可选
head_mask: Optional[torch.Tensor] = None, # 头部掩码张量,可选
labels: Optional[torch.LongTensor] = None, # 标签张量,可选
input_embeds: Optional[torch.Tensor] = None, # 输入的嵌入张量,可选
use_cache: Optional[bool] = None, # 是否使用缓存,可选
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选
return_dict: Optional[bool] = None, # 是否返回字典格式的结果,可选
):
# 静态方法 `_reorder_cache`,用于在beam搜索或采样时重新排序 `past_key_values` 缓存,
# 以确保在每个生成步骤中与正确的beam_idx匹配。
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], # 包含过去键值的元组,必须为张量
beam_idx: torch.Tensor # beam索引张量,指定重新排序顺序
) -> Tuple[Tuple[torch.Tensor]]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
# 对于每个层的过去状态,使用 `beam_idx` 在设备上选择正确的过去状态,返回重新排序后的元组
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
# 定义 BarkSemanticModel 类,继承自 BarkCausalModel 类
@add_start_docstrings(
"""Bark semantic (or text) model. It shares the same architecture as the coarse model.
It is a GPT-2 like autoregressive model with a language modeling head on top.""",
BARK_MODEL_START_DOCSTRING.format(config="BarkSemanticConfig"),
)
class BarkSemanticModel(BarkCausalModel):
# 指定模型的前缀名称为 'semantic'
base_model_prefix = "semantic"
# 指定配置类为 BarkSemanticConfig
config_class = BarkSemanticConfig
# 定义生成方法
def generate(
self,
input_ids: torch.Tensor,
semantic_generation_config: BarkSemanticGenerationConfig = None,
history_prompt: Optional[Dict[str, torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
**kwargs,
# 定义 BarkCoarseModel 类,继承自 BarkCausalModel 类
@add_start_docstrings(
"""Bark coarse acoustics model.
It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a
language modeling head on top.""",
BARK_MODEL_START_DOCSTRING.format(config="BarkCoarseConfig"),
)
class BarkCoarseModel(BarkCausalModel):
# 指定模型的前缀名称为 'coarse_acoustics'
base_model_prefix = "coarse_acoustics"
# 指定配置类为 BarkCoarseConfig
config_class = BarkCoarseConfig
# 定义预处理历史数据的方法
def preprocess_histories(
self,
max_coarse_history: int,
semantic_to_coarse_ratio: int,
batch_size: int,
semantic_generation_config: int,
codebook_size: int,
history_prompt: Optional[Dict[str, torch.Tensor]] = None,
# 定义生成方法
def generate(
self,
semantic_output: torch.Tensor,
semantic_generation_config: BarkSemanticGenerationConfig = None,
coarse_generation_config: BarkCoarseGenerationConfig = None,
codebook_size: int = 1024,
history_prompt: Optional[Dict[str, torch.Tensor]] = None,
return_output_lengths: Optional[bool] = None,
**kwargs,
# 定义 BarkFineModel 类,继承自 BarkPreTrainedModel 类
@add_start_docstrings(
"""Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
language modeling heads, one for each codebook.""",
BARK_MODEL_START_DOCSTRING.format(config="BarkFineConfig"),
)
class BarkFineModel(BarkPreTrainedModel):
# 指定模型的前缀名称为 'fine_acoustics'
base_model_prefix = "fine_acoustics"
# 指定配置类为 BarkFineConfig
config_class = BarkFineConfig
# 主输入名称为 'codebook_idx'
main_input_name = "codebook_idx"
def __init__(self, config):
# non-causal gpt-like model with one embedding layer and one lm_head for each codebook of Encodec
# 使用给定的配置初始化模型
super().__init__(config)
self.config = config
# initialize a modified non causal GPT-like model
# note that for there is one embedding layer and one lm_head for each codebook of Encodec
# 初始化修改后的非因果关系的类似GPT的模型
# 每个Encodec编码书中都有一个嵌入层和一个lm_head
self.input_embeds_layers = nn.ModuleList(
[nn.Embedding(config.input_vocab_size, config.hidden_size) for _ in range(config.n_codes_total)]
)
# 初始化输入嵌入层列表,每个编码书一个嵌入层
self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)
# 初始化位置嵌入层,用于位置编码
self.drop = nn.Dropout(config.dropout)
# 初始化Dropout层,用于随机丢弃输入的一部分特征
self.layers = nn.ModuleList([BarkBlock(config, is_causal=False) for _ in range(config.num_layers)])
# 初始化模型的层列表,每层使用BarkBlock,is_causal参数为False表示非因果关系
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
# 根据配置决定是否使用Flash Attention版本2
self.layernorm_final = nn.LayerNorm(config.hidden_size)
# 初始化最终的Layer Norm层,对模型的隐藏状态进行归一化
self.lm_heads = nn.ModuleList(
[
nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
for _ in range(config.n_codes_given, config.n_codes_total)
]
)
# 初始化语言模型头列表,每个编码书一个lm_head
self.gradient_checkpointing = False
# 梯度检查点默认关闭
self.n_codes_total = config.n_codes_total
# 记录总编码书的数量
# Initialize weights and apply final processing
# 初始化权重并应用最终处理
self.post_init()
def get_input_embeddings(self):
# one embedding layers for each codebook
# 返回每个编码书的输入嵌入层列表
return self.input_embeds_layers
def set_input_embeddings(self, new_embeddings):
# one embedding layers for each codebook
# 设置每个编码书的输入嵌入层列表
self.input_embeds_layers = new_embeddings
def get_output_embeddings(self):
# one lm_head for each codebook
# 返回每个编码书的语言模型头列表
return self.lm_heads
def set_output_embeddings(self, new_output_embeddings):
# one lm_head for each codebook
# 设置每个编码书的语言模型头列表
self.lm_heads = new_output_embeddings
def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
old_embeddings_list = self.get_input_embeddings()
# 获取当前的输入嵌入层列表
new_embeddings_list = nn.ModuleList(
[
self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
for old_embeddings in old_embeddings_list
]
)
# 生成新的调整大小后的输入嵌入层列表
self.set_input_embeddings(new_embeddings_list)
# 设置模型的新输入嵌入层列表
new_num_tokens = new_embeddings_list[0].weight.shape[0]
# 更新新的嵌入层中的标记数量
# if word embeddings are not tied, make sure that lm head is resized as well
# 如果单词嵌入未绑定,则确保也调整lm头的大小
if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
old_lm_head_list = self.get_output_embeddings()
# 获取当前的语言模型头列表
new_lm_head_list = nn.ModuleList(
[self._get_resized_lm_head(old_lm_head, new_num_tokens) for old_lm_head in old_lm_head_list]
)
# 生成新的调整大小后的语言模型头列表
self.set_output_embeddings(new_lm_head_list)
# 设置模型的新语言模型头列表
return self.get_input_embeddings()
def resize_token_embeddings(
self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
) -> nn.Embedding:
"""
Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
Arguments:
new_num_tokens (`int`, *optional*):
The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
pad_to_multiple_of (`int`, *optional*):
If set will pad the embedding matrix to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
details about this, or help on choosing the correct value for resizing, refer to this guide:
https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
Return:
`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
"""
# 调用内部方法调整模型的词嵌入大小,并获取返回的词嵌入模块
model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
# 如果未指定新的词汇量大小和填充到的倍数,直接返回原始的词嵌入模块
if new_num_tokens is None and pad_to_multiple_of is None:
return model_embeds
# 更新基础模型和当前模型配置的词汇量大小
self.config.output_vocab_size = model_embeds[0].weight.shape[0]
self.config.vocab_size = model_embeds[0].weight.shape[0]
# 更新当前对象的输出词汇量大小和词汇量大小
self.output_vocab_size = model_embeds[0].weight.shape[0]
self.vocab_size = model_embeds[0].weight.shape[0]
# 如果需要,重新绑定权重
self.tie_weights()
# 返回调整后的词嵌入模块
return model_embeds
# 将输入嵌入列表和输出嵌入列表之间的权重进行绑定或克隆。
if getattr(self.config, "tie_word_embeddings", True):
# 如果配置中设置了torchscript标志,则无法处理参数共享,因此我们克隆权重而不是绑定。
self._tied_weights_keys = [] # 初始化存储绑定权重的键列表
output_embeddings = self.get_output_embeddings() # 获取输出嵌入层对象
input_embeddings = self.get_input_embeddings() # 获取输入嵌入层对象
for i in range(self.config.n_codes_total - self.config.n_codes_given):
# 将输出嵌入层i的权重绑定到输入嵌入层i+1的权重上
self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1])
# 记录已绑定权重的键名,格式为"lm_heads.{i}.weight"
self._tied_weights_keys.append(f"lm_heads.{i}.weight")
# 递归地对模型的所有子模块调用 _tie_weights 方法,如果模块有该方法的话
for module in self.modules():
if hasattr(module, "_tie_weights"):
module._tie_weights()
# 生成模型的前向传播方法,接受一系列输入和可选的配置参数作为输入。
@add_start_docstrings_to_model_forward(BARK_FINE_INPUTS_DOCSTRING)
def forward(
self,
codebook_idx: int, # 用于预测的代码本身的附加索引
input_ids: Optional[torch.Tensor] = None, # 输入的token id张量
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码张量
position_ids: Optional[torch.Tensor] = None, # 位置id张量
head_mask: Optional[torch.Tensor] = None, # 头部掩码张量
labels: Optional[torch.LongTensor] = None, # 标签张量(用于监督学习)
input_embeds: Optional[torch.Tensor] = None, # 输入嵌入张量(可以替代input_ids)
output_attentions: Optional[bool] = None, # 是否输出注意力权重
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态
return_dict: Optional[bool] = None, # 是否以字典形式返回结果
):
# 生成模型的生成方法,接受粗粒度输出、语义生成配置、粗粒度生成配置、细粒度生成配置、代码本尺寸等参数以及历史提示的可选字典输入。
def generate(
self,
coarse_output: torch.Tensor, # 粗粒度输出的张量
semantic_generation_config: BarkSemanticGenerationConfig = None, # 语义生成配置对象
coarse_generation_config: BarkCoarseGenerationConfig = None, # 粗粒度生成配置对象
fine_generation_config: BarkFineGenerationConfig = None, # 细粒度生成配置对象
codebook_size: int = 1024, # 代码本尺寸,默认为1024
history_prompt: Optional[Dict[str, torch.Tensor]] = None, # 历史提示的可选字典输入
**kwargs, # 其他关键字参数
):
"""
The full Bark model, a text-to-speech model composed of 4 sub-models:
- `BarkSemanticModel` (also referred to as the 'text' model): a causal auto-regressive transformer model that
takes as input tokenized text, and predicts semantic text tokens that capture the meaning of the text.
- `BarkCoarseModel` (also referred to as the 'coarse acoustics' model), also a causal autoregressive transformer,
that takes as input the results of the last model. It aims at regressing the first two audio codebooks necessary
to `encodec`.
- `BarkFineModel` (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
predicts the last codebooks based on the sum of the previous codebooks embeddings.
- After having predicted all the codebook channels from the `EncodecModel`, Bark uses it to decode the output audio
array.
It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
output sound according to a specific predefined voice.
"""
@add_start_docstrings(
"""
The full Bark model, a text-to-speech model composed of 4 sub-models:
- [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that
takes
as input tokenized text, and predicts semantic text tokens that capture the meaning of the text.
- [`BarkCoarseModel`] (also refered to as the 'coarse acoustics' model), also a causal autoregressive transformer,
that takes into input the results of the last model. It aims at regressing the first two audio codebooks necessary
to `encodec`.
- [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
predicts the last codebooks based on the sum of the previous codebooks embeddings.
- having predicted all the codebook channels from the [`EncodecModel`], Bark uses it to decode the output audio
array.
It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
output sound according to specific predefined voice.
""",
BARK_START_DOCSTRING,
)
class BarkModel(BarkPreTrainedModel):
config_class = BarkConfig
def __init__(self, config):
super().__init__(config)
# Initialize the BarkSemanticModel with the provided semantic configuration
self.semantic = BarkSemanticModel(config.semantic_config)
# Initialize the BarkCoarseModel with the provided coarse acoustics configuration
self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
# Initialize the BarkFineModel with the provided fine acoustics configuration
self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
# Initialize the codec_model using AutoModel and the provided codec configuration
self.codec_model = AutoModel.from_config(config.codec_config)
# Store the provided configuration for later reference
self.config = config
@property
def device(self) -> torch.device:
"""
`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
device).
"""
# Determine the device on which the BarkModel resides
# Check if semantic model has the _hf_hook attribute indicating it has been offloaded
if not hasattr(self.semantic, "_hf_hook"):
return get_parameter_device(self) # Return the device of the current module
# If semantic model has _hf_hook, find the execution device from the hook
for module in self.semantic.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device) # Return the execution device found
def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
r"""
Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains in GPU until
the next sub-model runs.
Args:
gpu_id (`int`, *optional*, defaults to 0):
GPU id on which the sub-models will be loaded and offloaded.
"""
if is_accelerate_available():
from accelerate import cpu_offload_with_hook # Importing the function for offloading to CPU
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")
device = torch.device(f"cuda:{gpu_id}") # Define the CUDA device based on given GPU id
if self.device.type != "cpu":
self.to("cpu") # Move the entire model to CPU
torch.cuda.empty_cache() # Clear GPU cache to free up memory
# Offload the input embedding layer to CPU and receive a hook for later layers
self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
hook = None
# Offload each sub-model to CPU sequentially and chain hooks between them
for cpu_offloaded_model in [
self.semantic,
self.coarse_acoustics,
self.fine_acoustics,
]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
self.fine_acoustics_hook = hook # Store the final hook after offloading fine_acoustics
_, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)
self.codec_model_hook = hook # Store the hook after offloading the codec_model
# We'll offload the last model manually.
self.codec_model_hook = hook
def codec_decode(self, fine_output, output_lengths=None):
"""Turn quantized audio codes into audio array using encodec."""
fine_output = fine_output.transpose(0, 1)
emb = self.codec_model.quantizer.decode(fine_output)
if output_lengths is not None:
out = [sample[:, :l].unsqueeze(0) for (sample, l) in zip(emb, output_lengths)]
audio_arr = [self.codec_model.decoder(sample).squeeze() for sample in out]
else:
out = self.codec_model.decoder(emb)
audio_arr = out.squeeze(1)
return audio_arr
@torch.no_grad()
def generate(
self,
input_ids: Optional[torch.Tensor] = None,
history_prompt: Optional[Dict[str, torch.Tensor]] = None,
return_output_lengths: Optional[bool] = None,
**kwargs,
):
"""Method for generating outputs based on input_ids and optional prompts."""
pass
@classmethod
def _check_and_enable_flash_attn_2(
cls,
config,
torch_dtype: Optional[torch.dtype] = None,
device_map: Optional[Union[str, Dict[str, int]]] = None,
hard_check_only: bool = False,
check_device_map: bool = False,
):
"""Check and potentially enable flash attention mechanism."""
pass
"""
`_check_and_enable_flash_attn_2`原本不扩展闪存注意力使能到模型的子配置。我们重写原始方法以确保Bark子模型在需要时使用Flash Attention。
如果你不了解Flash Attention,请查看官方的Flash Attention存储库:
https://github.com/Dao-AILab/flash-attention
若要直接通过`BetterTransformer` API使用Flash Attention 1.0,请查看文档的特定部分以了解更多信息:
https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models
该方法检查当前设置是否与Flash Attention兼容,因为它要求模型处于半精度并且不在CPU上运行。
如果所有检查都通过且`hard_check_only`为False,则该方法将把配置属性`_attn_implementation`设置为"flash_attention_2",以便模型可以初始化正确的注意力模块。
"""
config = super()._check_and_enable_flash_attn_2(
config, torch_dtype, device_map, hard_check_only=hard_check_only, check_device_map=check_device_map
)
config.semantic_config._attn_implementation = config._attn_implementation
config.coarse_acoustics_config._attn_implementation = config._attn_implementation
config.fine_acoustics_config._attn_implementation = config._attn_implementation
return config
.\models\bark\processing_bark.py
"""
Processor class for Bark
"""
import json
import os
from typing import Optional
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...processing_utils import ProcessorMixin
from ...utils import logging
from ...utils.hub import get_file_from_repo
from ..auto import AutoTokenizer
logger = logging.get_logger(__name__)
class BarkProcessor(ProcessorMixin):
r"""
Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor.
Args:
tokenizer ([`PreTrainedTokenizer`]):
An instance of [`PreTrainedTokenizer`].
speaker_embeddings (`Dict[Dict[str]]`, *optional*):
Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g
`"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`
embeddings. The values correspond to the path of the corresponding `np.ndarray`. See
[here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for
a list of `voice_preset_names`.
"""
tokenizer_class = "AutoTokenizer"
attributes = ["tokenizer"]
preset_shape = {
"semantic_prompt": 1,
"coarse_prompt": 2,
"fine_prompt": 2,
}
def __init__(self, tokenizer, speaker_embeddings=None):
super().__init__(tokenizer)
self.speaker_embeddings = speaker_embeddings
@classmethod
def from_pretrained(
cls, pretrained_processor_name_or_path, speaker_embeddings_dict_path="speaker_embeddings_path.json", **kwargs
):
):
r"""
Instantiate a Bark processor associated with a pretrained model.
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
huggingface.co.
- a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
method, e.g., `./my_model_directory/`.
speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
The name of the `.json` file containing the speaker_embeddings dictionary located in
`pretrained_model_name_or_path`. If `None`, no speaker_embeddings is loaded.
**kwargs
Additional keyword arguments passed along to both
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
"""
if speaker_embeddings_dict_path is not None:
speaker_embeddings_path = get_file_from_repo(
pretrained_processor_name_or_path,
speaker_embeddings_dict_path,
subfolder=kwargs.pop("subfolder", None),
cache_dir=kwargs.pop("cache_dir", None),
force_download=kwargs.pop("force_download", False),
proxies=kwargs.pop("proxies", None),
resume_download=kwargs.pop("resume_download", False),
local_files_only=kwargs.pop("local_files_only", False),
token=kwargs.pop("use_auth_token", None),
revision=kwargs.pop("revision", None),
)
if speaker_embeddings_path is None:
logger.warning(
f"""`{os.path.join(pretrained_processor_name_or_path,speaker_embeddings_dict_path)}` does not exists
, no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json
dictionnary if wanted, otherwise set `speaker_embeddings_dict_path=None`."""
)
speaker_embeddings = None
else:
with open(speaker_embeddings_path) as speaker_embeddings_json:
speaker_embeddings = json.load(speaker_embeddings_json)
else:
speaker_embeddings = None
tokenizer = AutoTokenizer.from_pretrained(pretrained_processor_name_or_path, **kwargs)
return cls(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings)
def save_pretrained(
self,
save_directory,
speaker_embeddings_dict_path="speaker_embeddings_path.json",
speaker_embeddings_directory="speaker_embeddings",
push_to_hub: bool = False,
**kwargs,
"""
Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded
using the [`~BarkProcessor.from_pretrained`] method.
Args:
save_directory (`str` or `os.PathLike`):
Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created
if it does not exist).
speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
The name of the `.json` file that will contains the speaker_embeddings nested path dictionary, if it
exists, and that will be located in `pretrained_model_name_or_path/speaker_embeddings_directory`.
speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings/"`):
The name of the folder in which the speaker_embeddings arrays will be saved.
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace).
kwargs:
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
"""
if self.speaker_embeddings is not None:
os.makedirs(os.path.join(save_directory, speaker_embeddings_directory, "v2"), exist_ok=True)
embeddings_dict = {}
embeddings_dict["repo_or_path"] = save_directory
for prompt_key in self.speaker_embeddings:
if prompt_key != "repo_or_path":
voice_preset = self._load_voice_preset(prompt_key)
tmp_dict = {}
for key in self.speaker_embeddings[prompt_key]:
np.save(
os.path.join(
embeddings_dict["repo_or_path"], speaker_embeddings_directory, f"{prompt_key}_{key}"
),
voice_preset[key],
allow_pickle=False,
)
tmp_dict[key] = os.path.join(speaker_embeddings_directory, f"{prompt_key}_{key}.npy")
embeddings_dict[prompt_key] = tmp_dict
with open(os.path.join(save_directory, speaker_embeddings_dict_path), "w") as fp:
json.dump(embeddings_dict, fp)
super().save_pretrained(save_directory, push_to_hub, **kwargs)
def _load_voice_preset(self, voice_preset: str = None, **kwargs):
voice_preset_paths = self.speaker_embeddings[voice_preset]
voice_preset_dict = {}
for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]:
if key not in voice_preset_paths:
raise ValueError(
f"Voice preset unrecognized, missing {key} as a key in self.speaker_embeddings[{voice_preset}]."
)
path = get_file_from_repo(
self.speaker_embeddings.get("repo_or_path", "/"),
voice_preset_paths[key],
subfolder=kwargs.pop("subfolder", None),
cache_dir=kwargs.pop("cache_dir", None),
force_download=kwargs.pop("force_download", False),
proxies=kwargs.pop("proxies", None),
resume_download=kwargs.pop("resume_download", False),
local_files_only=kwargs.pop("local_files_only", False),
token=kwargs.pop("use_auth_token", None),
revision=kwargs.pop("revision", None),
)
if path is None:
raise ValueError(
f"""`{os.path.join(self.speaker_embeddings.get("repo_or_path", "/"), voice_preset_paths[key])}` does not exists
, no preloaded voice preset will be used - Make sure to provide correct paths to the {voice_preset}
embeddings."""
)
voice_preset_dict[key] = np.load(path)
return voice_preset_dict
def _validate_voice_preset_dict(self, voice_preset: Optional[dict] = None):
for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]:
if key not in voice_preset:
raise ValueError(f"Voice preset unrecognized, missing {key} as a key.")
if not isinstance(voice_preset[key], np.ndarray):
raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.")
if len(voice_preset[key].shape) != self.preset_shape[key]:
raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.")
def __call__(
self,
text=None,
voice_preset=None,
return_tensors="pt",
max_length=256,
add_special_tokens=False,
return_attention_mask=True,
return_token_type_ids=False,
**kwargs,
.\models\bark\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_bark": [
"BARK_PRETRAINED_CONFIG_ARCHIVE_MAP",
"BarkCoarseConfig",
"BarkConfig",
"BarkFineConfig",
"BarkSemanticConfig",
],
"processing_bark": ["BarkProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_bark"] = [
"BARK_PRETRAINED_MODEL_ARCHIVE_LIST",
"BarkFineModel",
"BarkSemanticModel",
"BarkCoarseModel",
"BarkModel",
"BarkPreTrainedModel",
"BarkCausalModel",
]
if TYPE_CHECKING:
from .configuration_bark import (
BARK_PRETRAINED_CONFIG_ARCHIVE_MAP,
BarkCoarseConfig,
BarkConfig,
BarkFineConfig,
BarkSemanticConfig,
)
from .processing_bark import BarkProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_bark import (
BARK_PRETRAINED_MODEL_ARCHIVE_LIST,
BarkCausalModel,
BarkCoarseModel,
BarkFineModel,
BarkModel,
BarkPreTrainedModel,
BarkSemanticModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\bart\configuration_bart.py
import warnings
from collections import OrderedDict
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
from ...onnx.utils import compute_effective_axis_dimension
from ...utils import TensorType, is_torch_available, logging
logger = logging.get_logger(__name__)
BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json",
}
class BartConfig(PretrainedConfig):
r"""
这是用于存储 [`BartModel`] 配置的类。它用于根据指定的参数实例化 BART 模型,定义模型架构。
使用默认参数实例化配置将得到类似于 BART [facebook/bart-large](https://huggingface.co/facebook/bart-large) 架构的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。更多信息请阅读 [`PretrainedConfig`] 的文档。
Example:
```
>>> from transformers import BartConfig, BartModel
>>> # 初始化一个 BART facebook/bart-large 风格的配置
>>> configuration = BartConfig()
>>> # 使用该配置初始化一个模型(带有随机权重)
>>> model = BartModel(configuration)
>>> # 访问模型配置
>>> configuration = model.config
```
"""
model_type = "bart"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=50265,
max_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=12,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
activation_function="gelu",
d_model=1024,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
classifier_dropout=0.0,
scale_embedding=False,
use_cache=True,
num_labels=3,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
is_encoder_decoder=True,
decoder_start_token_id=2,
forced_eos_token_id=2,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
super().__init__(
num_labels=num_labels,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
self.forced_bos_token_id = self.bos_token_id
warnings.warn(
f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
"The config can simply be saved and uploaded again to be fixed."
)
class BartOnnxConfig(OnnxSeq2SeqConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["default", "seq2seq-lm"]:
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
if self.use_past:
common_inputs["decoder_input_ids"] = {0: "batch"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
else:
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
elif self.task == "causal-lm":
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
if self.use_past:
num_encoder_layers, _ = self.num_layers
for i in range(num_encoder_layers):
common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
else:
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
]
)
return common_inputs
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["default", "seq2seq-lm"]:
common_outputs = super().outputs
else:
common_outputs = super(OnnxConfigWithPast, self).outputs
if self.use_past:
num_encoder_layers, _ = self.num_layers
for i in range(num_encoder_layers):
common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
return common_outputs
def _generate_dummy_inputs_for_default_and_seq2seq_lm(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
decoder_seq_length = seq_length if not self.use_past else 1
decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, decoder_seq_length, is_pair, framework
)
decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
common_inputs = dict(**encoder_inputs, **decoder_inputs)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, encoder_seq_length = common_inputs["input_ids"].shape
decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
encoder_shape = (
batch,
num_encoder_attention_heads,
encoder_seq_length,
self._config.hidden_size // num_encoder_attention_heads,
)
decoder_past_length = decoder_seq_length + 3
decoder_shape = (
batch,
num_decoder_attention_heads,
decoder_past_length,
self._config.hidden_size // num_decoder_attention_heads,
)
common_inputs["decoder_attention_mask"] = torch.cat(
[common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
)
common_inputs["past_key_values"] = []
num_encoder_layers, num_decoder_layers = self.num_layers
min_num_layers = min(num_encoder_layers, num_decoder_layers)
remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
for _ in range(min_num_layers):
common_inputs["past_key_values"].append(
(
torch.zeros(decoder_shape),
torch.zeros(decoder_shape),
torch.zeros(encoder_shape),
torch.zeros(encoder_shape),
)
)
shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
for _ in range(min_num_layers, max_num_layers):
common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
return common_inputs
def _generate_dummy_inputs_for_causal_lm(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
num_encoder_layers, _ = self.num_layers
num_encoder_attention_heads, _ = self.num_attention_heads
past_shape = (
batch,
num_encoder_attention_heads,
past_key_values_length,
self._config.hidden_size // num_encoder_attention_heads,
)
mask_dtype = common_inputs["attention_mask"].dtype
common_inputs["attention_mask"] = torch.cat(
[common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
common_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
]
return common_inputs
def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
batch_size = compute_effective_axis_dimension(
batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
)
token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
seq_length = compute_effective_axis_dimension(
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
)
dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
return common_inputs
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
if self.task in ["default", "seq2seq-lm"]:
common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
elif self.task == "causal-lm":
common_inputs = self._generate_dummy_inputs_for_causal_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
else:
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
return common_inputs
def _flatten_past_key_values_(self, flattened_output, name, idx, t):
if self.task in ["default", "seq2seq-lm"]:
flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
else:
flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
flattened_output, name, idx, t
)
.\models\bart\convert_bart_original_pytorch_checkpoint_to_pytorch.py
import argparse
import os
from pathlib import Path
import fairseq
import torch
from packaging import version
from torch import nn
from transformers import (
BartConfig,
BartForConditionalGeneration,
BartForSequenceClassification,
BartModel,
BartTokenizer,
)
from transformers.utils import logging
FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"]
extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification}
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
raise Exception("requires fairseq >= 0.9.0")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = " Hello world! cécé herlolip"
mnli_rename_keys = [
("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"),
("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"),
("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"),
("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"),
]
def remove_ignore_keys_(state_dict):
ignore_keys = [
"encoder.version",
"decoder.version",
"model.encoder.version",
"model.decoder.version",
"_float_tensor",
]
for k in ignore_keys:
state_dict.pop(k, None)
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def load_xsum_checkpoint(checkpoint_path):
"""Checkpoint path should end in model.pt"""
sd = torch.load(checkpoint_path, map_location="cpu")
hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval()
hub_interface.model.load_state_dict(sd["model"])
return hub_interface
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
@torch.no_grad()
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
"""
Copy/paste/tweak model's weights to our BERT structure.
"""
if not os.path.exists(checkpoint_path):
bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval()
else:
bart = load_xsum_checkpoint(checkpoint_path)
bart.model.upgrade_state_dict(bart.model.state_dict())
if hf_checkpoint_name is None:
hf_checkpoint_name = checkpoint_path.replace(".", "-")
config = BartConfig.from_pretrained(hf_checkpoint_name)
tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
if not torch.eq(tokens, tokens2).all():
raise ValueError(
f"converted tokenizer and pretrained tokenizer returned different output: {tokens} != {tokens2}"
)
if checkpoint_path == "bart.large.mnli":
state_dict = bart.state_dict()
remove_ignore_keys_(state_dict)
state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
for src, dest in mnli_rename_keys:
rename_key(state_dict, src, dest)
model = BartForSequenceClassification(config).eval()
model.load_state_dict(state_dict)
fairseq_output = bart.predict("mnli", tokens, return_logits=True)
new_model_outputs = model(tokens)[0]
else:
state_dict = bart.model.state_dict()
remove_ignore_keys_(state_dict)
state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
fairseq_output = bart.extract_features(tokens)
if hf_checkpoint_name == "facebook/bart-large":
model = BartModel(config).eval()
model.load_state_dict(state_dict)
new_model_outputs = model(tokens).model[0]
else:
model = BartForConditionalGeneration(config).eval()
model.model.load_state_dict(state_dict)
if hasattr(model, "lm_head"):
model.lm_head = make_linear_from_emb(model.model.shared)
new_model_outputs = model.model(tokens)[0]
if fairseq_output.shape != new_model_outputs.shape:
raise ValueError(
f"`fairseq_output` shape and `new_model_output` shape are different: {fairseq_output.shape=}, {new_model_outputs.shape}"
)
if (fairseq_output != new_model_outputs).any().item():
raise ValueError("Some values in `fairseq_output` are different from `new_model_outputs`")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem."
)
parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument(
"--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum"
)
args = parser.parse_args()
convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config)
.\models\bart\modeling_bart.py
""" PyTorch BART模型."""
import copy
import math
import warnings
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import (
_prepare_4d_attention_mask,
_prepare_4d_attention_mask_for_sdpa,
_prepare_4d_causal_attention_mask,
_prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqQuestionAnsweringModelOutput,
Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from .configuration_bart import BartConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/bart-base"
_CONFIG_FOR_DOC = "BartConfig"
_EXPECTED_OUTPUT_SHAPE = [1, 8, 768]
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "valhalla/bart-large-sst2"
_SEQ_CLASS_EXPECTED_LOSS = 0.0
_SEQ_CLASS_EXPECTED_OUTPUT = "'POSITIVE'"
_CHECKPOINT_FOR_QA = "valhalla/bart-large-finetuned-squadv1"
_QA_EXPECTED_LOSS = 0.59
_QA_EXPECTED_OUTPUT = "' nice puppet'"
BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/bart-large",
]
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
class BartLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
"""`input_ids' shape is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids.shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
).expand(bsz, -1)
return super().forward(positions + self.offset)
class BartAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[BartConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
"""
Bart flash attention module. This module inherits from `BartAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
"""
Reshape the input tensor into the desired shape.
Args:
- tensor (torch.Tensor): The input tensor to reshape.
- seq_len (int): Length of the sequence.
- bsz (int): Batch size.
Returns:
- torch.Tensor: Reshaped tensor of shape (bsz, seq_len, num_heads, head_dim).
"""
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
"""
Perform the forward pass of the attention module.
Args:
- hidden_states (torch.Tensor): Input hidden states.
- key_value_states (Optional[torch.Tensor]): Key and value states if provided separately.
- past_key_value (Optional[Tuple[torch.Tensor]]): Past key and value tensors.
- attention_mask (Optional[torch.Tensor]): Mask for attention computation.
- layer_head_mask (Optional[torch.Tensor]): Mask for heads within a layer.
- output_attentions (bool): Whether to output attentions.
Returns:
- torch.Tensor: Output tensor from the attention module.
"""
pass
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Perform forward pass specific to flash attention mechanism.
Args:
- query_states: Query states tensor.
- key_states: Key states tensor.
- value_states: Value states tensor.
- attention_mask: Mask for attention computation.
- query_length: Length of the query sequence.
- dropout (float): Dropout rate.
- softmax_scale: Scaling factor for softmax computation.
Returns:
- torch.Tensor: Output tensor after applying flash attention.
"""
pass
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
class BartSdpaAttention(BartAttention):
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
pass
BART_ATTENTION_CLASSES = {
"eager": BartAttention,
"sdpa": BartSdpaAttention,
"flash_attention_2": BartFlashAttention2,
}
class BartEncoderLayer(nn.Module):
def __init__(self, config: BartConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = BART_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: torch.FloatTensor,
layer_head_mask: torch.FloatTensor,
output_attentions: Optional[bool] = False,
):
pass
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class BartDecoderLayer(nn.Module):
def __init__(self, config: BartConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = BART_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = BART_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
pass
class BartClassificationHead(nn.Module):
def __init__(
self,
input_dim: int,
inner_dim: int,
num_classes: int,
pooler_dropout: float,
):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class BartPreTrainedModel(PreTrainedModel):
config_class = BartConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"]
_no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
@property
def dummy_inputs(self):
pad_token = self.config.pad_token_id
input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
dummy_inputs = {
"attention_mask": input_ids.ne(pad_token),
"input_ids": input_ids,
}
return dummy_inputs
class PretrainedBartModel(BartPreTrainedModel):
def __init_subclass__(self):
warnings.warn(
"The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.",
FutureWarning,
)
class BartPretrainedModel(BartPreTrainedModel):
def __init_subclass__(self):
warnings.warn(
"The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.",
FutureWarning,
)
BART_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`BartConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BART_GENERATION_EXAMPLE = r"""
Summarization example:
```
>>> from transformers import AutoTokenizer, BartForConditionalGeneration
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
>>> ARTICLE_TO_SUMMARIZE = (
... "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
... )
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")
>>> # Generate Summary
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
>>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
```
Mask filling example:
```
>>> from transformers import AutoTokenizer, BartForConditionalGeneration
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
>>> TXT = "My friends are <mask> but they eat too many carbs."
>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
>>> logits = model(input_ids).logits
"""
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tokenizer.decode(predictions).split()
['not', 'good', 'healthy', 'great', 'very']
BART_INPUTS_DOCSTRING = r"""
"""
class BartEncoder(BartPreTrainedModel):
"""
BART 编码器,由 *config.encoder_layers* 个自注意力层组成的Transformer编码器。
Args:
config: BartConfig
embed_tokens (nn.Embedding): 输出的嵌入表示
"""
def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = BartLearnedPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
)
self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self._use_sdpa = config._attn_implementation == "sdpa"
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = BartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self._use_sdpa = config._attn_implementation == "sdpa"
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"The BART Model outputting raw hidden-states without any specific head on top.",
BART_START_DOCSTRING,
)
class BartModel(BartPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: BartConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = BartEncoder(config, self.shared)
self.decoder = BartDecoder(config, self.shared)
self.post_init()
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
def __init__(self, config: BartConfig):
super().__init__(config)
self.model = BartModel(config)
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
return new_embeddings
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
self.register_buffer("final_logits_bias", new_bias)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(BART_GENERATION_EXAMPLE)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Returns either a tuple or `Seq2SeqLMOutput` depending on `return_dict`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if use_cache:
logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
use_cache = False
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
lm_logits = self.lm_head(outputs[0])
lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)
masked_lm_loss = None
if labels is not None:
labels = labels.to(lm_logits.device)
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
decoder_attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
@add_start_docstrings(
"""
Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
""",
BART_START_DOCSTRING,
)
class BartForSequenceClassification(BartPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: BartConfig, **kwargs):
super().__init__(config, **kwargs)
self.model = BartModel(config)
self.classification_head = BartClassificationHead(
config.d_model,
config.d_model,
config.num_labels,
config.classifier_dropout,
)
self.post_init()
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
output_type=Seq2SeqSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the BartForSequenceClassification model.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding tokens.
decoder_input_ids: Indices of decoder input sequence tokens in the vocabulary.
decoder_attention_mask: Mask to avoid performing attention on padding tokens for decoder.
head_mask: Mask to nullify selected heads of the self-attention modules.
decoder_head_mask: Mask to nullify selected heads of the cross-attention modules.
cross_attn_head_mask: Mask to nullify selected heads of the cross-attention modules.
encoder_outputs: Hidden states of the encoder at each layer.
inputs_embeds: Optional tensor of embeddings to be used instead of input_ids.
decoder_inputs_embeds: Optional tensor of embeddings to be used instead of decoder_input_ids.
labels: Labels for computing the sequence classification/regression loss.
use_cache: Whether or not to use the pre-computed hidden states cache.
output_attentions: Whether or not to return the attentions tensors.
output_hidden_states: Whether or not to return the hidden states tensors.
return_dict: Whether or not to return a dictionary as output.
Returns:
Depending on `return_dict`, either a dictionary (`Seq2SeqSequenceClassifierOutput`) or
a tuple with sequence classifier output and optional hidden states and attentions.
"""
@add_start_docstrings(
"""
BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BART_START_DOCSTRING,
)
class BartForQuestionAnswering(BartPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.model = BartModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_QA,
output_type=Seq2SeqQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_loss=_QA_EXPECTED_LOSS,
expected_output=_QA_EXPECTED_OUTPUT,
)
def forward(
self,
input_ids: torch.Tensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class BartDecoderWrapper(BartPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = BartDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
@add_start_docstrings(
"""
BART decoder with with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
BART_START_DOCSTRING,
)
class BartForCausalLM(BartPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = BartDecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
...
):
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past