Transformers Source Code Analysis (112)
.\models\timesformer\modeling_timesformer.py
""" PyTorch TimeSformer model."""
import collections
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_timesformer import TimesformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TimesformerConfig"
_CHECKPOINT_FOR_DOC = "facebook/timesformer"
TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/timesformer-base-finetuned-k400",
]
class TimesformerPatchEmbeddings(nn.Module):
"""Image to Patch Embedding"""
def __init__(self, config):
super().__init__()
image_size = config.image_size
patch_size = config.patch_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_patches = num_patches
self.projection = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values):
batch_size, num_frames, num_channels, height, width = pixel_values.shape
pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width)
embeddings = self.projection(pixel_values)
patch_width = embeddings.size(-1)
embeddings = embeddings.flatten(2).transpose(1, 2)
return embeddings, num_frames, patch_width
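To make the shape bookkeeping above concrete, here is a minimal sketch (assumed 224x224 frames, 16x16 patches and hidden size 768, not values read from a real config): each frame is projected independently by the Conv2d, giving `(batch_size * num_frames, num_patches, hidden_size)` tokens.

```
import torch
from torch import nn

pixel_values = torch.randn(2, 8, 3, 224, 224)        # (batch, frames, channels, height, width)
projection = nn.Conv2d(3, 768, kernel_size=16, stride=16)
flat = pixel_values.reshape(2 * 8, 3, 224, 224)      # fold the frame axis into the batch axis
embeddings = projection(flat).flatten(2).transpose(1, 2)
print(embeddings.shape)                              # torch.Size([16, 196, 768]): 196 patches per frame
```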
class TimesformerEmbeddings(nn.Module):
"""
Construct the patch and position embeddings.
"""
def __init__(self, config):
super().__init__()
embed_dim = config.hidden_size
num_frames = config.num_frames
drop_rate = config.hidden_dropout_prob
attention_type = config.attention_type
self.attention_type = attention_type
self.patch_embeddings = TimesformerPatchEmbeddings(config)
self.num_patches = self.patch_embeddings.num_patches
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.position_embeddings = nn.Parameter(torch.zeros(1, self.num_patches + 1, embed_dim))
self.pos_drop = nn.Dropout(p=drop_rate)
if attention_type != "space_only":
self.time_embeddings = nn.Parameter(torch.zeros(1, num_frames, embed_dim))
self.time_drop = nn.Dropout(p=drop_rate)
def forward(self, pixel_values):
batch_size = pixel_values.shape[0]
embeddings, num_frames, patch_width = self.patch_embeddings(pixel_values)
cls_tokens = self.cls_token.expand(embeddings.size(0), -1, -1)
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
if embeddings.size(1) != self.position_embeddings.size(1):
position_embeddings = self.position_embeddings
cls_pos_embed = position_embeddings[0, 0, :].unsqueeze(0).unsqueeze(1)
other_pos_embed = position_embeddings[0, 1:, :].unsqueeze(0).transpose(1, 2)
patch_num = int(other_pos_embed.size(2) ** 0.5)
patch_height = embeddings.size(1) // patch_width
other_pos_embed = other_pos_embed.reshape(1, embeddings.size(2), patch_num, patch_num)
new_pos_embed = nn.functional.interpolate(
other_pos_embed, size=(patch_height, patch_width), mode="nearest"
)
new_pos_embed = new_pos_embed.flatten(2)
new_pos_embed = new_pos_embed.transpose(1, 2)
new_pos_embed = torch.cat((cls_pos_embed, new_pos_embed), 1)
embeddings = embeddings + new_pos_embed
else:
embeddings = embeddings + self.position_embeddings
embeddings = self.pos_drop(embeddings)
if self.attention_type != "space_only":
cls_tokens = embeddings[:batch_size, 0, :].unsqueeze(1)
embeddings = embeddings[:, 1:]
_, patch_height, patch_width = embeddings.shape
embeddings = (
embeddings.reshape(batch_size, num_frames, patch_height, patch_width)
.permute(0, 2, 1, 3)
.reshape(batch_size * patch_height, num_frames, patch_width)
)
if num_frames != self.time_embeddings.size(1):
time_embeddings = self.time_embeddings.transpose(1, 2)
new_time_embeddings = nn.functional.interpolate(time_embeddings, size=(num_frames), mode="nearest")
new_time_embeddings = new_time_embeddings.transpose(1, 2)
embeddings = embeddings + new_time_embeddings
else:
embeddings = embeddings + self.time_embeddings
embeddings = self.time_drop(embeddings)
embeddings = embeddings.view(batch_size, patch_height, num_frames, patch_width).reshape(
batch_size, patch_height * num_frames, patch_width
)
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
return embeddings
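The reshuffling in the `divided_space_time` branch is easy to lose track of; the following sketch (toy sizes, not library code) reproduces just the tensor gymnastics and shows that the final sequence is one CLS token plus `num_patches * num_frames` patch tokens.

```
import torch

batch, frames, dim = 2, 8, 16
patches = (224 // 16) ** 2                                    # 196 spatial patches per frame (assumed)
tokens = torch.randn(batch * frames, patches, dim)            # patch tokens, frames folded into the batch
tokens = tokens.reshape(batch, frames, patches, dim)          # (B, T, N, D)
tokens = tokens.permute(0, 2, 1, 3).reshape(batch * patches, frames, dim)  # time attention sees (B*N, T, D)
tokens = tokens.view(batch, patches, frames, dim).reshape(batch, patches * frames, dim)
cls_token = torch.zeros(batch, 1, dim)
print(torch.cat((cls_token, tokens), dim=1).shape)            # torch.Size([2, 1569, 16]) = (B, 1 + N*T, D)
```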
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class TimeSformerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class TimesformerSelfAttention(nn.Module):
def __init__(self, config: TimesformerConfig):
super().__init__()
num_heads = config.num_attention_heads
qkv_bias = config.qkv_bias
attention_dropout_prob = config.attention_probs_dropout_prob
self.num_heads = num_heads
head_dim = config.hidden_size // num_heads
self.scale = head_dim**-0.5
self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attention_dropout_prob)
def forward(self, hidden_states, output_attentions: bool = False):
batch_size, hidden_size, num_channels = hidden_states.shape
qkv = (
self.qkv(hidden_states)
.reshape(batch_size, hidden_size, 3, self.num_heads, num_channels // self.num_heads)
.permute(2, 0, 3, 1, 4)
)
query, key, value = qkv[0], qkv[1], qkv[2]
attention_probs = (query @ key.transpose(-2, -1)) * self.scale
attention_probs = attention_probs.softmax(dim=-1)
attention_probs = self.attn_drop(attention_probs)
context_layer = (attention_probs @ value).transpose(1, 2).reshape(batch_size, hidden_size, num_channels)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
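The packed qkv projection can be confusing on first read; this shape walk-through (assumed toy sizes) mirrors the reshape/permute above and shows how the per-head query, key and value tensors fall out.

```
import torch
from torch import nn

batch, seq_len, hidden, heads = 2, 5, 16, 4
qkv = nn.Linear(hidden, hidden * 3)(torch.randn(batch, seq_len, hidden))
qkv = qkv.reshape(batch, seq_len, 3, heads, hidden // heads).permute(2, 0, 3, 1, 4)
query, key, value = qkv[0], qkv[1], qkv[2]
print(query.shape)   # torch.Size([2, 4, 5, 4]) -> (batch, num_heads, seq_len, head_dim)
```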
class TimesformerSelfOutput(nn.Module):
"""
The residual connection is defined in TimesformerLayer instead of here (as is the case with other models), due to
the layernorm applied before each block.
"""
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class TimeSformerAttention(nn.Module):
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.attention = TimesformerSelfAttention(config)
self.output = TimesformerSelfOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, output_attentions)
attention_output = self.output(self_outputs[0])
outputs = (attention_output,) + self_outputs[1:]
return outputs
class TimesformerIntermediate(nn.Module):
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class TimesformerOutput(nn.Module):
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class TimesformerLayer(nn.Module):
def __init__(self, config: TimesformerConfig, layer_index: int) -> None:
super().__init__()
attention_type = config.attention_type
drop_path_rates = [
x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)
]
drop_path_rate = drop_path_rates[layer_index]
self.drop_path = TimeSformerDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.attention = TimeSformerAttention(config)
self.intermediate = TimesformerIntermediate(config)
self.output = TimesformerOutput(config)
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.config = config
self.attention_type = attention_type
if attention_type not in ["divided_space_time", "space_only", "joint_space_time"]:
raise ValueError("Unknown attention type: {}".format(attention_type))
if self.attention_type == "divided_space_time":
self.temporal_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.temporal_attention = TimeSformerAttention(config)
self.temporal_dense = nn.Linear(config.hidden_size, config.hidden_size)
class TimesformerEncoder(nn.Module):
def __init__(self, config: TimesformerConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([TimesformerLayer(config, ind) for ind in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class TimesformerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = TimesformerConfig
base_model_prefix = "timesformer"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
if isinstance(module, (nn.Linear, nn.Conv2d)):
nn.init.trunc_normal_(module.weight, std=self.config.initializer_range)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.LayerNorm):
nn.init.constant_(module.bias, 0)
nn.init.constant_(module.weight, 1.0)
elif isinstance(module, TimesformerEmbeddings):
nn.init.trunc_normal_(module.cls_token, std=self.config.initializer_range)
nn.init.trunc_normal_(module.position_embeddings, std=self.config.initializer_range)
module.patch_embeddings.apply(self._init_weights)
TIMESFORMER_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.
Parameters:
config ([`TimesformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TIMESFORMER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`VideoMAEImageProcessor.preprocess`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare TimeSformer Model transformer outputting raw hidden-states without any specific head on top.",
TIMESFORMER_START_DOCSTRING,
)
class TimesformerModel(TimesformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.embeddings = TimesformerEmbeddings(config)
self.encoder = TimesformerEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_init()
def get_input_embeddings(self):
"""
Returns the patch embeddings used in the model's input layer.
"""
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model.
Args:
heads_to_prune (dict): dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(TIMESFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the Timesformer Model.
Args:
pixel_values (torch.FloatTensor): Pixel values of shape `(batch_size, num_frames, num_channels, height, width)`.
output_attentions (bool, optional): Whether to return attentions tensors of all attention layers.
output_hidden_states (bool, optional): Whether to return hidden states of all layers.
return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
Returns:
BaseModelOutput or tuple:
A BaseModelOutput (if return_dict=True) or a tuple of torch.FloatTensor containing various model outputs.
"""
pass
@add_start_docstrings(
"""TimeSformer Model transformer with a video classification head on top (a linear layer on top of the final hidden state
of the [CLS] token) e.g. for ImageNet.""",
TIMESFORMER_START_DOCSTRING,
)
class TimesformerForVideoClassification(TimesformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.timesformer = TimesformerModel(config)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.post_init()
    @add_start_docstrings_to_model_forward(TIMESFORMER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Forward pass of the TimesformerForVideoClassification model.
        Args:
            pixel_values (torch.FloatTensor): Pixel values of shape `(batch_size, num_frames, num_channels, height, width)`.
            labels (torch.LongTensor, optional): Labels for computing the video classification loss (indices in `[0, ..., config.num_labels - 1]`).
            output_attentions (bool, optional): Whether to return attentions tensors of all attention layers.
            output_hidden_states (bool, optional): Whether to return hidden states of all layers.
            return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
        Returns:
            ImageClassifierOutput or tuple:
                An ImageClassifierOutput (if return_dict=True) or a tuple of torch.FloatTensor containing the
                classification loss (when labels are provided), the logits and the other model outputs.
        """
        pass
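For reference, inference with the classification head typically follows the pattern below (a sketch based on the usual model-card example for `facebook/timesformer-base-finetuned-k400`; the random frames and the VideoMAE image processor are just placeholders for a real video clip and its preprocessing, and the checkpoints are downloaded on first use).

```
import numpy as np
import torch
from transformers import AutoImageProcessor, TimesformerForVideoClassification

video = list(np.random.randn(8, 3, 224, 224))   # 8 dummy frames standing in for a real clip
processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")

inputs = processor(video, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[int(logits.argmax(-1))])   # predicted Kinetics-400 label
```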
.\models\timesformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_timesformer": ["TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimesformerConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_timesformer"] = [
"TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TimesformerModel",
"TimesformerForVideoClassification",
"TimesformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_timesformer import TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimesformerConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_timesformer import (
TIMESFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TimesformerForVideoClassification,
TimesformerModel,
TimesformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\time_series_transformer\configuration_time_series_transformer.py
from typing import List, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"huggingface/time-series-transformer-tourism-monthly": (
"https://huggingface.co/huggingface/time-series-transformer-tourism-monthly/resolve/main/config.json"
),
}
class TimeSeriesTransformerConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`TimeSeriesTransformerModel`]. It is used to
    instantiate a Time Series Transformer model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a configuration similar to that of the Time Series
    Transformer
    [huggingface/time-series-transformer-tourism-monthly](https://huggingface.co/huggingface/time-series-transformer-tourism-monthly)
    architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    ```
    >>> from transformers import TimeSeriesTransformerConfig, TimeSeriesTransformerModel
    >>> # Initializing a Time Series Transformer configuration with a prediction length of 12 time steps
    >>> configuration = TimeSeriesTransformerConfig(prediction_length=12)
    >>> # Randomly initializing a model (with random weights) from the configuration
    >>> model = TimeSeriesTransformerModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
```
"""
model_type = "time_series_transformer"
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
"num_hidden_layers": "encoder_layers",
}
def __init__(
self,
prediction_length: Optional[int] = None,
context_length: Optional[int] = None,
distribution_output: str = "student_t",
loss: str = "nll",
input_size: int = 1,
lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
scaling: Optional[Union[str, bool]] = "mean",
num_dynamic_real_features: int = 0,
num_static_categorical_features: int = 0,
num_static_real_features: int = 0,
num_time_features: int = 0,
cardinality: Optional[List[int]] = None,
embedding_dimension: Optional[List[int]] = None,
encoder_ffn_dim: int = 32,
decoder_ffn_dim: int = 32,
encoder_attention_heads: int = 2,
decoder_attention_heads: int = 2,
encoder_layers: int = 2,
decoder_layers: int = 2,
is_encoder_decoder: bool = True,
activation_function: str = "gelu",
d_model: int = 64,
dropout: float = 0.1,
encoder_layerdrop: float = 0.1,
decoder_layerdrop: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
num_parallel_samples: int = 100,
init_std: float = 0.02,
use_cache=True,
        **kwargs,
    ):
self.prediction_length = prediction_length
self.context_length = context_length or prediction_length
self.distribution_output = distribution_output
self.loss = loss
self.input_size = input_size
self.num_time_features = num_time_features
self.lags_sequence = lags_sequence
self.scaling = scaling
self.num_dynamic_real_features = num_dynamic_real_features
self.num_static_real_features = num_static_real_features
self.num_static_categorical_features = num_static_categorical_features
if cardinality and num_static_categorical_features > 0:
if len(cardinality) != num_static_categorical_features:
raise ValueError(
"The cardinality should be a list of the same length as `num_static_categorical_features`"
)
self.cardinality = cardinality
else:
self.cardinality = [0]
if embedding_dimension and num_static_categorical_features > 0:
if len(embedding_dimension) != num_static_categorical_features:
raise ValueError(
"The embedding dimension should be a list of the same length as `num_static_categorical_features`"
)
self.embedding_dimension = embedding_dimension
else:
self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]
self.num_parallel_samples = num_parallel_samples
self.feature_size = input_size * len(lags_sequence) + self._number_of_features
self.d_model = d_model
self.encoder_attention_heads = encoder_attention_heads
self.decoder_attention_heads = decoder_attention_heads
self.encoder_ffn_dim = encoder_ffn_dim
self.decoder_ffn_dim = decoder_ffn_dim
self.encoder_layers = encoder_layers
self.decoder_layers = decoder_layers
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.activation_function = activation_function
self.init_std = init_std
self.use_cache = use_cache
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
@property
def _number_of_features(self) -> int:
return (
sum(self.embedding_dimension)
+ self.num_dynamic_real_features
+ self.num_time_features
+ self.num_static_real_features
+ self.input_size * 2
)
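A small numeric check of the `feature_size` bookkeeping above (default settings plus one time feature, numbers only for illustration): with `input_size=1`, the 7 default lags, no static features and one time feature, `_number_of_features` is `0 + 0 + 1 + 0 + 1 * 2 = 3`, so `feature_size = 1 * 7 + 3 = 10`.

```
from transformers import TimeSeriesTransformerConfig

config = TimeSeriesTransformerConfig(prediction_length=12, num_time_features=1)
# lags_sequence defaults to [1, 2, 3, 4, 5, 6, 7]; the "+ 2" comes from log1p(|loc|) and log(scale)
print(config.feature_size)   # 1 * 7 + 3 = 10
```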
.\models\time_series_transformer\modeling_time_series_transformer.py
""" PyTorch 时间序列 Transformer 模型。"""
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from torch import nn
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
SampleTSPredictionOutput,
Seq2SeqTSModelOutput,
Seq2SeqTSPredictionOutput,
)
from ...modeling_utils import PreTrainedModel
from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_time_series_transformer import TimeSeriesTransformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TimeSeriesTransformerConfig"
TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"huggingface/time-series-transformer-tourism-monthly",
]
class TimeSeriesFeatureEmbedder(nn.Module):
"""
Embed a sequence of categorical features.
Args:
cardinalities (`list[int]`):
List of cardinalities of the categorical features.
embedding_dims (`list[int]`):
List of embedding dimensions of the categorical features.
"""
def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
super().__init__()
self.num_features = len(cardinalities)
self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)])
def forward(self, features: torch.Tensor) -> torch.Tensor:
if self.num_features > 1:
cat_feature_slices = torch.chunk(features, self.num_features, dim=-1)
else:
cat_feature_slices = [features]
return torch.cat(
[
embed(cat_feature_slice.squeeze(-1))
for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices)
],
dim=-1,
)
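A quick usage sketch of the embedder (made-up cardinalities and embedding sizes): each categorical column gets its own `nn.Embedding`, and the per-feature embeddings are concatenated along the last dimension.

```
import torch

embedder = TimeSeriesFeatureEmbedder(cardinalities=[5, 10], embedding_dims=[2, 4])
features = torch.tensor([[1, 7], [4, 0]])   # (batch_size=2, num_features=2)
print(embedder(features).shape)             # torch.Size([2, 6]): the 2-dim and 4-dim embeddings concatenated
```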
class TimeSeriesStdScaler(nn.Module):
"""
    Standardizes features by computing the mean and standard deviation along the first dimension, and then normalizes
    them by subtracting the mean and dividing by the standard deviation.
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5
def forward(
self, data: torch.Tensor, observed_indicator: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Parameters:
data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
input for Batch norm calculation
observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
Calculating the scale on the observed indicator.
Returns:
tuple of `torch.Tensor` of shapes
(`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
`(batch_size, 1, num_input_channels)`)
"""
denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
denominator = denominator.clamp_min(1.0)
loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator
variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
scale = torch.sqrt(variance + self.minimum_scale)
return (data - loc) / scale, loc, scale
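A toy worked example (values chosen for illustration, using the classes above): positions whose `observed_indicator` is 0 are excluded from the statistics, so the outlier below does not influence the location or scale.

```
import torch

config = TimeSeriesTransformerConfig(prediction_length=12)
data = torch.tensor([[[1.0], [2.0], [3.0], [100.0]]])     # (batch=1, seq_len=4, channels=1)
observed = torch.tensor([[[1.0], [1.0], [1.0], [0.0]]])   # the 100.0 is marked as missing
scaled, loc, scale = TimeSeriesStdScaler(config)(data, observed)
print(loc.item(), scale.item())   # ~2.0 and ~0.8165, i.e. the mean and std of [1, 2, 3] only
```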
class TimeSeriesMeanScaler(nn.Module):
"""
Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
accordingly.
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
self.default_scale = config.default_scale if hasattr(config, "default_scale") else None
    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Parameters:
data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
input for Batch norm calculation
observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
Calculating the scale on the observed indicator.
Returns:
tuple of `torch.Tensor` of shapes
(`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
`(batch_size, 1, num_input_channels)`)
"""
ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
num_observed = observed_indicator.sum(self.dim, keepdim=True)
scale = ts_sum / torch.clamp(num_observed, min=1)
if self.default_scale is None:
batch_sum = ts_sum.sum(dim=0)
batch_observations = torch.clamp(num_observed.sum(0), min=1)
default_scale = torch.squeeze(batch_sum / batch_observations)
else:
default_scale = self.default_scale * torch.ones_like(scale)
scale = torch.where(num_observed > 0, scale, default_scale)
scale = torch.clamp(scale, min=self.minimum_scale)
scaled_data = data / scale
if not self.keepdim:
scale = scale.squeeze(dim=self.dim)
return scaled_data, torch.zeros_like(scale), scale
class TimeSeriesNOPScaler(nn.Module):
"""
Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
def forward(
self, data: torch.Tensor, observed_indicator: torch.Tensor = None
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Parameters:
data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
input for Batch norm calculation
Returns:
tuple of `torch.Tensor` of shapes
(`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
`(batch_size, 1, num_input_channels)`)
"""
scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
return data, loc, scale
def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
"""
Computes the negative log likelihood loss from input distribution with respect to target.
"""
return -input.log_prob(target)
def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
"""
Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
Args:
input_tensor (`torch.FloatTensor`):
Input tensor, of which the average must be computed.
weights (`torch.FloatTensor`, *optional*):
Weights tensor, of the same shape as `input_tensor`.
dim (`int`, *optional*):
The dim along which to average `input_tensor`.
Returns:
`torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
"""
if weights is not None:
weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
else:
return input_tensor.mean(dim=dim)
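The point of the masking is easiest to see with a NaN sitting in a zero-weight slot (a tiny example using the function above): the zero-weight entry is replaced by an exact zero before summing, so the NaN never propagates into the average.

```
import torch

values = torch.tensor([1.0, float("nan"), 3.0])
weights = torch.tensor([1.0, 0.0, 1.0])
print(weighted_average(values, weights))   # tensor(2.) rather than nan
```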
class TimeSeriesSinusoidalPositionalEmbedding(nn.Embedding):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
super().__init__(num_positions, embedding_dim)
self.weight = self._init_weight(self.weight)
@staticmethod
def _init_weight(out: nn.Parameter) -> nn.Parameter:
"""
        Identical to XLM's create_sinusoidal_embeddings, except that the features are not interleaved. The cosine
        features are in the second half of the vector [dim // 2:].
"""
n_pos, dim = out.shape
position_enc = np.array(
[[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
)
out.requires_grad = False
sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
return out
@torch.no_grad()
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
"""`input_ids_shape` 期望是 [bsz x seqlen]。"""
bsz, seq_len = input_ids_shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
)
return super().forward(positions)
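Note that the module is indexed purely by position: the forward takes a shape (plus an offset used with cached decoding) rather than token ids, and the table is frozen. A small sketch:

```
import torch

pos_emb = TimeSeriesSinusoidalPositionalEmbedding(num_positions=50, embedding_dim=8)
out = pos_emb(torch.Size([4, 10]))   # batch of 4, sequence length 10
print(out.shape)                     # torch.Size([10, 8]): one embedding per position, shared across the batch
print(out.requires_grad)             # False: the sinusoidal table is fixed, not learned
```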
class TimeSeriesValueEmbedding(nn.Module):
def __init__(self, feature_size, d_model):
super().__init__()
self.value_projection = nn.Linear(in_features=feature_size, out_features=d_model, bias=False)
def forward(self, x):
return self.value_projection(x)
class TimeSeriesTransformerAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[TimeSeriesTransformerConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
pass
class TimeSeriesTransformerEncoderLayer(nn.Module):
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES = {
"eager": TimeSeriesTransformerAttention,
}
class TimeSeriesTransformerDecoderLayer(nn.Module):
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        pass
class TimeSeriesTransformerPreTrainedModel(PreTrainedModel):
config_class = TimeSeriesTransformerConfig
base_model_prefix = "model"
main_input_name = "past_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, TimeSeriesSinusoidalPositionalEmbedding):
pass
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
TIME_SERIES_TRANSFORMER_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
    heads, etc.).
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.
    Parameters:
        config ([`TimeSeriesTransformerConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING = r"""
"""
class TimeSeriesTransformerEncoder(TimeSeriesTransformerPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`TimeSeriesTransformerEncoderLayer`].
Args:
config: TimeSeriesTransformerConfig
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
if config.prediction_length is None:
raise ValueError("The `prediction_length` config needs to be specified.")
self.value_embedding = TimeSeriesValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
config.context_length + config.prediction_length, config.d_model
)
self.layers = nn.ModuleList([TimeSeriesTransformerEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
        Runs the forward pass over the embedded time-series features.
        Args:
            attention_mask: optional attention mask tensor
            head_mask: optional mask for the attention heads
            inputs_embeds: optional tensor of input embeddings
            output_attentions: whether to return the attention tensors
            output_hidden_states: whether to return the hidden states of all layers
            return_dict: whether to return the output as a dict-style `ModelOutput`
        Returns:
            A dict-style output or a plain tuple, depending on `return_dict`
"""
pass
class TimeSeriesTransformerDecoder(TimeSeriesTransformerPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
[`TimeSeriesTransformerDecoderLayer`]
Args:
config: TimeSeriesTransformerConfig
"""
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
if config.prediction_length is None:
raise ValueError("The `prediction_length` config needs to be specified.")
self.value_embedding = TimeSeriesValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
self.embed_positions = TimeSeriesSinusoidalPositionalEmbedding(
config.context_length + config.prediction_length, config.d_model
)
self.layers = nn.ModuleList([TimeSeriesTransformerDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"The bare Time Series Transformer Model outputting raw hidden-states without any specific head on top.",
TIME_SERIES_TRANSFORMER_START_DOCSTRING,
)
class TimeSeriesTransformerModel(TimeSeriesTransformerPreTrainedModel):
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__(config)
if config.scaling == "mean" or config.scaling is True:
self.scaler = TimeSeriesMeanScaler(config)
elif config.scaling == "std":
self.scaler = TimeSeriesStdScaler(config)
else:
self.scaler = TimeSeriesNOPScaler(config)
if config.num_static_categorical_features > 0:
self.embedder = TimeSeriesFeatureEmbedder(
cardinalities=config.cardinality,
embedding_dims=config.embedding_dimension,
)
self.encoder = TimeSeriesTransformerEncoder(config)
self.decoder = TimeSeriesTransformerDecoder(config)
self.post_init()
@property
def _past_length(self) -> int:
return self.config.context_length + max(self.config.lags_sequence)
def get_lagged_subsequences(
self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
) -> torch.Tensor:
"""
Returns lagged subsequences of a given sequence. Returns a tensor of shape (N, S, C, I),
where S = subsequences_length and I = len(indices), containing lagged subsequences. Specifically, lagged[i,
j, :, k] = sequence[i, -indices[k]-S+j, :].
Args:
sequence: Tensor
The sequence from which lagged subsequences should be extracted. Shape: (N, T, C).
subsequences_length : int
Length of the subsequences to be extracted.
shift: int
Shift the lags by this amount back.
"""
sequence_length = sequence.shape[1]
indices = [lag - shift for lag in self.config.lags_sequence]
if max(indices) + subsequences_length > sequence_length:
raise ValueError(
f"lags cannot go further than history length, found lag {max(indices)} "
f"while history length is only {sequence_length}"
)
lagged_values = []
for lag_index in indices:
begin_index = -lag_index - subsequences_length
end_index = -lag_index if lag_index > 0 else None
lagged_values.append(sequence[:, begin_index:end_index, ...])
return torch.stack(lagged_values, dim=-1)
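A concrete trace of the slicing above (assumed `lags_sequence=[1, 2]`, `subsequences_length=3`, `shift=0`, and a toy series `0..5`): for each lag `l` the window `sequence[:, -l - S : -l]` is taken, so column `k` of the output is the series shifted back by `indices[k]` steps.

```
import torch

sequence = torch.arange(6.0).reshape(1, 6, 1)    # values 0..5, shape (N=1, T=6, C=1)
S, lags = 3, [1, 2]
slices = []
for lag in lags:
    end = -lag if lag > 0 else None
    slices.append(sequence[:, -lag - S: end, ...])
lagged = torch.stack(slices, dim=-1)             # (N, S, C, len(lags)) == (1, 3, 1, 2)
print(lagged[0, :, 0, 0])   # tensor([2., 3., 4.]): one step behind the last three positions
print(lagged[0, :, 0, 1])   # tensor([1., 2., 3.]): two steps behind
```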
def create_network_inputs(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
past_observed_mask: Optional[torch.Tensor] = None,
future_values: Optional[torch.Tensor] = None,
        future_time_features: Optional[torch.Tensor] = None,
    ):
time_feat = (
torch.cat(
(
past_time_features[:, self._past_length - self.config.context_length :, ...],
future_time_features,
),
dim=1,
)
if future_values is not None
else past_time_features[:, self._past_length - self.config.context_length :, ...]
)
if past_observed_mask is None:
past_observed_mask = torch.ones_like(past_values)
context = past_values[:, -self.config.context_length :]
observed_context = past_observed_mask[:, -self.config.context_length :]
_, loc, scale = self.scaler(context, observed_context)
inputs = (
(torch.cat((past_values, future_values), dim=1) - loc) / scale
if future_values is not None
else (past_values - loc) / scale
)
log_abs_loc = loc.abs().log1p() if self.config.input_size == 1 else loc.squeeze(1).abs().log1p()
log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
static_feat = torch.cat((log_abs_loc, log_scale), dim=1)
if static_real_features is not None:
static_feat = torch.cat((static_real_features, static_feat), dim=1)
if static_categorical_features is not None:
embedded_cat = self.embedder(static_categorical_features)
static_feat = torch.cat((embedded_cat, static_feat), dim=1)
expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)
features = torch.cat((expanded_static_feat, time_feat), dim=-1)
subsequences_length = (
self.config.context_length + self.config.prediction_length
if future_values is not None
else self.config.context_length
)
lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
lags_shape = lagged_sequence.shape
reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)
if reshaped_lagged_sequence.shape[1] != time_feat.shape[1]:
raise ValueError(
f"input length {reshaped_lagged_sequence.shape[1]} and time feature lengths {time_feat.shape[1]} does not match"
)
transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)
return transformer_inputs, loc, scale, static_feat
def forward(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
past_observed_mask: torch.Tensor,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
future_values: Optional[torch.Tensor] = None,
future_time_features: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
use_cache: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"The Time Series Transformer Model with a distribution head on top for time-series forecasting.",
TIME_SERIES_TRANSFORMER_START_DOCSTRING,
)
class TimeSeriesTransformerForPrediction(TimeSeriesTransformerPreTrainedModel):
def __init__(self, config: TimeSeriesTransformerConfig):
super().__init__(config)
self.model = TimeSeriesTransformerModel(config)
if config.distribution_output == "student_t":
self.distribution_output = StudentTOutput(dim=config.input_size)
elif config.distribution_output == "normal":
self.distribution_output = NormalOutput(dim=config.input_size)
elif config.distribution_output == "negative_binomial":
self.distribution_output = NegativeBinomialOutput(dim=config.input_size)
else:
raise ValueError(f"Unknown distribution output {config.distribution_output}")
self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.d_model)
self.target_shape = self.distribution_output.event_shape
if config.loss == "nll":
self.loss = nll
else:
raise ValueError(f"Unknown loss function {config.loss}")
self.post_init()
def output_params(self, dec_output):
return self.parameter_projection(dec_output)
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
@torch.jit.ignore
def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution:
sliced_params = params
if trailing_n is not None:
sliced_params = [p[:, -trailing_n:] for p in params]
return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale)
@add_start_docstrings_to_model_forward(TIME_SERIES_TRANSFORMER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqTSModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
past_observed_mask: torch.Tensor,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
future_values: Optional[torch.Tensor] = None,
future_time_features: Optional[torch.Tensor] = None,
future_observed_mask: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
use_cache: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ):
        pass
@torch.no_grad()
def generate(
self,
past_values: torch.Tensor,
past_time_features: torch.Tensor,
future_time_features: torch.Tensor,
past_observed_mask: Optional[torch.Tensor] = None,
static_categorical_features: Optional[torch.Tensor] = None,
static_real_features: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
    ):
        pass
.\models\time_series_transformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_time_series_transformer": [
"TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"TimeSeriesTransformerConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_time_series_transformer"] = [
"TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TimeSeriesTransformerForPrediction",
"TimeSeriesTransformerModel",
"TimeSeriesTransformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_time_series_transformer import (
TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
TimeSeriesTransformerConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_time_series_transformer import (
TIME_SERIES_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TimeSeriesTransformerForPrediction,
TimeSeriesTransformerModel,
TimeSeriesTransformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\timm_backbone\configuration_timm_backbone.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
class TimmBackboneConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration for a timm backbone [`TimmBackbone`].
It is used to instantiate a timm backbone model according to the specified arguments, defining the model.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
backbone (`str`, *optional*):
The timm checkpoint to load.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
features_only (`bool`, *optional*, defaults to `True`):
Whether to output only the features or also the logits.
use_pretrained_backbone (`bool`, *optional*, defaults to `True`):
Whether to use a pretrained backbone.
out_indices (`List[int]`, *optional*):
If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
many stages the model has). Will default to the last stage if unset.
freeze_batch_norm_2d (`bool`, *optional*, defaults to `False`):
Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`.
Example:
```
>>> from transformers import TimmBackboneConfig, TimmBackbone
>>> # Initializing a timm backbone
>>> configuration = TimmBackboneConfig("resnet50")
>>> # Initializing a model from the configuration
>>> model = TimmBackbone(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "timm_backbone"
def __init__(
self,
backbone=None,
num_channels=3,
features_only=True,
use_pretrained_backbone=True,
out_indices=None,
freeze_batch_norm_2d=False,
**kwargs,
):
        super().__init__(**kwargs)
        self.backbone = backbone
        self.num_channels = num_channels
        self.features_only = features_only
        self.use_pretrained_backbone = use_pretrained_backbone
        self.use_timm_backbone = True
        self.out_indices = out_indices if out_indices is not None else (-1,)
        self.freeze_batch_norm_2d = freeze_batch_norm_2d
.\models\timm_backbone\modeling_timm_backbone.py
from typing import Optional, Tuple, Union
import torch
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...utils import is_timm_available, is_torch_available, requires_backends
from ...utils.backbone_utils import BackboneMixin
from .configuration_timm_backbone import TimmBackboneConfig
if is_timm_available():
import timm
if is_torch_available():
from torch import Tensor
class TimmBackbone(PreTrainedModel, BackboneMixin):
"""
Wrapper class for timm models to be used as backbones. This enables using the timm models interchangeably with the
other models in the library keeping the same API.
"""
main_input_name = "pixel_values"
supports_gradient_checkpointing = False
config_class = TimmBackboneConfig
def __init__(self, config, **kwargs):
requires_backends(self, "timm")
super().__init__(config)
self.config = config
if config.backbone is None:
raise ValueError("backbone is not set in the config. Please set it to a timm model name.")
if config.backbone not in timm.list_models():
raise ValueError(f"backbone {config.backbone} is not supported by timm.")
if hasattr(config, "out_features") and config.out_features is not None:
raise ValueError("out_features is not supported by TimmBackbone. Please use out_indices instead.")
pretrained = getattr(config, "use_pretrained_backbone", None)
if pretrained is None:
raise ValueError("use_pretrained_backbone is not set in the config. Please set it to True or False.")
out_indices = config.out_indices if getattr(config, "out_indices", None) is not None else (-1,)
self._backbone = timm.create_model(
config.backbone,
pretrained=pretrained,
features_only=config.features_only,
in_chans=config.num_channels,
out_indices=out_indices,
**kwargs,
)
if getattr(config, "freeze_batch_norm_2d", False):
self.freeze_batch_norm_2d()
self._return_layers = self._backbone.return_layers
self._all_layers = {layer["module"]: str(i) for i, layer in enumerate(self._backbone.feature_info.info)}
super()._init_backbone(config)
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
requires_backends(cls, ["vision", "timm"])
from ...models.timm_backbone import TimmBackboneConfig
config = kwargs.pop("config", TimmBackboneConfig())
use_timm = kwargs.pop("use_timm_backbone", True)
if not use_timm:
raise ValueError("use_timm_backbone must be True for timm backbones")
num_channels = kwargs.pop("num_channels", config.num_channels)
features_only = kwargs.pop("features_only", config.features_only)
use_pretrained_backbone = kwargs.pop("use_pretrained_backbone", config.use_pretrained_backbone)
out_indices = kwargs.pop("out_indices", config.out_indices)
config = TimmBackboneConfig(
backbone=pretrained_model_name_or_path,
num_channels=num_channels,
features_only=features_only,
use_pretrained_backbone=use_pretrained_backbone,
out_indices=out_indices,
)
return super()._from_config(config, **kwargs)
def freeze_batch_norm_2d(self):
timm.layers.freeze_batch_norm_2d(self._backbone)
def unfreeze_batch_norm_2d(self):
timm.layers.unfreeze_batch_norm_2d(self._backbone)
def _init_weights(self, module):
"""
Empty init weights function to ensure compatibility of the class in the library.
"""
pass
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
if output_attentions:
raise ValueError("Cannot output attentions for timm backbones at the moment")
if output_hidden_states:
self._backbone.return_layers = self._all_layers
hidden_states = self._backbone(pixel_values, **kwargs)
self._backbone.return_layers = self._return_layers
feature_maps = tuple(hidden_states[i] for i in self.out_indices)
else:
feature_maps = self._backbone(pixel_values, **kwargs)
hidden_states = None
feature_maps = tuple(feature_maps)
hidden_states = tuple(hidden_states) if hidden_states is not None else None
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output = output + (hidden_states,)
return output
return BackboneOutput(feature_maps=feature_maps, hidden_states=hidden_states, attentions=None)
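A minimal usage sketch of the wrapper above. The `"resnet50"` name and the dummy input are illustrative assumptions; `timm` must be installed, and `use_pretrained_backbone=False` avoids a weight download for the sketch:
```
import torch
from transformers import TimmBackbone, TimmBackboneConfig

# Build a timm backbone through the config/wrapper defined above.
config = TimmBackboneConfig(
    backbone="resnet50",           # any name accepted by timm.list_models() (assumed here)
    out_indices=(1, 2, 3, 4),      # which feature stages to return
    use_pretrained_backbone=False, # random weights, no download
)
model = TimmBackbone(config).eval()

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values)

# One feature map per requested stage (strides 4/8/16/32 for a ResNet).
for feature_map in outputs.feature_maps:
    print(feature_map.shape)
```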
.\models\timm_backbone\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_timm_backbone": ["TimmBackboneConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_timm_backbone"] = ["TimmBackbone"]
if TYPE_CHECKING:
from .configuration_timm_backbone import TimmBackboneConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_timm_backbone import TimmBackbone
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\trocr\configuration_trocr.py
""" TrOCR model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/trocr-base-handwritten": (
"https://huggingface.co/microsoft/trocr-base-handwritten/resolve/main/config.json"
),
}
class TrOCRConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`TrOCRForCausalLM`]. It is used to instantiate an
TrOCR model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the TrOCR
[microsoft/trocr-base-handwritten](https://huggingface.co/microsoft/trocr-base-handwritten) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the TrOCR model. Defines the number of different tokens that can be represented when calling [`TrOCRForCausalLM`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the pooler. Supported strings are `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"`.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length this model might ever be used with. Typically set to something large just in case (e.g., 512, 1024 or 2048).
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
scale_embedding (`bool`, *optional*, defaults to `False`):
Whether or not to scale the word embeddings by sqrt(d_model).
use_learned_position_embeddings (`bool`, *optional*, defaults to `True`):
Whether or not to use learned position embeddings. If not, sinusoidal position embeddings will be used.
layernorm_embedding (`bool`, *optional*, defaults to `True`):
Whether or not to use a layernorm after the word + position embeddings.
Example:
```
>>> from transformers import TrOCRConfig, TrOCRForCausalLM
>>> # Initializing a TrOCR microsoft/trocr-base-handwritten style configuration
>>> configuration = TrOCRConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = TrOCRForCausalLM(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
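Mirroring what the conversion script below assembles from real checkpoints, a randomly initialized TrOCR decoder can be paired with a ViT encoder inside `VisionEncoderDecoderModel`. The sketch below uses default configurations and dummy inputs purely for illustration:
```
import torch
from transformers import TrOCRConfig, TrOCRForCausalLM, ViTConfig, ViTModel, VisionEncoderDecoderModel

# Random weights only; the real checkpoints are loaded by the conversion script below.
encoder = ViTModel(ViTConfig(image_size=384), add_pooling_layer=False)
decoder = TrOCRForCausalLM(TrOCRConfig())
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder).eval()

pixel_values = torch.randn(1, 3, 384, 384)
decoder_input_ids = torch.tensor([[model.config.decoder.decoder_start_token_id]])
with torch.no_grad():
    logits = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids).logits
print(logits.shape)  # (1, 1, 50265) with the default vocabulary
```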
.\models\trocr\convert_trocr_unilm_to_pytorch.py
"""从 unilm 代码库转换 TrOCR 检查点。"""
import argparse
from pathlib import Path
import requests
import torch
from PIL import Image
from transformers import (
RobertaTokenizer,
TrOCRConfig,
TrOCRForCausalLM,
TrOCRProcessor,
VisionEncoderDecoderModel,
ViTConfig,
ViTImageProcessor,
ViTModel,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(encoder_config, decoder_config):
rename_keys = []
for i in range(encoder_config.num_hidden_layers):
rename_keys.append(
(f"encoder.deit.blocks.{i}.norm1.weight", f"encoder.encoder.layer.{i}.layernorm_before.weight")
)
rename_keys.append((f"encoder.deit.blocks.{i}.norm1.bias", f"encoder.encoder.layer.{i}.layernorm_before.bias"))
rename_keys.append(
(f"encoder.deit.blocks.{i}.attn.proj.weight", f"encoder.encoder.layer.{i}.attention.output.dense.weight")
)
rename_keys.append(
(f"encoder.deit.blocks.{i}.attn.proj.bias", f"encoder.encoder.layer.{i}.attention.output.dense.bias")
)
rename_keys.append(
(f"encoder.deit.blocks.{i}.norm2.weight", f"encoder.encoder.layer.{i}.layernorm_after.weight")
)
rename_keys.append((f"encoder.deit.blocks.{i}.norm2.bias", f"encoder.encoder.layer.{i}.layernorm_after.bias"))
rename_keys.append(
(f"encoder.deit.blocks.{i}.mlp.fc1.weight", f"encoder.encoder.layer.{i}.intermediate.dense.weight")
)
rename_keys.append(
(f"encoder.deit.blocks.{i}.mlp.fc1.bias", f"encoder.encoder.layer.{i}.intermediate.dense.bias")
)
rename_keys.append(
(f"encoder.deit.blocks.{i}.mlp.fc2.weight", f"encoder.encoder.layer.{i}.output.dense.weight")
)
rename_keys.append((f"encoder.deit.blocks.{i}.mlp.fc2.bias", f"encoder.encoder.layer.{i}.output.dense.bias"))
rename_keys.extend(
[
("encoder.deit.cls_token", "encoder.embeddings.cls_token"),
("encoder.deit.pos_embed", "encoder.embeddings.position_embeddings"),
("encoder.deit.patch_embed.proj.weight", "encoder.embeddings.patch_embeddings.projection.weight"),
("encoder.deit.patch_embed.proj.bias", "encoder.embeddings.patch_embeddings.projection.bias"),
("encoder.deit.norm.weight", "encoder.layernorm.weight"),
("encoder.deit.norm.bias", "encoder.layernorm.bias"),
]
)
return rename_keys
def read_in_q_k_v(state_dict, encoder_config):
for i in range(encoder_config.num_hidden_layers):
in_proj_weight = state_dict.pop(f"encoder.deit.blocks.{i}.attn.qkv.weight")
state_dict[f"encoder.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
: encoder_config.hidden_size, :
]
state_dict[f"encoder.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
encoder_config.hidden_size : encoder_config.hidden_size * 2, :
]
state_dict[f"encoder.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
-encoder_config.hidden_size :, :
]
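The slicing above splits the fused `qkv` projection row-wise into equal query/key/value blocks; a toy sketch with a small hypothetical `hidden_size`:
```
import torch

hidden_size = 4
# Fused projection of shape (3 * hidden_size, hidden_size), as in the timm/DeiT checkpoint.
in_proj_weight = torch.arange(3 * hidden_size * hidden_size, dtype=torch.float32).reshape(3 * hidden_size, hidden_size)

query = in_proj_weight[:hidden_size, :]
key = in_proj_weight[hidden_size : hidden_size * 2, :]
value = in_proj_weight[-hidden_size:, :]

assert query.shape == key.shape == value.shape == (hidden_size, hidden_size)
assert torch.equal(torch.cat([query, key, value], dim=0), in_proj_weight)
```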
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img(checkpoint_url):
if "handwritten" in checkpoint_url:
url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02-00.jpg"
elif "printed" in checkpoint_url or "stage1" in checkpoint_url:
url = "https://www.researchgate.net/profile/Dinh-Sang/publication/338099565/figure/fig8/AS:840413229350922@1577381536857/An-receipt-example-in-the-SROIE-2019-dataset_Q640.jpg"
im = Image.open(requests.get(url, stream=True).raw).convert("RGB")
return im
@torch.no_grad()
def convert_tr_ocr_checkpoint(checkpoint_url, pytorch_dump_folder_path):
"""
将模型的权重复制/粘贴/调整到我们的VisionEncoderDecoderModel结构中。
"""
encoder_config = ViTConfig(image_size=384, qkv_bias=False)
decoder_config = TrOCRConfig()
if "base" in checkpoint_url:
decoder_config.encoder_hidden_size = 768
elif "large" in checkpoint_url:
encoder_config.hidden_size = 1024
encoder_config.intermediate_size = 4096
encoder_config.num_hidden_layers = 24
encoder_config.num_attention_heads = 16
decoder_config.encoder_hidden_size = 1024
else:
raise ValueError("Should either find 'base' or 'large' in checkpoint URL")
if "large-printed" in checkpoint_url or "stage1" in checkpoint_url:
decoder_config.tie_word_embeddings = False
decoder_config.activation_function = "relu"
decoder_config.max_position_embeddings = 1024
decoder_config.scale_embedding = True
decoder_config.use_learned_position_embeddings = False
decoder_config.layernorm_embedding = False
encoder = ViTModel(encoder_config, add_pooling_layer=False)
decoder = TrOCRForCausalLM(decoder_config)
model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
model.eval()
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True)["model"]
rename_keys = create_rename_keys(encoder_config, decoder_config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, encoder_config)
del state_dict["encoder.deit.head.weight"]
del state_dict["encoder.deit.head.bias"]
del state_dict["decoder.version"]
for key, val in state_dict.copy().items():
val = state_dict.pop(key)
if key.startswith("decoder") and "output_projection" not in key:
state_dict["decoder.model." + key] = val
else:
state_dict[key] = val
model.load_state_dict(state_dict)
image_processor = ViTImageProcessor(size=encoder_config.image_size)
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-large")
processor = TrOCRProcessor(image_processor, tokenizer)
pixel_values = processor(images=prepare_img(checkpoint_url), return_tensors="pt").pixel_values
decoder_input_ids = torch.tensor([[model.config.decoder.decoder_start_token_id]])
outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
logits = outputs.logits
expected_shape = torch.Size([1, 1, 50265])
if "trocr-base-handwritten" in checkpoint_url:
expected_slice = torch.tensor(
[-1.4502, -4.6683, -0.5347, -2.9291, 9.1435, -3.0571, 8.9764, 1.7560, 8.7358, -1.5311]
)
elif "trocr-large-handwritten" in checkpoint_url:
expected_slice = torch.tensor(
[-2.6437, -1.3129, -2.2596, -5.3455, 6.3539, 1.7604, 5.4991, 1.4702, 5.6113, 2.0170]
)
elif "trocr-base-printed" in checkpoint_url:
expected_slice = torch.tensor(
[-5.6816, -5.8388, 1.1398, -6.9034, 6.8505, -2.4393, 1.2284, -1.0232, -1.9661, -3.9210]
)
elif "trocr-large-printed" in checkpoint_url:
expected_slice = torch.tensor(
[-6.0162, -7.0959, 4.4155, -5.1063, 7.0468, -3.1631, 2.6466, -0.3081, -0.8106, -1.7535]
)
if "stage1" not in checkpoint_url:
assert logits.shape == expected_shape, "Shape of logits not as expected"
assert torch.allclose(logits[0, 0, :10], expected_slice, atol=1e-3), "First elements of logits not as expected"
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving processor to {pytorch_dump_folder_path}")
processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_url",
default="https://layoutlm.blob.core.windows.net/trocr/model_zoo/fairseq/trocr-base-handwritten.pt",
type=str,
help="URL to the original PyTorch checkpoint (.pth file).",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
args = parser.parse_args()
convert_tr_ocr_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path)
.\models\trocr\modeling_trocr.py
import copy
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, logging, replace_return_docstrings
from .configuration_trocr import TrOCRConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TrOCRConfig"
_CHECKPOINT_FOR_DOC = "microsoft/trocr-base-handwritten"
TROCR_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/trocr-base-handwritten",
]
class TrOCRLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
"""`input_ids' shape is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids.shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
).expand(bsz, -1)
return super().forward(positions + self.offset)
class TrOCRSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.weights = self.get_embedding(num_positions, embedding_dim, padding_idx)
self.register_buffer("_float_tensor", torch.FloatTensor(1))
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
构建正弦嵌入。这与tensor2tensor中的实现相匹配,但与《Attention Is All You Need》第3.5节中的描述略有不同。
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
bsz, seq_len = input_ids.size()
position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
max_pos = self.padding_idx + 1 + seq_len
if self.weights is None or max_pos > self.weights.size(0):
self.weights = self.get_embedding(max_pos, self.embedding_dim, self.padding_idx)
self.weights = self.weights.to(self._float_tensor)
x = self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
return x
def create_position_ids_from_input_ids(
self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
):
"""
将非填充符号替换为它们的位置号码。位置号码从padding_idx+1开始。忽略填充符号。这是从fairseq的`utils.make_positions`修改而来。
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
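A toy sketch of how `create_position_ids_from_input_ids` assigns positions (the token ids and `padding_idx` below are made up for illustration): non-padding tokens count up from `padding_idx + 1`, padding tokens keep `padding_idx`:
```
import torch

padding_idx = 1
past_key_values_length = 0
input_ids = torch.tensor([[0, 5, 6, padding_idx, padding_idx]])

mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
position_ids = incremental_indices.long() + padding_idx
print(position_ids)  # tensor([[2, 3, 4, 1, 1]])
```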
class TrOCRAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper."""
def __init__(
self,
config,
embed_dim: int,
num_heads: int,
kdim: int = None,
vdim: int = None,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_cross_attention: bool = False,
):
super().__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if not (self.head_dim * num_heads == self.embed_dim):
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
class TrOCRDecoderLayer(nn.Module):
def __init__(self, config: TrOCRConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = TrOCRAttention(
config,
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
if config.is_decoder:
self.encoder_attn = TrOCRAttention(
config,
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
kdim=config.cross_attention_hidden_size,
vdim=config.cross_attention_hidden_size,
dropout=config.attention_dropout,
is_decoder=True,
is_cross_attention=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
class TrOCRPreTrainedModel(PreTrainedModel):
config_class = TrOCRConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
TROCR_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`TrOCRConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
class TrOCRDecoder(TrOCRPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TrOCRDecoderLayer`]
Args:
config: TrOCRConfig
"""
def __init__(self, config: TrOCRConfig):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
if config.use_learned_position_embeddings:
self.embed_positions = TrOCRLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
else:
self.embed_positions = TrOCRSinusoidalPositionalEmbedding(
config.max_position_embeddings + self.padding_idx + 1,
config.hidden_size,
self.padding_idx,
)
if config.layernorm_embedding:
self.layernorm_embedding = nn.LayerNorm(config.hidden_size)
else:
self.layernorm_embedding = None
self.layers = nn.ModuleList([TrOCRDecoderLayer(config) for _ in range(config.decoder_layers)])
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
@add_start_docstrings(
"The TrOCR Model with a language modeling head. Can be used for summarization.",
TROCR_START_DOCSTRING,
)
class TrOCRDecoderWrapper(TrOCRPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = TrOCRDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
@add_start_docstrings(
"The TrOCR Decoder with a language modeling head. Can be used as the decoder part of [`EncoderDecoderModel`] and"
" [`VisionEncoderDecoder`].",
TROCR_START_DOCSTRING,
)
class TrOCRForCausalLM(TrOCRPreTrainedModel):
_tied_weights_keys = ["output_projection.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = TrOCRDecoderWrapper(config)
self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.output_projection
def set_output_embeddings(self, new_embeddings):
self.output_projection = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
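A toy sketch of the trimming rule in `prepare_inputs_for_generation` above (dummy token ids and an assumed cache length): once a cache covering `past_length` tokens exists, only the unprocessed suffix is fed back to the decoder:
```
import torch

input_ids = torch.tensor([[2, 17, 305, 9]])  # tokens decoded so far
past_length = 3                              # cache already covers the first 3

if input_ids.shape[1] > past_length:
    remove_prefix_length = past_length
else:
    # default to the old behaviour: keep only the final token
    remove_prefix_length = input_ids.shape[1] - 1

trimmed = input_ids[:, remove_prefix_length:]
print(trimmed)  # tensor([[9]])
```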
.\models\trocr\processing_trocr.py
"""
Processor class for TrOCR.
"""
import warnings
from contextlib import contextmanager
from ...processing_utils import ProcessorMixin
class TrOCRProcessor(ProcessorMixin):
r"""
Constructs a TrOCR processor which wraps a vision image processor and a TrOCR tokenizer into a single processor.
[`TrOCRProcessor`] offers all the functionalities of [`ViTImageProcessor`/`DeiTImageProcessor`] and
[`RobertaTokenizer`/`XLMRobertaTokenizer`]. See the [`~TrOCRProcessor.__call__`] and [`~TrOCRProcessor.decode`] for
more information.
Args:
image_processor ([`ViTImageProcessor`/`DeiTImageProcessor`], *optional*):
An instance of [`ViTImageProcessor`/`DeiTImageProcessor`]. The image processor is a required input.
tokenizer ([`RobertaTokenizer`/`XLMRobertaTokenizer`], *optional*):
An instance of [`RobertaTokenizer`/`XLMRobertaTokenizer`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
self._in_target_context_manager = False
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
[`~AutoImageProcessor.__call__`] and returns its output. If used in the context
[`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to TrOCRTokenizer's
[`~TrOCRTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
images = kwargs.pop("images", None)
text = kwargs.pop("text", None)
if len(args) > 0:
images = args[0]
args = args[1:]
if images is None and text is None:
raise ValueError("You need to specify either an `images` or `text` input to process.")
if images is not None:
inputs = self.image_processor(images, *args, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif images is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR.
"""
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your images inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.image_processor
self._in_target_context_manager = False
@property
def feature_extractor_class(self):
"""
Warns about deprecation of `feature_extractor_class` and suggests using `image_processor_class`.
"""
warnings.warn(
"`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
FutureWarning,
)
return self.image_processor_class
@property
def feature_extractor(self):
warnings.warn(
"`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
FutureWarning,
)
return self.image_processor
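A minimal end-to-end sketch of using `TrOCRProcessor` together with `VisionEncoderDecoderModel` for OCR inference; it downloads the `microsoft/trocr-base-handwritten` checkpoint and reuses the sample image URL from the conversion script above:
```
import requests
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02-00.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```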
.\models\trocr\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_speech_available,
is_torch_available,
)
_import_structure = {
"configuration_trocr": ["TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP", "TrOCRConfig"],
"processing_trocr": ["TrOCRProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_trocr"] = [
"TROCR_PRETRAINED_MODEL_ARCHIVE_LIST",
"TrOCRForCausalLM",
"TrOCRPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_trocr import TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP, TrOCRConfig
from .processing_trocr import TrOCRProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_trocr import TROCR_PRETRAINED_MODEL_ARCHIVE_LIST, TrOCRForCausalLM, TrOCRPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\tvlt\configuration_tvlt.py
""" TVLT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"ZinengTang/tvlt-base": "https://huggingface.co/ZinengTang/tvlt-base/blob/main/config.json",
}
class TvltConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the TVLT
[ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import TvltConfig, TvltModel
>>> # Initializing a TVLT ZinengTang/tvlt-base style configuration
>>> configuration = TvltConfig()
>>> # Initializing a model (with random weights) from the ZinengTang/tvlt-base style configuration
>>> model = TvltModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "tvlt"
def __init__(
self,
image_size=224,
spectrogram_length=2048,
frequency_length=128,
image_patch_size=[16, 16],
audio_patch_size=[16, 16],
num_image_channels=3,
num_audio_channels=1,
num_frames=8,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-6,
qkv_bias=True,
use_mean_pooling=False,
decoder_num_attention_heads=16,
decoder_hidden_size=512,
decoder_num_hidden_layers=8,
decoder_intermediate_size=2048,
pixel_mask_ratio=0.75,
audio_mask_ratio=0.15,
audio_mask_type="frame-level",
task_matching=True,
task_mae=True,
loss_type="classification",
**kwargs,
):
"""
Initializes TvltConfig with various parameters to define the TVLT model architecture and behavior.
"""
super().__init__(**kwargs)
self.image_size = image_size
self.spectrogram_length = spectrogram_length
self.frequency_length = frequency_length
self.image_patch_size = image_patch_size
self.audio_patch_size = audio_patch_size
self.num_image_channels = num_image_channels
self.num_audio_channels = num_audio_channels
self.num_frames = num_frames
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.use_mean_pooling = use_mean_pooling
self.decoder_num_attention_heads = decoder_num_attention_heads
self.decoder_hidden_size = decoder_hidden_size
self.decoder_num_hidden_layers = decoder_num_hidden_layers
self.decoder_intermediate_size = decoder_intermediate_size
self.pixel_mask_ratio = pixel_mask_ratio
self.audio_mask_ratio = audio_mask_ratio
self.audio_mask_type = audio_mask_type
self.task_matching = task_matching
self.task_mae = task_mae
self.loss_type = loss_type
self.update(kwargs)
):
super().__init__(**kwargs)
if audio_mask_type not in ("frame-level", "patch_level"):
raise ValueError(
"audio_mask_type must be one of two acceptable strategies - {'frame_level', 'patch-level') "
f"got {audio_mask_type}"
)
self.image_size = image_size
self.spectrogram_length = spectrogram_length
self.frequency_length = frequency_length
self.image_patch_size = image_patch_size
self.audio_patch_size = audio_patch_size
self.num_image_channels = num_image_channels
self.num_audio_channels = num_audio_channels
self.num_frames = num_frames
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.use_mean_pooling = use_mean_pooling
self.decoder_num_attention_heads = decoder_num_attention_heads
self.decoder_hidden_size = decoder_hidden_size
self.decoder_num_hidden_layers = decoder_num_hidden_layers
self.decoder_intermediate_size = decoder_intermediate_size
self.pixel_mask_ratio = pixel_mask_ratio
self.audio_mask_ratio = audio_mask_ratio
self.audio_mask_type = audio_mask_type
self.task_matching = task_matching
self.task_mae = task_mae
self.loss_type = loss_type
.\models\tvlt\feature_extraction_tvlt.py
"""TVLT的特征提取器类。"""
from math import ceil
from typing import List, Optional, Union
import numpy as np
from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class TvltFeatureExtractor(SequenceFeatureExtractor):
r"""
构造一个TVLT音频特征提取器。此特征提取器用于准备模型的音频输入数据。
此特征提取器继承自[`FeatureExtractionMixin`],其中包含大多数主要方法。用户
应参考此超类以获取有关这些方法的更多信息。
Args:
spectrogram_length (`Dict[str, int]` *可选*, 默认为2048):
每个音频频谱图的时间长度。
num_channels (`int` *可选*, 默认为1):
音频通道数。
patch_size (`List[int]` *可选*, 默认为`[16, 16]`):
音频补丁嵌入的补丁大小。
feature_size (`int`, *可选*, 默认为128):
音频频谱图的频率长度。
sampling_rate (`int`, *可选*, 默认为44100):
应数字化音频文件的采样率,以赫兹(Hz)表示。
hop_length_to_sampling_rate (`int`, *可选*, 默认为86):
Hop length是用于获取Mel频率系数的STFT的重叠窗口的长度。
例如,对于采样率44100,跳跃长度为512,即44100 / 512 = 86。
n_fft (`int`, *可选*, 默认为2048):
傅里叶变换的大小。
padding_value (`float`, *可选*, 默认为0.0):
用于填充音频的填充值。应该对应于静音部分。
"""
model_input_names = ["audio_values", "audio_mask"]
def __init__(
self,
spectrogram_length=2048,
num_channels=1,
patch_size=[16, 16],
feature_size=128,
sampling_rate=44100,
hop_length_to_sampling_rate=86,
n_fft=2048,
padding_value=0.0,
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
**kwargs,
)
self.spectrogram_length = spectrogram_length
self.num_channels = num_channels
self.patch_size = patch_size
self.freq_len = feature_size // self.patch_size[1]
self.n_fft = n_fft
self.hop_length = sampling_rate // hop_length_to_sampling_rate
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.mel_filters = mel_filter_bank(
num_frequency_bins=1 + n_fft // 2,
num_mel_filters=feature_size,
min_frequency=0.0,
max_frequency=22050.0,
sampling_rate=sampling_rate,
norm="slaney",
mel_scale="slaney",
).T
def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
"""
Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
implementation with 1e-5 tolerance.
"""
log_spec = spectrogram(
waveform,
window_function(self.n_fft, "hann"),
frame_length=self.n_fft,
hop_length=self.hop_length,
power=2.0,
mel_filters=self.mel_filters.T,
log_mel="dB",
db_range=80.0,
)
log_spec = log_spec[:, :-1]
log_spec = log_spec - 20.0
log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0
return log_spec
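A toy sketch of the normalization at the end of `_np_extract_fbank_features` (the dB values below are made up): the spectrogram is shifted by -20 dB, scaled by 1/40, clipped to [-2, 0] and shifted by +1, mapping the retained dynamic range into [-1, 1]. The `hop_length` arithmetic from the docstring above is shown as well:
```
import numpy as np

# hop_length_to_sampling_rate = 86 at 44100 Hz gives the hop length described above.
hop_length = 44100 // 86
print(hop_length)  # 512

log_spec_db = np.array([-90.0, -60.0, -20.0, 0.0])  # example dB magnitudes
normalized = np.clip((log_spec_db - 20.0) / 40.0, -2.0, 0.0) + 1.0
print(normalized)  # [-1.  -1.   0.   0.5]
```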
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = True,
sampling_rate: Optional[int] = None,
resample: bool = False,
mask_audio: bool = False,
**kwargs,
.\models\tvlt\image_processing_tvlt.py
"""TVLT 的图像处理类。"""
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
def make_batched(videos) -> List[List[ImageInput]]:
"""将输入的视频或图像列表转换为批处理列表形式。
Args:
videos: 输入的视频或图像列表
Returns:
List[List[ImageInput]]: 批处理后的视频或图像列表
Raises:
ValueError: 如果无法从输入中生成批处理视频
"""
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)):
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
videos_dim = np.array(videos[0]).ndim
if videos_dim == 3:
return [videos]
elif videos_dim == 4:
return videos
elif is_valid_image(videos):
videos_dim = np.array(videos).ndim
if videos_dim == 3:
return [[videos]]
elif videos_dim == 4:
return [videos]
elif videos_dim == 5:
return videos
raise ValueError(f"Could not make batched video from {videos}")
class TvltImageProcessor(BaseImageProcessor):
r"""
构造一个 TVLT 图像处理器。
此处理器可用于通过将图像转换为单帧视频来为模型准备视频或图像。
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
`do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
Size of the output image after resizing. The shortest edge of the image will be resized to
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
`size` in the `preprocess` method.
patch_size (`List[int]` *optional*, defaults to [16,16]):
The patch size of image patch embedding.
num_frames (`int` *optional*, defaults to 8):
The maximum number of video frames.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
parameter in the `preprocess` method.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to 1/255):
Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
# 定义模型输入的名称列表,包含四个元素
model_input_names = [
"pixel_values", # 像素数值
"pixel_mask", # 像素掩码
"pixel_values_mixed", # 混合像素数值
"pixel_mask_mixed", # 混合像素掩码
]
# 初始化方法,用于设置图像处理器的各种参数和属性
def __init__(
self,
do_resize: bool = True, # 是否进行图像大小调整,默认为True
size: Dict[str, int] = None, # 图像大小的字典,包含最短边或其他指定尺寸,默认为{"shortest_edge": 224}
patch_size: List[int] = [16, 16], # 图像的分块大小,默认为[16, 16]
num_frames: int = 8, # 处理视频时的帧数,默认为8
resample: PILImageResampling = PILImageResampling.BILINEAR, # 图像重采样方法,默认为双线性插值
do_center_crop: bool = True, # 是否进行中心裁剪,默认为True
crop_size: Dict[str, int] = None, # 裁剪后图像的尺寸,默认为{"height": 224, "width": 224}
do_rescale: bool = True, # 是否进行图像像素值缩放,默认为True
rescale_factor: Union[int, float] = 1 / 255, # 图像像素值缩放因子,默认为1/255
do_normalize: bool = True, # 是否进行图像归一化,默认为True
image_mean: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_MEAN, # 图像归一化均值,默认为ImageNet标准均值
image_std: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_STD, # 图像归一化标准差,默认为ImageNet标准标准差
init_mask_generator=False, # 是否初始化遮罩生成器,默认为False
**kwargs, # 其他可选参数
) -> None:
# 调用父类的初始化方法
super().__init__(**kwargs)
# 如果未提供size参数,则设置默认的size字典
size = size if size is not None else {"shortest_edge": 224}
# 根据提供的size参数获取最终的size字典,保证其含有必要的尺寸信息
size = get_size_dict(size, default_to_square=False)
# 如果未提供crop_size参数,则设置默认的crop_size字典
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
# 根据提供的crop_size参数获取最终的crop_size字典
crop_size = get_size_dict(crop_size, param_name="crop_size")
# 将初始化方法中的各个参数设置为对象的属性
self.do_resize = do_resize
self.size = size
self.patch_size = patch_size
self.num_frames = num_frames
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
# 定义一个包含所有有效处理器键的列表,用于后续验证和使用
self._valid_processor_keys = [
"videos",
"do_resize",
"size",
"patch_size",
"num_frames",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"is_mixed",
"return_tensors",
"data_format",
"input_data_format",
]
# 图像大小调整方法,用于调整输入图像的尺寸
def resize(
self,
image: np.ndarray, # 输入的图像数组
size: Dict[str, int], # 目标图像尺寸的字典
resample: PILImageResampling = PILImageResampling.BILINEAR, # 图像重采样方法,默认为双线性插值
data_format: Optional[Union[str, ChannelDimension]] = None, # 数据格式参数
input_data_format: Optional[Union[str, ChannelDimension]] = None, # 输入数据的格式参数
**kwargs, # 其他可选参数
) -> np.ndarray:
"""
Resize an image.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
shortest edge of length `s` while keeping the aspect ratio of the original image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# 根据 size 获取实际的大小字典,确保不是默认的正方形输出
size = get_size_dict(size, default_to_square=False)
# 如果 size 字典中包含 "shortest_edge" 键
if "shortest_edge" in size:
# 根据最短边长度调整输出图像大小
output_size = get_resize_output_image_size(
image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
)
# 如果 size 字典中包含 "height" 和 "width" 键
elif "height" in size and "width" in size:
# 设置输出大小为指定的高度和宽度
output_size = (size["height"], size["width"])
else:
# 如果 size 字典既不包含 "shortest_edge" 也不同时包含 "height" 和 "width" 键,抛出数值错误
raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
# 调用 resize 函数,返回调整大小后的图像
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def _preprocess_image(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Preprocesses a single image."""
validate_preprocess_arguments(
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_resize=do_resize,
size=size,
resample=resample,
)
# All transformations expect numpy arrays.
image = to_numpy_array(image) # Convert input image to numpy array format
if is_scaled_image(image) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image) # Infer input data format if not provided
if do_resize:
image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) # Resize image if required
if do_center_crop:
image = self.center_crop(image, size=crop_size, input_data_format=input_data_format) # Perform center cropping if specified
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) # Rescale image if specified
if do_normalize:
image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) # Normalize image if specified
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # Convert image to desired channel dimension format
return image
def preprocess(
self,
videos: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
patch_size: List[int] = None,
num_frames: int = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
is_mixed: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\tvlt\modeling_tvlt.py
import collections.abc
import math
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_tvlt import TvltConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TvltConfig"
_CHECKPOINT_FOR_DOC = "ZinengTang/tvlt-base"
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ZinengTang/tvlt-base",
]
@dataclass
class TvltModelOutput(ModelOutput):
"""
Class for TvltModel's outputs, with potential hidden states and attentions.
"""
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层输出的隐藏状态序列。
last_pixel_hidden_state (`torch.FloatTensor` of shape `(batch_size, pixel_sequence_length, hidden_size)`):
模型最后一层输出的像素序列的隐藏状态。
last_audio_hidden_state (`torch.FloatTensor` of shape `(batch_size, audio_sequence_length, hidden_size)`):
模型最后一层输出的音频序列的隐藏状态。
pixel_label_masks (`torch.FloatTensor` of shape `(batch_size, pixel_patch_length)`):
表示哪些像素补丁被掩盖(置为1),哪些未被掩盖(置为0)的张量。
audio_label_masks (`torch.FloatTensor` of shape `(batch_size, audio_patch_length)`):
表示哪些音频补丁被掩盖(置为1),哪些未被掩盖(置为0)的张量。
pixel_ids_restore (`torch.LongTensor` of shape `(batch_size, pixel_patch_length)`):
像素掩盖的id排列顺序的张量。
audio_ids_restore (`torch.LongTensor` of shape `(batch_size, audio_patch_length)`):
音频掩盖的id排列顺序的张量。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组,包含模型每层的隐藏状态张量(嵌入输出和每层的输出),形状为 `(batch_size, sequence_length, hidden_size)`。
当参数 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
元组,包含模型每层的注意力权重张量,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
注意力softmax后的注意力权重,用于计算自注意力头中的加权平均。
当参数 `output_attentions=True` 或 `config.output_attentions=True` 时返回。
"""
# 初始化函数参数,均为None,表示这些参数在调用时可以传入具体的张量数据
last_hidden_state: torch.FloatTensor = None
last_pixel_hidden_state: torch.FloatTensor = None
last_audio_hidden_state: torch.FloatTensor = None
pixel_label_masks: torch.LongTensor = None
audio_label_masks: torch.LongTensor = None
pixel_ids_restore: torch.LongTensor = None
audio_ids_restore: torch.LongTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# TvltDecoderOutput 类用于存储 TvltDecoder 模型的输出结果,可能包含隐藏状态和注意力信息
@dataclass
class TvltDecoderOutput(ModelOutput):
"""
Class for TvltDecoder's outputs, with potential hidden states and attentions.
Args:
logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
Pixel reconstruction logits. 像素重构的逻辑回归输出。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs. 模型每一层输出的隐藏状态,包括初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads. 经过注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
"""
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# TvltForPreTrainingOutput 类用于存储 TvltForPreTraining 模型的输出结果,可能包含隐藏状态和注意力信息
@dataclass
class TvltForPreTrainingOutput(ModelOutput):
"""
Class for TvltForPreTraining's outputs, with potential hidden states and attentions.
Args:
loss (`torch.FloatTensor` of shape `(1,)`):
Pixel reconstruction loss. 像素重构损失。
matching_logits (`torch.FloatTensor` of shape `(batch_size, 1)`):
Matching objective logits. 匹配目标的逻辑回归输出。
pixel_logits (`torch.FloatTensor` of shape
`(batch_size, pixel_patch_length, image_patch_size ** 3 * pixel_num_channels)`): Pixel reconstruction
logits. 像素重构的逻辑回归输出。
audio_logits (`torch.FloatTensor` of shape
`(batch_size, audio_patch_length, image_patch_size[0] * image_patch_size[1])`): Audio reconstruction
logits. 音频重构的逻辑回归输出。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs. 模型每一层输出的隐藏状态,包括初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads. 经过注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
"""
loss: Optional[torch.FloatTensor] = None
# 定义一个变量 matching_logits,类型为 torch 的 FloatTensor,初始值为 None,用于存储匹配 logits
matching_logits: torch.FloatTensor = None
# 定义一个变量 pixel_logits,类型为 torch 的 FloatTensor,初始值为 None,用于存储像素 logits
pixel_logits: torch.FloatTensor = None
# 定义一个变量 audio_logits,类型为 torch 的 FloatTensor,初始值为 None,用于存储音频 logits
audio_logits: torch.FloatTensor = None
# 定义一个变量 hidden_states,类型为可选的元组,元素为 torch 的 FloatTensor,初始值为 None,用于存储隐藏状态
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 定义一个变量 attentions,类型为可选的元组,元素为 torch 的 FloatTensor,初始值为 None,用于存储注意力机制
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# Generate noise for pixel masking (MAE-style masking of the video patches).
def generate_pixel_mask_noise(pixel_values, pixel_mask=None, mask_ratio=0.75):
"""Generate noise for pixel masking."""
# 获取批次大小和序列长度
batch_size, seq_len = pixel_values.shape[:2]
# 生成在 [0, 1] 范围内的随机噪声
noise = torch.rand((batch_size, seq_len), device=pixel_values.device) # noise in [0, 1]
# 计算需要保留的序列长度
len_keep = int(seq_len * (1 - mask_ratio))
return noise, len_keep
# 生成用于音频屏蔽的噪声。
def generate_audio_mask_noise(audio_values, audio_mask=None, mask_ratio=0.75, mask_type="patch-level", freq_len=8):
"""Generate noise for audio masking."""
# 获取批次大小和序列长度
batch_size, seq_len = audio_values.shape[:2]
if mask_type == "frame-level":
# 计算帧级别的时间片段数
num_time_patches = seq_len // freq_len
# 生成 [0, 1] 范围内的随机噪声并重复以匹配序列长度
noise = (
torch.rand(batch_size, num_time_patches, device=audio_values.device)
.unsqueeze(-1)
.repeat(1, 1, freq_len)
.view(batch_size, seq_len)
) # noise in [0, 1]
elif mask_type == "patch-level":
# 生成 [0, 1] 范围内的随机噪声
noise = torch.rand(batch_size, seq_len, device=audio_values.device) # noise in [0, 1]
# 计算需要保留的序列长度
len_keep = int(seq_len * (1 - mask_ratio))
return noise, len_keep
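A toy sketch of the `"frame-level"` branch of `generate_audio_mask_noise`: one random value is drawn per time frame and repeated across that frame's `freq_len` patches, so a whole frame is kept or dropped together (the sizes below are illustrative):
```
import torch

batch_size, seq_len, freq_len = 1, 8, 4       # 2 time frames x 4 frequency patches
num_time_patches = seq_len // freq_len
noise = (
    torch.rand(batch_size, num_time_patches)  # one value per time frame
    .unsqueeze(-1)
    .repeat(1, 1, freq_len)                   # repeated across frequency patches
    .view(batch_size, seq_len)
)
print(noise)  # the first 4 values are identical, and so are the last 4
```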
# Random masking by per-sample shuffling; the shuffling order comes from argsorting the random noise.
def random_masking(sequence, noise, len_keep, attention_masks=None):
"""
Perform random masking by per-sample shuffling on frame-level. Per-sample shuffling is done by argsort random
noise. sequence: [batch_size, seq_len, hidden_dim], sequence
"""
batch_size, seq_len, hidden_dim = sequence.shape
# Sort the noise for each sample
ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove
# Indices that restore the original order
ids_restore = torch.argsort(ids_shuffle, dim=1)
# Keep the first subset
ids_keep = ids_shuffle[:, :len_keep]
# Gather the kept patches using the shuffled indices
sequence_masked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, hidden_dim))
# Build the binary mask: 0 means keep, 1 means remove
label_masks = torch.ones([batch_size, seq_len], device=sequence.device)
label_masks[:, :len_keep] = 0
# Unshuffle with ids_restore to get the binary mask in the original order
label_masks = torch.gather(label_masks, dim=1, index=ids_restore)
if attention_masks is not None:
# If attention masks are given, combine them with the binary mask
label_masks *= attention_masks
# Gather the attention masks for the kept patches using ids_keep
attention_masks = torch.gather(attention_masks, dim=1, index=ids_keep)
return sequence_masked, attention_masks, label_masks, ids_restore
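# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# A minimal example of how the three helpers above chain together: draw per-patch
# noise, keep a random 25% of the patches, and get back the binary label mask plus
# the restore indices used later by the decoder. The `patch_embeddings` tensor here
# is a stand-in for the output of the embedding modules.
import torch

batch_size, seq_len, hidden_dim = 2, 16, 8
patch_embeddings = torch.randn(batch_size, seq_len, hidden_dim)

noise, len_keep = generate_pixel_mask_noise(patch_embeddings, mask_ratio=0.75)
masked, attn_masks, label_masks, ids_restore = random_masking(patch_embeddings, noise, len_keep)

print(masked.shape)       # torch.Size([2, 4, 8])  -> only 4 of 16 patches are kept
print(label_masks.sum())  # tensor(24.)            -> 1 marks a removed patch (2 * 12)
print(ids_restore.shape)  # torch.Size([2, 16])    -> used to unshuffle the decoder input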
class TvltPixelEmbeddings(nn.Module):
"""Construct the patch and position embeddings."""
def __init__(self, config):
super().__init__()
# Patch embeddings for the pixel branch
self.patch_embeddings = TvltPixelPatchEmbeddings(config)
self.num_patches_per_image = self.patch_embeddings.num_patches_per_image
# Type embedding, temporal embedding and positional embedding parameters
self.type_embed_v = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
self.pos_embed_v = nn.Parameter(torch.zeros(1, self.num_patches_per_image, config.hidden_size))
self.config = config
def forward(self, pixel_values, attention_masks=None):
# Forward pass of the pixel embeddings
# Read the input tensor dimensions
batch_size, num_frames, num_channels, height, width = pixel_values.shape
# Turn the pixel values into patch embeddings
embeddings = self.patch_embeddings(pixel_values)
# Add the positional embeddings, repeated num_frames times so every frame is covered
embeddings += self.pos_embed_v.repeat(1, num_frames, 1)
# Repeat each temporal embedding num_patches_per_image times so every patch of a frame gets the same value
embeddings += torch.repeat_interleave(self.temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1)
# Add the type embedding that marks these tokens as visual tokens
embeddings += self.type_embed_v
# Return the embeddings and the (optional) attention masks
return embeddings, attention_masks
class TvltAudioEmbeddings(nn.Module):
"""Construct the patch and position embeddings."""
def __init__(self, config):
super().__init__()
# Patch embeddings for the audio branch
self.patch_embeddings = TvltAudioPatchEmbeddings(config)
# Total number of audio patches
self.num_patches = self.patch_embeddings.num_patches
# Type embedding for audio tokens
self.type_embed_a = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
# Number of patches along the frequency axis
self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
# Positional embedding along the time axis
self.pos_embed_a = nn.Parameter(torch.zeros(1, self.num_patches // self.num_freq_patches, config.hidden_size))
# Embedding along the frequency axis
self.freq_embed = nn.Parameter(torch.zeros(1, self.num_freq_patches, config.hidden_size))
# Recompute the number of frequency patches (same value as above)
self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
# Store the configuration
self.config = config
def forward(self, audio_values, attention_masks=None):
# Create the patch embeddings
embeddings = self.patch_embeddings(audio_values)
# Number of patches along the time axis
num_time_patches = embeddings.size(1) // self.num_freq_patches
# Add the frequency embeddings, repeated for every time patch
embeddings += self.freq_embed.repeat(1, num_time_patches, 1)
# Add the positional (time) embeddings, repeated for every frequency patch
embeddings += torch.repeat_interleave(self.pos_embed_a[:, :num_time_patches], self.num_freq_patches, dim=1)
# Add the type embedding that marks these tokens as audio tokens
embeddings += self.type_embed_a
# Return the embeddings and the (optional) attention masks
return embeddings, attention_masks
class TvltPixelPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
# Image size, patch size, number of channels and hidden size from the config
image_size, patch_size = config.image_size, config.image_patch_size
num_channels, hidden_size = config.num_image_channels, config.hidden_size
# Make sure image size and patch size are tuples
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# Number of patches per image
num_patches_per_image = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
# Store the initialization parameters
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches_per_image = num_patches_per_image
self.hidden_size = hidden_size
# Convolutional projection from pixels to patch embeddings
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
# Forward pass that turns pixel values into patch embeddings
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
# Read batch size, number of frames, channels, height and width from the input
batch_size, num_frames, num_channels, height, width = pixel_values.shape
# Check that the channel dimension matches the configuration, otherwise raise a ValueError
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
# Check that the input height and width match the configured image size, otherwise raise a ValueError
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
# Reshape the pixel values to (batch_size * num_frames, num_channels, height, width)
pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width)
# Project the pixels, then flatten the spatial dimensions and transpose
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
# Reshape to (batch_size, num_frames * self.num_patches_per_image, self.hidden_size)
embeddings = embeddings.reshape(batch_size, num_frames * self.num_patches_per_image, self.hidden_size)
# Return the patch embeddings
return embeddings
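# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Back-of-the-envelope check of the sequence length produced by TvltPixelPatchEmbeddings.
# The numbers assume a 224x224 input, 16x16 patches and 8 frames, which may differ from
# the values in your TvltConfig.
image_size, patch_size, num_frames, hidden_size = 224, 16, 8, 768

num_patches_per_image = (image_size // patch_size) ** 2  # 14 * 14 = 196
seq_length = num_frames * num_patches_per_image          # 8 * 196 = 1568

# pixel_values: (batch, 8, 3, 224, 224)  ->  embeddings: (batch, 1568, 768)
print(num_patches_per_image, seq_length)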
class TvltAudioPatchEmbeddings(nn.Module):
"""
This class turns `audio_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
# Spectrogram length, frequency length and patch size from the config
spectrogram_length, frequency_length, patch_size = (
config.spectrogram_length,
config.frequency_length,
config.audio_patch_size,
)
# Number of audio channels and hidden size from the config
num_channels, hidden_size = config.num_audio_channels, config.hidden_size
# Spectrogram size as a tuple (spectrogram_length, frequency_length)
spectrogram_size = (spectrogram_length, frequency_length)
# Make sure the patch size is a tuple
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# Total number of patches
num_patches = (spectrogram_size[1] // patch_size[1]) * (spectrogram_size[0] // patch_size[0])
# Patch grid shape (time patches, frequency patches)
patch_shape = (spectrogram_size[0] // patch_size[0], spectrogram_size[1] // patch_size[1])
# Store spectrogram size, patch size, number of channels, number of patches and patch shape
self.spectrogram_size = spectrogram_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.patch_shape = patch_shape
# Convolutional projection from the spectrogram to patch embeddings
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, audio_values: torch.Tensor) -> torch.Tensor:
# Read the input shape (batch_size, num_channels, height, width)
batch_size, num_channels, height, width = audio_values.shape
# Raise a ValueError if the channel dimension does not match the configuration
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the audio values match with the one set in the configuration."
)
# Raise a ValueError if the input height exceeds the configured spectrogram length or the width differs from the frequency length
if height > self.spectrogram_size[0] or width != self.spectrogram_size[1]:
raise ValueError(
f"Input audio size ({height}*{width}) doesn't match model"
f" ({self.spectrogram_size[0]}*{self.spectrogram_size[1]})."
)
# Project the audio values, then flatten to shape (batch_size, seq_length, hidden_size)
embeddings = self.projection(audio_values).flatten(2).transpose(1, 2)
# Return the patch embeddings
return embeddings
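# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Analogous shape check for the audio branch. The numbers assume a 2048x128
# spectrogram and 16x16 patches; adjust them to your TvltConfig.
spectrogram_length, frequency_length, audio_patch = 2048, 128, 16

num_freq_patches = frequency_length // audio_patch    # 128 / 16 = 8
num_time_patches = spectrogram_length // audio_patch  # 2048 / 16 = 128
num_patches = num_freq_patches * num_time_patches     # 1024

# audio_values: (batch, 1, height <= 2048, 128)  ->  embeddings: (batch, (height // 16) * 8, hidden_size)
print(num_freq_patches, num_patches)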
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfAttention with Vilt->Tvlt
class TvltSelfAttention(nn.Module):
# Constructor taking a configuration object
def __init__(self, config):
# Call the parent constructor
super().__init__()
# Check that the hidden size is divisible by the number of attention heads (unless an embedding_size attribute exists)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
# Otherwise raise a ValueError
raise ValueError(
f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
# Number of attention heads and size of each head
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
# Linear layers for query, key and value, with optional bias
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
# Dropout applied to the attention probabilities
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
# Reshape the input tensor x so attention scores can be computed per head
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
# Forward pass taking hidden states, an attention mask, a head mask and an output_attentions flag
def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
# Project the hidden states to queries
mixed_query_layer = self.query(hidden_states)
# Project to keys and values and reshape them per head
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
# Dot product of queries and keys gives the raw attention scores
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask if one is given
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities
attention_probs = nn.Softmax(dim=-1)(attention_scores)
# Apply dropout to the attention probabilities
attention_probs = self.dropout(attention_probs)
# Apply the head mask if one is given
if head_mask is not None:
attention_probs = attention_probs * head_mask
# Weighted sum of the value vectors gives the context
context_layer = torch.matmul(attention_probs, value_layer)
# Rearrange the context back to [batch_size, seq_length, all_head_size]
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
# Return the context, and the attention probabilities when requested
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
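# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# What `transpose_for_scores` does to the shapes, assuming 12 heads of size 64
# (i.e. a hidden size of 768).
import torch

batch, seq_len, num_heads, head_size = 2, 10, 12, 64
x = torch.randn(batch, seq_len, num_heads * head_size)
x = x.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 12, 10, 64]) -> attention scores become (2, 12, 10, 10)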
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfOutput with Vilt->Tvlt
class TvltSelfOutput(nn.Module):
"""
The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: TvltConfig) -> None:
super().__init__()
# Dense layer mapping hidden_size to hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Dropout layer that zeroes inputs with probability config.hidden_dropout_prob
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Apply the dense layer to the hidden states
hidden_states = self.dense(hidden_states)
# Apply dropout to the dense output
hidden_states = self.dropout(hidden_states)
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltAttention with Vilt->Tvlt
class TvltAttention(nn.Module):
def __init__(self, config):
super().__init__()
# Instantiate TvltSelfAttention and TvltSelfOutput
self.attention = TvltSelfAttention(config)
self.output = TvltSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
# Nothing to do if the list of heads is empty
if len(heads) == 0:
return
# Find the prunable heads and the corresponding indices
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
# Prune the linear layers of the attention and output modules
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update the hyperparameters and remember the pruned heads
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
# Run TvltSelfAttention on the hidden states
self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
# Run TvltSelfOutput on the attention output and the original hidden states
attention_output = self.output(self_outputs[0], hidden_states)
# Append the attention weights to the outputs when they are requested
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltIntermediate with Vilt->Tvlt
class TvltIntermediate(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
# Dense layer mapping hidden_size to intermediate_size
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
# Select the hidden activation function from the config
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# Forward pass: linear projection followed by the activation function
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Linear transformation of the hidden states
hidden_states = self.dense(hidden_states)
# Apply the intermediate activation function
hidden_states = self.intermediate_act_fn(hidden_states)
# Return the transformed hidden states
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltOutput with Vilt->Tvlt
class TvltOutput(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
# Dense layer mapping intermediate_size back to hidden_size
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
# Dropout layer used to regularize during training
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Linear transformation of the hidden states
hidden_states = self.dense(hidden_states)
# Apply dropout
hidden_states = self.dropout(hidden_states)
# Residual connection: add the input tensor back to the transformed hidden states
hidden_states = hidden_states + input_tensor
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltLayer with Vilt->Tvlt
class TvltLayer(nn.Module):
"""This corresponds to the Block class in the timm implementation."""
def __init__(self, config):
super().__init__()
# Set up the parameters needed by TvltLayer
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
# Attention block
self.attention = TvltAttention(config)
# Intermediate (feed-forward) block
self.intermediate = TvltIntermediate(config)
# Output block
self.output = TvltOutput(config)
# Layer norm applied before the attention block
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Layer norm applied after the first residual connection
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
# As in ViLT, layernorm is applied before self-attention
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states),
attention_mask,
head_mask,
output_attentions=output_attentions,
)
# Output of the self-attention block
attention_output = self_attention_outputs[0]
# Keep the attention weights if they were requested
outputs = self_attention_outputs[1:]
# First residual connection: add the attention output to the original hidden states
hidden_states = attention_output + hidden_states.to(attention_output.device)
# As in ViLT, layernorm is also applied after the first residual connection
layer_output = self.layernorm_after(hidden_states)
# Feed the normalized output through the intermediate block
layer_output = self.intermediate(layer_output)
# Second residual connection, handled inside TvltOutput
layer_output = self.output(layer_output, hidden_states)
# Return the layer output together with the optional attention weights
outputs = (layer_output,) + outputs
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltEncoder with Vilt->Tvlt
class TvltEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# Build config.num_hidden_layers TvltLayer modules
self.layer = nn.ModuleList([TvltLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
# Collect the hidden states only when output_hidden_states is True
all_hidden_states = () if output_hidden_states else None
# Collect the attention weights only when output_attentions is True
all_self_attentions = () if output_attentions else None
# Iterate over all Transformer layers
for i, layer_module in enumerate(self.layer):
# Record the current hidden states before the layer if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Head mask for the current layer
layer_head_mask = head_mask[i] if head_mask is not None else None
# When gradient checkpointing is enabled and the model is training
if self.gradient_checkpointing and self.training:
# Run the layer through the gradient checkpointing helper
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions,
)
else:
# Regular forward pass through the layer
layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
# The layer output becomes the hidden states for the next layer
hidden_states = layer_outputs[0]
# Record the attention weights of this layer if requested
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# Record the final hidden states if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# When return_dict is False, return a tuple of the non-None values
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# Otherwise wrap the results in a BaseModelOutput
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
# Docstring describing the model inputs and their expected shapes
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
audio_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Audio values. Audio values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
pixel_mask (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`):
Pixel masks. Pixel masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
audio_mask (`torch.FloatTensor` of shape `(batch_size, num_audio_patches)`):
Audio masks. Audio masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values that mix positive and negative samples in Tvlt vision-audio matching. They can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel masks of `pixel_values_mixed`. They can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
mask_pixel (`bool`, *optional*):
Whether to mask pixel values for the MAE task. Only set to True in `TvltForPreTraining`.
mask_audio (`bool`, *optional*):
Whether to mask audio values for the MAE task. Only set to True in `TvltForPreTraining`.
output_attentions (`bool`, *optional*):
Whether or not to return the attention tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
Defines the TvltModel class, inheriting from TvltPreTrainedModel, which implements the core TVLT model.
It is a bare Transformer for TVLT tasks that returns raw hidden states without any task-specific head on top.
@param config: the model configuration object holding all hyperparameters
The constructor sets up the model components:
self.pixel_embeddings = TvltPixelEmbeddings(config)
self.audio_embeddings = TvltAudioEmbeddings(config)
self.encoder = TvltEncoder(config)
self.cls_embedding = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
if config.use_mean_pooling:
self.layernorm = None
else:
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
post_init is then called to initialize the weights and run the final processing.
get_input_embeddings returns the patch embedding modules of the pixel and audio embeddings.
_prune_heads prunes attention heads of the model.
@param heads_to_prune: dict of {layer_num: list of heads to prune in this layer}; see the PreTrainedModel base class
The forward method takes pixel values and audio values (plus optional masks and flags) and returns a TvltModelOutput.
@param pixel_values: input tensor of pixel values
@param audio_values: input tensor of audio values
@param pixel_mask: optional pixel mask tensor
@param audio_mask: optional audio mask tensor
@param mask_pixel: whether to mask the pixel values
@param mask_audio: whether to mask the audio values
@param output_attentions: whether to return the attention weights
@param output_hidden_states: whether to return the hidden states
@param return_dict: whether to return the output as a ModelOutput dictionary
@return: a TvltModelOutput object with the results of the forward pass
Defines the TvltDecoder class (an nn.Module), the decoder part of the TVLT model.
@param config: the model configuration object holding the decoder hyperparameters
The constructor builds the list of decoder layers and the final layer norm:
self.decoder_layers = nn.ModuleList([TvltLayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)])
self.layernorm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)
Gradient checkpointing is set to False and the configuration is stored.
"""
):
# Collect all hidden states only when output_hidden_states is True, otherwise keep None
all_hidden_states = () if output_hidden_states else None
# Collect all self-attention weights only when output_attentions is True, otherwise keep None
all_self_attentions = () if output_attentions else None
# Iterate over the decoder layers
for i, layer_module in enumerate(self.decoder_layers):
# Record the current hidden states before the layer if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Use the gradient checkpointing helper when it is enabled and the model is training
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
None,
output_attentions,
)
else:
# Otherwise call the layer directly
layer_outputs = layer_module(hidden_states, output_attentions=output_attentions)
# The first element of the layer output becomes the input of the next layer
hidden_states = layer_outputs[0]
# Record the attention weights of this layer if requested
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# Record the final hidden states if requested
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Apply the final layer norm to obtain the prediction logits
logits = self.layernorm(hidden_states)
# When return_dict is False, return a tuple of the non-None values
if not return_dict:
return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
# Otherwise wrap the results in a TvltDecoderOutput
return TvltDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)
# Docstring decorator describing the purpose of this class
@add_start_docstrings(
"The TVLT Model transformer with the decoder on top for self-supervised pre-training.",
TVLT_START_DOCSTRING,
)
# TvltForPreTraining inherits from TvltPreTrainedModel
class TvltForPreTraining(TvltPreTrainedModel):
def __init__(self, config):
# Call the parent constructor
super().__init__(config)
# Store the configuration
self.config = config
# Read the matching-task and MAE-task flags from the config
self.task_matching = config.task_matching
self.task_mae = config.task_mae
# Raise a ValueError if neither the matching task nor the MAE task is enabled
if not (self.task_matching or self.task_mae):
raise ValueError("Must set at least one of matching task and MAE task to true")
# Core TVLT model
self.tvlt = TvltModel(config)
# Matching head, only when the matching task is enabled
if self.task_matching:
self.matching_head = TvltMatchingHead(config)
# The following components are only needed for the MAE task
if self.task_mae:
# Linear projection from the encoder hidden size to the decoder hidden size
self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=True)
# Learned mask tokens for pixel and audio patches
self.pixel_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
self.audio_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
# TVLT decoder
self.decoder = TvltDecoder(config)
# Decoder hidden size from the config
decoder_hidden_size = config.decoder_hidden_size
# Decoder position embeddings for the pixel branch, sized from the pixel embeddings of the TVLT model
num_frames = config.num_frames
num_patches_per_image = self.tvlt.pixel_embeddings.num_patches_per_image
self.decoder_pixel_pos_embed = nn.Parameter(torch.zeros(1, num_patches_per_image, decoder_hidden_size))
self.decoder_temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, decoder_hidden_size))
self.decoder_pixel_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
# Decoder position embeddings for the audio branch, sized from the audio embeddings of the TVLT model
num_audio_patches = self.tvlt.audio_embeddings.num_patches
num_freq_patches = config.frequency_length // config.audio_patch_size[1]
self.decoder_audio_pos_embed = nn.Parameter(
torch.zeros(1, num_audio_patches // num_freq_patches, decoder_hidden_size)
)
self.decoder_freq_embed = nn.Parameter(torch.zeros(1, num_freq_patches, decoder_hidden_size))
self.decoder_audio_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
# MAE heads for pixels and audio
pixel_mae_output_dim = self.config.image_patch_size[0] ** 2 * self.config.num_image_channels
self.pixel_mae_head = TvltMAEHead(config, pixel_mae_output_dim)
audio_mae_output_dim = (
self.config.audio_patch_size[0] * self.config.audio_patch_size[1] * self.config.num_audio_channels
)
self.audio_mae_head = TvltMAEHead(config, audio_mae_output_dim)
# Store decoder-related sizes
self.num_frames = num_frames
self.num_patches_per_image = num_patches_per_image
self.num_freq_patches = num_freq_patches
self.image_patch_size = config.image_patch_size
self.audio_patch_size = config.audio_patch_size
# Initialize the weights and run the final processing
self.post_init()
# Split the input pixel values into patches of the configured image patch size
def patchify_pixel(self, pixel_values):
"""
pixel_values: [batch_size, num_frames, 3, height, width]
"""
# Read the dimensions of the input tensor
batch_size, num_frames, num_channels, height, width = pixel_values.shape
# Number of patches along the height and width
num_patches_height = pixel_values.shape[3] // self.image_patch_size[0]
num_patches_width = pixel_values.shape[4] // self.image_patch_size[1]
# Reshape the pixel values into the patch grid
patchified_pixel_values = pixel_values.reshape(
shape=(
batch_size,
num_frames,
num_channels,
num_patches_height,
self.image_patch_size[0],
num_patches_width,
self.image_patch_size[1],
)
)
# Rearrange the dimensions with einsum so each patch becomes contiguous
patchified_pixel_values = torch.einsum("ntchpwq->nthwpqc", patchified_pixel_values)
# Flatten the patch grid into a sequence of patches
patchified_pixel_values = patchified_pixel_values.reshape(
shape=(
batch_size,
num_patches_height * num_patches_width * num_frames,
self.image_patch_size[0] * self.image_patch_size[1] * num_channels,
)
)
return patchified_pixel_values
# Split the input audio values into patches of the configured audio patch size
def patchify_audio(self, audio_values):
"""
audio_values: [batch_size, 1, height, width]
"""
# Read the dimensions of the input tensor
batch_size, num_channels, height, width = audio_values.shape
# Number of patches along the height and width
num_patches_height = height // self.audio_patch_size[0]
num_patches_width = width // self.audio_patch_size[1]
# Reshape the audio values into the patch grid
patchified_audio_values = audio_values.reshape(
shape=(
batch_size,
num_channels,
num_patches_height,
self.audio_patch_size[0],
num_patches_width,
self.audio_patch_size[1],
)
)
# Rearrange the dimensions with einsum so each patch becomes contiguous
patchified_audio_values = torch.einsum("nchpwq->nhwpqc", patchified_audio_values)
# Flatten the patch grid into a sequence of patches
patchified_audio_values = patchified_audio_values.reshape(
shape=(
batch_size,
num_patches_height * num_patches_width,
self.audio_patch_size[0] * self.audio_patch_size[1] * num_channels,
)
)
return patchified_audio_values
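# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Stand-alone version of the `patchify_pixel` rearrangement above, to make the einsum
# explicit (toy sizes; `patchify_audio` works the same way without the frame dimension).
import torch

batch, frames, channels, height, width, p = 2, 8, 3, 224, 224, 16
pixel_values = torch.randn(batch, frames, channels, height, width)

h, w = height // p, width // p
x = pixel_values.reshape(batch, frames, channels, h, p, w, p)
x = torch.einsum("ntchpwq->nthwpqc", x)
patches = x.reshape(batch, frames * h * w, p * p * channels)
print(patches.shape)  # torch.Size([2, 1568, 768])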
# Mean-squared-error loss between the pixel predictions and the target pixels
def pixel_mae_loss(self, pixel_values, pixel_predictions, mask):
# Patchify the target pixel values
patchified_pixel_values = self.patchify_pixel(pixel_values)
# Squared difference between predictions and target patches
loss = (pixel_predictions - patchified_pixel_values) ** 2
# Mean loss per patch
loss = loss.mean(dim=-1) # [batch_size, pixel_patch_length], mean loss per patch
# Mean loss over the removed (masked) patches only
loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
return loss
# Mean-squared-error loss between the audio predictions and the target audio values
def audio_mae_loss(self, audio_values, audio_predictions, mask):
# Patchify the target audio values
patchified_audio_values = self.patchify_audio(audio_values)
# Squared difference between predictions and target patches
loss = (audio_predictions - patchified_audio_values) ** 2
# Mean loss per patch
loss = loss.mean(dim=-1) # [batch_size, audio_patch_length], mean loss per patch
# Mean loss over the removed (masked) patches only
loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
return loss
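# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Minimal numeric example of the masked MSE used by both losses above: the error is
# averaged per patch and then only over the patches that were masked out (mask == 1).
import torch

predictions = torch.tensor([[[1.0, 1.0], [0.0, 0.0], [2.0, 2.0]]])  # (1, 3 patches, 2 dims)
targets = torch.tensor([[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]])
mask = torch.tensor([[1.0, 0.0, 1.0]])                               # patches 0 and 2 were removed

loss = ((predictions - targets) ** 2).mean(dim=-1)  # per-patch loss: [[1.0, 0.0, 4.0]]
loss = (loss * mask).sum() / mask.sum()             # (1.0 + 4.0) / 2 = 2.5
print(loss)  # tensor(2.5000)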
# Append mask tokens to the encoded sequence and restore the original patch order
def concatenate_mask(self, mask_token, sequence, ids_restore):
# Batch size, kept sequence length and hidden dimension
batch_size, seq_length, dim = sequence.shape
# Repeat the mask token so the padded sequence matches the full (restored) length
mask_tokens = mask_token.repeat(batch_size, ids_restore.shape[1] - seq_length, 1)
# Concatenate the mask tokens at the end of the sequence
padded_sequence = torch.cat([sequence, mask_tokens], dim=1)
# Reorder with ids_restore to recover the original patch order
padded_sequence = torch.gather(
padded_sequence, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, dim)
) # unshuffle
# Return the unshuffled sequence
return padded_sequence
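# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# How `concatenate_mask` undoes the shuffling from `random_masking`: the kept patches
# come first, mask tokens are appended, and `ids_restore` puts everything back in the
# original order (hypothetical tensors; in the model the mask token is a learned parameter).
import torch

batch, kept, dim = 1, 2, 4
ids_restore = torch.tensor([[2, 0, 3, 1]])  # from random_masking; 4 patches in total
encoded = torch.randn(batch, kept, dim)     # decoder input for the 2 kept patches
mask_token = torch.zeros(1, 1, dim)

mask_tokens = mask_token.repeat(batch, ids_restore.shape[1] - kept, 1)
padded = torch.cat([encoded, mask_tokens], dim=1)  # (1, 4, 4): kept patches first, then mask tokens
padded = torch.gather(padded, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, dim))
print(padded.shape)  # torch.Size([1, 4, 4]) -- patches are back in their original positions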
# Forward method of the pre-training model; the decorators below attach the input and return docstrings to it
@add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TvltForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
audio_values: torch.FloatTensor,
pixel_mask: Optional[torch.FloatTensor] = None,
audio_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
pixel_values_mixed: Optional[torch.FloatTensor] = None,
pixel_mask_mixed: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# TVLT model with a classification head on top, for audiovisual tasks such as CMU-MOSEI sentiment analysis and audio-to-video retrieval
@add_start_docstrings(
"""
Tvlt Model transformer with a classifier head on top (an MLP on top of the final hidden state of the [CLS] token)
for audiovisual classification tasks, e.g. CMU-MOSEI Sentiment Analysis and Audio to Video Retrieval.
""",
TVLT_START_DOCSTRING,
)
class TvltForAudioVisualClassification(TvltPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Core TVLT Transformer model
self.tvlt = TvltModel(config)
# Classification head
self.classifier = nn.Sequential(
nn.Linear(config.hidden_size, config.hidden_size * 2),  # linear layer that doubles the hidden size
nn.LayerNorm(config.hidden_size * 2, eps=config.layer_norm_eps),  # layer norm
nn.GELU(),  # GELU activation
nn.Linear(config.hidden_size * 2, config.num_labels),  # linear layer projecting to the number of labels
)
self.config = config
# Initialize the weights and run the final processing
self.post_init()
@add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
audio_values: torch.FloatTensor,
pixel_mask: Optional[torch.FloatTensor] = None,
audio_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
) -> Union[Tuple[torch.FloatTensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
Labels for computing the audiovisual loss. Indices should be in `[0, ..., num_classes-1]` where num_classes
refers to the number of classes in audiovisual tasks.
Return:
Examples:
```
>>> from transformers import TvltProcessor, TvltForAudioVisualClassification
>>> import numpy as np
>>> import torch
>>> num_frames = 8
>>> images = list(np.random.randn(num_frames, 3, 224, 224))
>>> audio = list(np.random.randn(10000))
>>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
>>> model = TvltForAudioVisualClassification.from_pretrained("ZinengTang/tvlt-base")
>>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")
>>> outputs = model(**input_dict)
>>> loss = outputs.loss
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # fall back to the config default when return_dict is not given
outputs = self.tvlt(
pixel_values,
audio_values,
pixel_mask=pixel_mask,
audio_mask=audio_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)  # forward pass through the TVLT model
sequence_output = outputs[0][:, 0]  # hidden state of the first ([CLS]) token
logits = self.classifier(sequence_output)  # classification logits from the classifier head
loss = None
if labels is not None:
if self.config.loss_type == "regression":  # regression loss
loss_fct = MSELoss()
loss = loss_fct(logits, labels)  # mean squared error
elif self.config.loss_type == "classification":  # classification loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits, labels)  # cross entropy
if not return_dict:
output = (logits,) + outputs[4:]  # tuple output when return_dict is False
return ((loss,) + output) if loss is not None else output  # prepend the loss when it is available
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)  # SequenceClassifierOutput with loss, logits, hidden states and attentions
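# --- Annotator's note: illustrative sketch, not part of modeling_tvlt.py ---
# Continuing the usage example from the docstring above: passing labels makes the model
# return a loss. The expected label shape depends on `config.loss_type` -- class indices
# of shape (batch_size,) for "classification" (CrossEntropyLoss), float targets of shape
# (batch_size, num_labels) for "regression" (MSELoss). The `model` and `input_dict`
# objects are the ones built in that example.
import torch

labels = torch.tensor([1])  # one class index per sample, assuming loss_type == "classification"
outputs = model(**input_dict, labels=labels)
print(outputs.loss, outputs.logits.shape)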