Transformers 源码解析(一百一十五)
.\models\unispeech_sat\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_torch_available,
)
_import_structure = {
"configuration_unispeech_sat": ["UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "UniSpeechSatConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_unispeech_sat"] = [
"UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST",
"UniSpeechSatForAudioFrameClassification",
"UniSpeechSatForCTC",
"UniSpeechSatForPreTraining",
"UniSpeechSatForSequenceClassification",
"UniSpeechSatForXVector",
"UniSpeechSatModel",
"UniSpeechSatPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_unispeech_sat import (
UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST,
UniSpeechSatForAudioFrameClassification,
UniSpeechSatForCTC,
UniSpeechSatForPreTraining,
UniSpeechSatForSequenceClassification,
UniSpeechSatForXVector,
UniSpeechSatModel,
UniSpeechSatPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\univnet\configuration_univnet.py
""" UnivNetModel 模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"dg845/univnet-dev": "https://huggingface.co/dg845/univnet-dev/resolve/main/config.json",
}
class UnivNetConfig(PretrainedConfig):
r"""
这是用于存储 [`UnivNetModel`] 配置的类。它用于根据指定参数实例化 UnivNet 语音合成模型,定义模型架构。
使用默认值实例化配置会生成类似于 UnivNet [dg845/univnet-dev](https://huggingface.co/dg845/univnet-dev)
架构的配置,对应于 [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/master/config/default_c32.yaml)
中的 'c32' 架构。
配置对象继承自 [`PretrainedConfig`],可用于控制模型输出。有关更多信息,请阅读 [`PretrainedConfig`] 的文档。
示例:
```
>>> from transformers import UnivNetModel, UnivNetConfig
>>> # 初始化 Tortoise TTS 风格的配置
>>> configuration = UnivNetConfig()
>>> # 从 Tortoise TTS 风格的配置初始化一个模型(带有随机权重)
>>> model = UnivNetModel(configuration)
>>> # 访问模型配置
>>> configuration = model.config
```
"""
model_type = "univnet"
def __init__(
self,
model_in_channels=64,
model_hidden_channels=32,
num_mel_bins=100,
resblock_kernel_sizes=[3, 3, 3],
resblock_stride_sizes=[8, 8, 4],
resblock_dilation_sizes=[[1, 3, 9, 27], [1, 3, 9, 27], [1, 3, 9, 27]],
kernel_predictor_num_blocks=3,
kernel_predictor_hidden_channels=64,
kernel_predictor_conv_size=3,
kernel_predictor_dropout=0.0,
initializer_range=0.01,
leaky_relu_slope=0.2,
**kwargs,
):
如果 `resblock_kernel_sizes`、`resblock_stride_sizes`、`resblock_dilation_sizes` 的长度不相等,
抛出 ValueError 异常,提示这三个参数必须具有相同的长度,这个长度也将是模型中 ResNet 块的数量。
self.model_in_channels = model_in_channels
设置模型的输入通道数。
self.model_hidden_channels = model_hidden_channels
设置模型的隐藏通道数。
self.num_mel_bins = num_mel_bins
设置 Mel 频谱的频段数。
self.resblock_kernel_sizes = resblock_kernel_sizes
设置 ResNet 块的卷积核大小列表。
self.resblock_stride_sizes = resblock_stride_sizes
设置 ResNet 块的步幅大小列表。
self.resblock_dilation_sizes = resblock_dilation_sizes
设置 ResNet 块的扩张(dilation)大小列表。
self.kernel_predictor_num_blocks = kernel_predictor_num_blocks
设置核预测器中的块数量。
self.kernel_predictor_hidden_channels = kernel_predictor_hidden_channels
设置核预测器中的隐藏通道数。
self.kernel_predictor_conv_size = kernel_predictor_conv_size
设置核预测器中的卷积大小。
self.kernel_predictor_dropout = kernel_predictor_dropout
设置核预测器中的 dropout 概率。
self.initializer_range = initializer_range
设置模型参数的初始化范围。
self.leaky_relu_slope = leaky_relu_slope
设置 Leaky ReLU 激活函数的斜率。
super().__init__(**kwargs)
调用父类的构造函数,传递可能的关键字参数。
.\models\univnet\convert_univnet.py
import argparse
import torch
from transformers import UnivNetConfig, UnivNetModel, logging
logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.univnet")
def get_kernel_predictor_key_mapping(config: UnivNetConfig, old_prefix: str = "", new_prefix: str = ""):
mapping = {}
mapping[f"{old_prefix}.input_conv.0.weight_g"] = f"{new_prefix}.input_conv.weight_g"
mapping[f"{old_prefix}.input_conv.0.weight_v"] = f"{new_prefix}.input_conv.weight_v"
mapping[f"{old_prefix}.input_conv.0.bias"] = f"{new_prefix}.input_conv.bias"
for i in range(config.kernel_predictor_num_blocks):
mapping[f"{old_prefix}.residual_convs.{i}.1.weight_g"] = f"{new_prefix}.resblocks.{i}.conv1.weight_g"
mapping[f"{old_prefix}.residual_convs.{i}.1.weight_v"] = f"{new_prefix}.resblocks.{i}.conv1.weight_v"
mapping[f"{old_prefix}.residual_convs.{i}.1.bias"] = f"{new_prefix}.resblocks.{i}.conv1.bias"
mapping[f"{old_prefix}.residual_convs.{i}.3.weight_g"] = f"{new_prefix}.resblocks.{i}.conv2.weight_g"
mapping[f"{old_prefix}.residual_convs.{i}.3.weight_v"] = f"{new_prefix}.resblocks.{i}.conv2.weight_v"
mapping[f"{old_prefix}.residual_convs.{i}.3.bias"] = f"{new_prefix}.resblocks.{i}.conv2.bias"
mapping[f"{old_prefix}.kernel_conv.weight_g"] = f"{new_prefix}.kernel_conv.weight_g"
mapping[f"{old_prefix}.kernel_conv.weight_v"] = f"{new_prefix}.kernel_conv.weight_v"
mapping[f"{old_prefix}.kernel_conv.bias"] = f"{new_prefix}.kernel_conv.bias"
mapping[f"{old_prefix}.bias_conv.weight_g"] = f"{new_prefix}.bias_conv.weight_g"
mapping[f"{old_prefix}.bias_conv.weight_v"] = f"{new_prefix}.bias_conv.weight_v"
mapping[f"{old_prefix}.bias_conv.bias"] = f"{new_prefix}.bias_conv.bias"
return mapping
def get_key_mapping(config: UnivNetConfig):
mapping = {}
for i in range(len(config.resblock_stride_sizes)):
mapping[f"res_stack.{i}.convt_pre.1.weight_g"] = f"resblocks.{i}.convt_pre.weight_g"
mapping[f"res_stack.{i}.convt_pre.1.weight_v"] = f"resblocks.{i}.convt_pre.weight_v"
mapping[f"res_stack.{i}.convt_pre.1.bias"] = f"resblocks.{i}.convt_pre.bias"
kernel_predictor_mapping = get_kernel_predictor_key_mapping(
config, old_prefix=f"res_stack.{i}.kernel_predictor", new_prefix=f"resblocks.{i}.kernel_predictor"
)
mapping.update(kernel_predictor_mapping)
for j in range(len(config.resblock_dilation_sizes[i])):
mapping[f"res_stack.{i}.conv_blocks.{j}.1.weight_g"] = f"resblocks.{i}.resblocks.{j}.conv.weight_g"
mapping[f"res_stack.{i}.conv_blocks.{j}.1.weight_v"] = f"resblocks.{i}.resblocks.{j}.conv.weight_v"
mapping[f"res_stack.{i}.conv_blocks.{j}.1.bias"] = f"resblocks.{i}.resblocks.{j}.conv.bias"
mapping["conv_post.1.weight_g"] = "conv_post.weight_g"
mapping["conv_post.1.weight_v"] = "conv_post.weight_v"
mapping["conv_post.1.bias"] = "conv_post.bias"
return mapping
def rename_state_dict(state_dict, keys_to_modify, keys_to_remove):
model_state_dict = {}
for key, value in state_dict.items():
if key in keys_to_remove:
continue
if key in keys_to_modify:
new_key = keys_to_modify[key]
model_state_dict[new_key] = value
else:
model_state_dict[key] = value
return model_state_dict
def convert_univnet_checkpoint(
checkpoint_path,
pytorch_dump_folder_path,
config_path=None,
repo_id=None,
safe_serialization=False,
):
model_state_dict_base = torch.load(checkpoint_path, map_location="cpu")
state_dict = model_state_dict_base["model_g"]
if config_path is not None:
config = UnivNetConfig.from_pretrained(config_path)
else:
config = UnivNetConfig()
keys_to_modify = get_key_mapping(config)
keys_to_remove = set()
hf_state_dict = rename_state_dict(state_dict, keys_to_modify, keys_to_remove)
model = UnivNetModel(config)
model.apply_weight_norm()
model.load_state_dict(hf_state_dict)
model.remove_weight_norm()
model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization)
if repo_id:
print("Pushing to the hub...")
model.push_to_hub(repo_id)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
)
parser.add_argument(
"--safe_serialization", action="store_true", help="Whether to save the model using `safetensors`."
)
args = parser.parse_args()
convert_univnet_checkpoint(
args.checkpoint_path,
args.pytorch_dump_folder_path,
args.config_path,
args.push_to_hub,
args.safe_serialization,
)
if __name__ == "__main__":
main()
.\models\univnet\feature_extraction_univnet.py
from typing import Any, Dict, List, Optional, Union
import numpy as np
from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging
logger = logging.get_logger(__name__)
class UnivNetFeatureExtractor(SequenceFeatureExtractor):
r"""
构建 UnivNet 特征提取器。
此类使用短时傅里叶变换 (STFT) 从原始语音中提取对数梅尔滤波器组特征。
STFT 实现遵循 TacoTron 2 和 Hifi-GAN 的实现方式。
此特征提取器继承自 [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`],
该超类包含大部分主要方法。用户应参考此超类以获取有关这些方法的更多信息。
"""
model_input_names = ["input_features", "noise_sequence", "padding_mask"]
def __init__(
self,
feature_size: int = 1,
sampling_rate: int = 24000,
padding_value: float = 0.0,
do_normalize: bool = False,
num_mel_bins: int = 100,
hop_length: int = 256,
win_length: int = 1024,
win_function: str = "hann_window",
filter_length: Optional[int] = 1024,
max_length_s: int = 10,
fmin: float = 0.0,
fmax: Optional[float] = None,
mel_floor: float = 1e-9,
center: bool = False,
compression_factor: float = 1.0,
compression_clip_val: float = 1e-5,
normalize_min: float = -11.512925148010254,
normalize_max: float = 2.3143386840820312,
model_in_channels: int = 64,
pad_end_length: int = 10,
return_attention_mask=True,
**kwargs,
):
super().__init__()
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
return_attention_mask=return_attention_mask,
**kwargs,
)
self.do_normalize = do_normalize
self.num_mel_bins = num_mel_bins
self.hop_length = hop_length
self.win_length = win_length
self.win_function = win_function
self.filter_length = filter_length
self.fmin = fmin
if fmax is None:
fmax = float(sampling_rate) / 2
self.fmax = fmax
self.mel_floor = mel_floor
self.max_length_s = max_length_s
self.num_max_samples = max_length_s * sampling_rate
if self.filter_length is None:
self.n_fft = optimal_fft_length(self.win_length)
else:
self.n_fft = self.filter_length
self.n_freqs = (self.n_fft // 2) + 1
self.window = window_function(window_length=self.win_length, name=self.win_function, periodic=True)
self.mel_filters = mel_filter_bank(
num_frequency_bins=self.n_freqs,
num_mel_filters=self.num_mel_bins,
min_frequency=self.fmin,
max_frequency=self.fmax,
sampling_rate=self.sampling_rate,
norm="slaney",
mel_scale="slaney",
)
self.center = center
self.compression_factor = compression_factor
self.compression_clip_val = compression_clip_val
self.normalize_min = normalize_min
self.normalize_max = normalize_max
self.model_in_channels = model_in_channels
self.pad_end_length = pad_end_length
def normalize(self, spectrogram):
return 2 * ((spectrogram - self.normalize_min) / (self.normalize_max - self.normalize_min)) - 1
def denormalize(self, spectrogram):
return self.normalize_min + (self.normalize_max - self.normalize_min) * ((spectrogram + 1) / 2)
def mel_spectrogram(self, waveform: np.ndarray) -> np.ndarray:
"""
Calculates log MEL spectrograms from a batch of waveforms. Note that the input waveform(s) will be padded by
`int(self.n_fft - self.hop_length) / 2` on both sides using the `reflect` padding mode.
Args:
waveform (`np.ndarray` of shape `(length,)`):
The input waveform. This must be a single real-valued, mono waveform.
Returns:
`numpy.ndarray`: Array containing a log-mel spectrogram of shape `(num_frames, num_mel_bins)`.
"""
waveform = np.pad(
waveform,
(int((self.n_fft - self.hop_length) / 2), int((self.n_fft - self.hop_length) / 2)),
mode="reflect",
)
complex_spectrogram = spectrogram(
waveform,
window=self.window,
frame_length=self.n_fft,
hop_length=self.hop_length,
fft_length=self.n_fft,
power=None,
center=self.center,
mel_filters=None,
mel_floor=None,
)
amplitude_spectrogram = np.sqrt(
np.real(complex_spectrogram) ** 2 + np.imag(complex_spectrogram) ** 2 + self.mel_floor
)
mel_spectrogram = np.matmul(self.mel_filters.T, amplitude_spectrogram)
log_mel_spectrogram = np.log(
np.clip(mel_spectrogram, a_min=self.compression_clip_val, a_max=None) * self.compression_factor
)
return log_mel_spectrogram.T
def generate_noise(
self,
noise_length: int,
generator: Optional[np.random.Generator] = None,
def noise_sequence(self, noise_length: int, generator: Optional[np.random.Generator] = None) -> np.ndarray:
"""
Generates a random noise sequence of standard Gaussian noise for use in the `noise_sequence` argument of
[`UnivNetModel.forward`].
Args:
noise_length (`int`):
The length of the generated noise sequence.
generator (`numpy.random.Generator`, *optional*, defaults to `None`):
An optional random number generator to control noise generation. If not provided, a new generator
instance will be created using `np.random.default_rng()`.
Returns:
`numpy.ndarray`: Array containing random standard Gaussian noise of shape `(noise_length,
self.model_in_channels)`.
"""
if generator is None:
generator = np.random.default_rng()
noise_shape = (noise_length, self.model_in_channels)
noise = generator.standard_normal(noise_shape, dtype=np.float32)
return noise
def batch_decode(self, waveforms, waveform_lengths=None) -> List[np.ndarray]:
r"""
Removes padding from generated audio after running [`UnivNetModel.forward`]. This returns a ragged list of 1D
audio waveform arrays and not a single tensor/array because in general the waveforms will have different
lengths after removing padding.
Args:
waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
The batched output waveforms from the [`UnivNetModel`].
waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
The batched lengths of each waveform before padding.
Returns:
`List[np.ndarray]`: A ragged list of 1D waveform arrays with padding removed.
"""
waveforms = [waveform.detach().clone().cpu().numpy() for waveform in waveforms]
if waveform_lengths is not None:
waveforms = [waveform[: waveform_lengths[i]] for i, waveform in enumerate(waveforms)]
return waveforms
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
sampling_rate: Optional[int] = None,
padding: Union[bool, str, PaddingStrategy] = True,
max_length: Optional[int] = None,
truncation: bool = True,
pad_to_multiple_of: Optional[int] = None,
return_noise: bool = True,
generator: Optional[np.random.Generator] = None,
pad_end: bool = False,
pad_length: Optional[int] = None,
do_normalize: Optional[str] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
):
output = super().to_dict()
names = ["window", "mel_filters", "n_fft", "n_freqs", "num_max_samples"]
for name in names:
if name in output:
del output[name]
return output
.\models\univnet\modeling_univnet.py
""" PyTorch UnivNetModel model."""
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...modeling_utils import ModelOutput, PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_univnet import UnivNetConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "UnivNetConfig"
_CHECKPOINT_FOR_DOC = "dg845/univnet-dev"
UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"dg845/univnet-dev",
]
@dataclass
class UnivNetModelOutput(ModelOutput):
"""
Output class for the [`UnivNetModel`], which includes the generated audio waveforms and the original unpadded
lengths of those waveforms (so that the padding can be removed by [`UnivNetModel.batch_decode`]).
Args:
waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Batched 1D (mono-channel) output audio waveforms.
waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
The batched length in samples of each unpadded waveform in `waveforms`.
"""
waveforms: torch.FloatTensor = None
waveform_lengths: torch.FloatTensor = None
class UnivNetKernelPredictorResidualBlock(nn.Module):
"""
Implementation of the residual block for the kernel predictor network inside each location variable convolution
block (LVCBlock).
Parameters:
config: (`UnivNetConfig`):
Config for the `UnivNetModel` model.
"""
def __init__(
self,
config: UnivNetConfig,
):
super().__init__()
self.channels = config.model_in_channels
self.kernel_size = config.kernel_predictor_conv_size
self.dropout_prob = config.kernel_predictor_dropout
self.leaky_relu_slope = config.leaky_relu_slope
padding = (self.kernel_size - 1) // 2
self.dropout = nn.Dropout(self.dropout_prob)
self.conv1 = nn.Conv1d(self.channels, self.channels, self.kernel_size, padding=padding, bias=True)
self.conv2 = nn.Conv1d(self.channels, self.channels, self.kernel_size, padding=padding, bias=True)
def forward(self, hidden_states: torch.FloatTensor):
residual = hidden_states
hidden_states = self.dropout(hidden_states)
hidden_states = self.conv1(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = self.conv2(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
return hidden_states + residual
def apply_weight_norm(self):
nn.utils.weight_norm(self.conv1)
nn.utils.weight_norm(self.conv2)
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.conv1)
nn.utils.remove_weight_norm(self.conv2)
class UnivNetKernelPredictor(nn.Module):
"""
Implementation of the kernel predictor network which supplies the kernel and bias for the location variable
convolutional layers (LVCs) in each UnivNet LVCBlock.
Based on the KernelPredictor implementation in
[maum-ai/univnet](https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/model/lvcnet.py#L7).
Parameters:
config: (`UnivNetConfig`):
Config for the `UnivNetModel` model.
conv_kernel_size (`int`, *optional*, defaults to 3):
The kernel size for the location variable convolutional layer kernels (convolutional weight tensor).
conv_layers (`int`, *optional*, defaults to 4):
The number of location variable convolutional layers to output kernels and biases for.
"""
def __init__(
self,
config: UnivNetConfig,
conv_kernel_size: int = 3,
conv_layers: int = 4,
):
super().__init__()
self.conv_in_channels = config.model_hidden_channels
self.conv_out_channels = 2 * config.model_hidden_channels
self.conv_kernel_size = conv_kernel_size
self.conv_layers = conv_layers
self.kernel_channels = (
self.conv_in_channels * self.conv_out_channels * self.conv_kernel_size * self.conv_layers
)
self.bias_channels = self.conv_out_channels * self.conv_layers
self.resnet_in_channels = config.num_mel_bins
self.resnet_hidden_channels = config.kernel_predictor_hidden_channels
self.resnet_kernel_size = config.kernel_predictor_conv_size
self.num_blocks = config.kernel_predictor_num_blocks
self.leaky_relu_slope = config.leaky_relu_slope
padding = (self.resnet_kernel_size - 1) // 2
self.input_conv = nn.Conv1d(self.resnet_in_channels, self.resnet_hidden_channels, 5, padding=2, bias=True)
self.resblocks = nn.ModuleList([UnivNetKernelPredictorResidualBlock(config) for _ in range(self.num_blocks)])
self.kernel_conv = nn.Conv1d(
self.resnet_hidden_channels, self.kernel_channels, self.resnet_kernel_size, padding=padding, bias=True
)
self.bias_conv = nn.Conv1d(
self.resnet_hidden_channels, self.bias_channels, self.resnet_kernel_size, padding=padding, bias=True
)
def forward(self, spectrogram: torch.FloatTensor):
"""
将一个条件化的对数梅尔频谱映射到卷积核和偏置的张量,用于位置变量卷积层。注意输入的频谱应具有形状 (batch_size, input_channels, seq_length)。
Args:
spectrogram (`torch.FloatTensor` of shape `(batch_size, input_channels, seq_length)`):
包含对数梅尔频谱的张量。
Returns:
Tuple[`torch.FloatTensor, `torch.FloatTensor`]: 一个元组,第一个元素是形状为 `(batch_size, self.conv_layers, self.conv_in_channels,
self.conv_out_channels, self.conv_kernel_size, seq_length)` 的位置变量卷积核张量,第二个元素是形状为 `(batch_size, self.conv_layers, self.conv_out_channels,
seq_length)` 的位置变量卷积偏置张量。
"""
batch_size, _, seq_length = spectrogram.shape
hidden_states = self.input_conv(spectrogram)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
for resblock in self.resblocks:
hidden_states = resblock(hidden_states)
kernel_hidden_states = self.kernel_conv(hidden_states)
bias_hidden_states = self.bias_conv(hidden_states)
kernels = kernel_hidden_states.view(
batch_size,
self.conv_layers,
self.conv_in_channels,
self.conv_out_channels,
self.conv_kernel_size,
seq_length,
).contiguous()
biases = bias_hidden_states.view(
batch_size,
self.conv_layers,
self.conv_out_channels,
seq_length,
).contiguous()
return kernels, biases
def apply_weight_norm(self):
nn.utils.weight_norm(self.input_conv)
for layer in self.resblocks:
layer.apply_weight_norm()
nn.utils.weight_norm(self.kernel_conv)
nn.utils.weight_norm(self.bias_conv)
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.input_conv)
for layer in self.resblocks:
layer.remove_weight_norm()
nn.utils.remove_weight_norm(self.kernel_conv)
nn.utils.remove_weight_norm(self.bias_conv)
class UnivNetLvcResidualBlock(nn.Module):
"""
Implementation of the location variable convolution (LVC) residual block for the UnivNet residual network.
Parameters:
config: (`UnivNetConfig`):
Config for the `UnivNetModel` model.
kernel_size (`int`):
The kernel size for the dilated 1D convolutional layer.
dilation (`int`):
The dilation for the dilated 1D convolutional layer.
"""
def __init__(
self,
config: UnivNetConfig,
kernel_size: int,
dilation: int,
):
super().__init__()
self.hidden_channels = config.model_hidden_channels
self.kernel_size = kernel_size
self.dilation = dilation
self.leaky_relu_slope = config.leaky_relu_slope
padding = self.dilation * (self.kernel_size - 1) // 2
self.conv = nn.Conv1d(
self.hidden_channels,
self.hidden_channels,
self.kernel_size,
padding=padding,
dilation=self.dilation,
)
def forward(self, hidden_states, kernel, bias, hop_size=256):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = self.conv(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = self.location_variable_convolution(hidden_states, kernel, bias, hop_size=hop_size)
hidden_states = torch.sigmoid(hidden_states[:, : self.hidden_channels, :]) * torch.tanh(
hidden_states[:, self.hidden_channels :, :]
)
hidden_states = residual + hidden_states
return hidden_states
def apply_weight_norm(self):
nn.utils.weight_norm(self.conv)
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.conv)
Parameters:
config (`UnivNetConfig`):
`UnivNetModel`模型的配置。
layer_id (`int`):
当前LVC ResNet块层的索引,应在0到`len(config.resblock_stride_sizes) - 1`之间(包括边界)。
lvc_hop_size (`int`, *可选*, 默认为256):
位置变量卷积层的跳跃步长。
"""
def __init__(
self,
config: UnivNetConfig,
layer_id: int,
lvc_hop_size: int = 256,
):
super().__init__()
self.hidden_channels = config.model_hidden_channels # 设置隐藏通道数
self.kernel_size = config.resblock_kernel_sizes[layer_id] # 根据层索引获取内核大小
self.stride = config.resblock_stride_sizes[layer_id] # 根据层索引获取步幅大小
self.dilations = config.resblock_dilation_sizes[layer_id] # 根据层索引获取扩张率列表
self.cond_hop_length = lvc_hop_size # 设置条件跳跃长度
self.leaky_relu_slope = config.leaky_relu_slope # 设置LeakyReLU的斜率
self.num_blocks = len(self.dilations) # 获取块的数量
self.convt_pre = nn.ConvTranspose1d(
self.hidden_channels, # 输入通道数
self.hidden_channels, # 输出通道数
2 * self.stride, # 内核大小
stride=self.stride, # 步幅大小
padding=self.stride // 2 + self.stride % 2, # 填充大小
output_padding=self.stride % 2, # 输出填充大小
)
self.kernel_predictor = UnivNetKernelPredictor(config, self.kernel_size, self.num_blocks) # 初始化内核预测器
self.resblocks = nn.ModuleList(
[UnivNetLvcResidualBlock(config, self.kernel_size, self.dilations[i]) for i in range(self.num_blocks)]
) # 创建LVC残差块列表
def forward(self, hidden_states: torch.FloatTensor, spectrogram: torch.FloatTensor):
# hidden_states: (batch_size, hidden_channels, seq_length)
# spectrogram: (batch_size, cond_channels, cond_length)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) # 应用LeakyReLU激活函数
hidden_states = self.convt_pre(hidden_states) # 执行转置卷积预处理
kernels, biases = self.kernel_predictor(spectrogram) # 从谱图预测内核和偏置
for i, resblock in enumerate(self.resblocks):
kernel = kernels[:, i, :, :, :, :] # 获取当前块的内核
bias = biases[:, i, :, :] # 获取当前块的偏置
hidden_states = resblock(hidden_states, kernel, bias, hop_size=self.cond_hop_length) # 执行残差块操作
return hidden_states # 返回处理后的隐藏状态
def apply_weight_norm(self):
nn.utils.weight_norm(self.convt_pre) # 应用权重归一化到转置卷积层
self.kernel_predictor.apply_weight_norm() # 应用权重归一化到内核预测器
for layer in self.resblocks:
layer.apply_weight_norm() # 依次应用权重归一化到每个残差块
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.convt_pre) # 移除转置卷积层的权重归一化
self.kernel_predictor.remove_weight_norm() # 移除内核预测器的权重归一化
for layer in self.resblocks:
layer.remove_weight_norm() # 依次移除每个残差块的权重归一化
# 包含关于 UnivNetModel 类的开始文档字符串,描述了该类的继承和基本使用方法
UNIVNET_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`UnivNetConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# 包含关于 UnivNetModel 类的输入文档字符串,描述了输入参数及其形状的详细信息
UNIVNET_INPUTS_DOCSTRING = r"""
Converts a noise waveform and a conditioning spectrogram to a speech waveform. Passing a batch of log-mel
spectrograms returns a batch of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a
single, un-batched speech waveform.
Args:
input_features (`torch.FloatTensor`):
Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
config.num_mel_channels)`, or un-batched and of shape `(sequence_length, config.num_mel_channels)`.
noise_sequence (`torch.FloatTensor`, *optional*):
Tensor containing a noise sequence of standard Gaussian noise. Can be batched and of shape `(batch_size,
sequence_length, config.model_in_channels)`, or un-batched and of shape (sequence_length,
config.model_in_channels)`. If not supplied, will be randomly generated.
padding_mask (`torch.BoolTensor`, *optional*):
Mask indicating which parts of each sequence are padded. Mask values are selected in `[0, 1]`:
- 1 for tokens that are **not masked**
- 0 for tokens that are **masked**
The mask can be batched and of shape `(batch_size, sequence_length)` or un-batched and of shape
`(sequence_length,)`.
generator (`torch.Generator`, *optional*):
A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
deterministic.
return_dict:
Whether to return a [`~utils.ModelOutput`] subclass instead of a plain tuple.
"""
# 使用 @add_start_docstrings 装饰器添加关于 UnivNetModel 类的简要描述和详细文档字符串
@add_start_docstrings(
"""UnivNet GAN vocoder.""",
UNIVNET_START_DOCSTRING,
)
# 定义 UnivNetModel 类,继承自 PreTrainedModel,表示一个 UnivNet GAN 声码器模型
class UnivNetModel(PreTrainedModel):
# 指定该模型的配置类为 UnivNetConfig
config_class = UnivNetConfig
# 指定主要输入的名称为 "input_features"
main_input_name = "input_features"
def __init__(self, config: UnivNetConfig):
super().__init__(config)
self.num_kernels = len(config.resblock_kernel_sizes) # 计算 ResNet 块的内核数目
self.leaky_relu_slope = config.leaky_relu_slope # 从配置中获取 Leaky ReLU 的斜率
self.conv_pre = nn.Conv1d(
config.model_in_channels,
config.model_hidden_channels,
kernel_size=7,
stride=1,
padding=3,
padding_mode="reflect",
)
# 创建预处理卷积层,用于输入数据的初始处理
# Initialize location-variable convolution ResNet Blocks.
num_layers = len(config.resblock_stride_sizes) # 获取 ResNet 块的层数
hop_length = 1
hop_lengths = []
for stride in config.resblock_stride_sizes:
hop_length = hop_length * stride
hop_lengths.append(hop_length)
# 计算每个 ResNet 块的跳跃长度,并存储在列表中
self.resblocks = nn.ModuleList(
[
UnivNetLvcBlock(
config,
layer_id=i,
lvc_hop_size=hop_lengths[i],
)
for i in range(num_layers)
]
)
# 创建 ResNet 块的列表,每个块都使用不同的位置变量卷积设置
self.conv_post = nn.Conv1d(config.model_hidden_channels, 1, 7, padding=3, padding_mode="reflect")
# 创建后处理卷积层,用于最终输出的处理
# Initialize weights and apply final processing
self.post_init()
# 调用初始化方法,用于权重初始化和最终处理的应用
@add_start_docstrings_to_model_forward(UNIVNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=UnivNetModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_features: torch.FloatTensor,
noise_sequence: Optional[torch.FloatTensor] = None,
padding_mask: Optional[torch.FloatTensor] = None,
generator: Optional[torch.Generator] = None,
return_dict: Optional[bool] = None,
):
# 正向传播方法,详细文档说明见装饰器函数
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
# 初始化模块的权重,适用于线性层、卷积层和转置卷积层
def apply_weight_norm(self):
nn.utils.weight_norm(self.conv_pre)
for layer in self.resblocks:
layer.apply_weight_norm()
nn.utils.weight_norm(self.conv_post)
# 应用权重归一化到预处理卷积层、ResNet 块和后处理卷积层
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.conv_pre)
for layer in self.resblocks:
layer.remove_weight_norm()
nn.utils.remove_weight_norm(self.conv_post)
# 移除预处理卷积层、ResNet 块和后处理卷积层的权重归一化
.\models\univnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_univnet": [
"UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
"UnivNetConfig",
],
"feature_extraction_univnet": ["UnivNetFeatureExtractor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_univnet"] = [
"UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"UnivNetModel",
]
if TYPE_CHECKING:
from .configuration_univnet import (
UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
UnivNetConfig,
)
from .feature_extraction_univnet import UnivNetFeatureExtractor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_univnet import (
UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST,
UnivNetModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\upernet\configuration_upernet.py
""" UperNet 模型配置"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
class UperNetConfig(PretrainedConfig):
r"""
这是用于存储 [`UperNetForSemanticSegmentation`] 配置的类。它用于根据指定的参数实例化 UperNet 模型,
定义模型的架构。使用默认值实例化配置会产生类似于 UperNet
[openmmlab/upernet-convnext-tiny](https://huggingface.co/openmmlab/upernet-convnext-tiny) 架构的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。阅读
[`PretrainedConfig`] 的文档获取更多信息。
"""
model_type = "upernet"
def __init__(
self,
backbone_config=None,
backbone=None,
use_pretrained_backbone=False,
use_timm_backbone=False,
backbone_kwargs=None,
hidden_size=512,
initializer_range=0.02,
pool_scales=[1, 2, 3, 6],
use_auxiliary_head=True,
auxiliary_loss_weight=0.4,
auxiliary_in_channels=384,
auxiliary_channels=256,
auxiliary_num_convs=1,
auxiliary_concat_input=False,
loss_ignore_index=255,
**kwargs,
):
super().__init__(**kwargs)
if use_pretrained_backbone:
raise ValueError("Pretrained backbones are not supported yet.")
if backbone_config is not None and backbone is not None:
raise ValueError("You can't specify both `backbone` and `backbone_config`.")
if backbone_config is None and backbone is None:
logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage1", "stage2", "stage3", "stage4"])
elif isinstance(backbone_config, dict):
backbone_model_type = backbone_config.get("model_type")
config_class = CONFIG_MAPPING[backbone_model_type]
backbone_config = config_class.from_dict(backbone_config)
if backbone_kwargs is not None and backbone_config is not None:
raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")
self.backbone_config = backbone_config
self.backbone = backbone
self.use_pretrained_backbone = use_pretrained_backbone
self.use_timm_backbone = use_timm_backbone
self.backbone_kwargs = backbone_kwargs
self.hidden_size = hidden_size
self.initializer_range = initializer_range
self.pool_scales = pool_scales
self.use_auxiliary_head = use_auxiliary_head
self.auxiliary_loss_weight = auxiliary_loss_weight
self.auxiliary_in_channels = auxiliary_in_channels
self.auxiliary_channels = auxiliary_channels
self.auxiliary_num_convs = auxiliary_num_convs
self.auxiliary_concat_input = auxiliary_concat_input
self.loss_ignore_index = loss_ignore_index
.\models\upernet\convert_convnext_upernet_to_pytorch.py
"""Convert ConvNext + UperNet checkpoints from mmsegmentation."""
import argparse
import json
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import ConvNextConfig, SegformerImageProcessor, UperNetConfig, UperNetForSemanticSegmentation
def get_upernet_config(model_name):
auxiliary_in_channels = 384
if "tiny" in model_name:
depths = [3, 3, 9, 3]
hidden_sizes = [96, 192, 384, 768]
if "small" in model_name:
depths = [3, 3, 27, 3]
hidden_sizes = [96, 192, 384, 768]
if "base" in model_name:
depths = [3, 3, 27, 3]
hidden_sizes = [128, 256, 512, 1024]
auxiliary_in_channels = 512
if "large" in model_name:
depths = [3, 3, 27, 3]
hidden_sizes = [192, 384, 768, 1536]
auxiliary_in_channels = 768
if "xlarge" in model_name:
depths = [3, 3, 27, 3]
hidden_sizes = [256, 512, 1024, 2048]
auxiliary_in_channels = 1024
num_labels = 150
repo_id = "huggingface/label-files"
filename = "ade20k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
backbone_config = ConvNextConfig(
depths=depths, hidden_sizes=hidden_sizes, out_features=["stage1", "stage2", "stage3", "stage4"]
)
config = UperNetConfig(
backbone_config=backbone_config,
auxiliary_in_channels=auxiliary_in_channels,
num_labels=num_labels,
id2label=id2label,
label2id=label2id,
)
return config
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("backbone.downsample_layers.0.0.weight", "backbone.embeddings.patch_embeddings.weight"))
rename_keys.append(("backbone.downsample_layers.0.0.bias", "backbone.embeddings.patch_embeddings.bias"))
rename_keys.append(("backbone.downsample_layers.0.1.weight", "backbone.embeddings.layernorm.weight"))
rename_keys.append(("backbone.downsample_layers.0.1.bias", "backbone.embeddings.layernorm.bias"))
for i in range(len(config.backbone_config.depths)):
for j in range(config.backbone_config.depths[i]):
rename_keys.append((f"backbone.stages.{i}.{j}.gamma", f"backbone.encoder.stages.{i}.layers.{j}.layer_scale_parameter"))
rename_keys.append((f"backbone.stages.{i}.{j}.depthwise_conv.weight", f"backbone.encoder.stages.{i}.layers.{j}.dwconv.weight"))
rename_keys.append((f"backbone.stages.{i}.{j}.depthwise_conv.bias", f"backbone.encoder.stages.{i}.layers.{j}.dwconv.bias"))
rename_keys.append((f"backbone.stages.{i}.{j}.norm.weight", f"backbone.encoder.stages.{i}.layers.{j}.layernorm.weight"))
rename_keys.append((f"backbone.stages.{i}.{j}.norm.bias", f"backbone.encoder.stages.{i}.layers.{j}.layernorm.bias"))
rename_keys.append((f"backbone.stages.{i}.{j}.pointwise_conv1.weight", f"backbone.encoder.stages.{i}.layers.{j}.pwconv1.weight"))
rename_keys.append((f"backbone.stages.{i}.{j}.pointwise_conv1.bias", f"backbone.encoder.stages.{i}.layers.{j}.pwconv1.bias"))
rename_keys.append((f"backbone.stages.{i}.{j}.pointwise_conv2.weight", f"backbone.encoder.stages.{i}.layers.{j}.pwconv2.weight"))
rename_keys.append((f"backbone.stages.{i}.{j}.pointwise_conv2.bias", f"backbone.encoder.stages.{i}.layers.{j}.pwconv2.bias"))
if i > 0:
rename_keys.append((f"backbone.downsample_layers.{i}.0.weight", f"backbone.encoder.stages.{i}.downsampling_layer.0.weight"))
rename_keys.append((f"backbone.downsample_layers.{i}.0.bias", f"backbone.encoder.stages.{i}.downsampling_layer.0.bias"))
rename_keys.append((f"backbone.downsample_layers.{i}.1.weight", f"backbone.encoder.stages.{i}.downsampling_layer.1.weight"))
rename_keys.append((f"backbone.downsample_layers.{i}.1.bias", f"backbone.encoder.stages.{i}.downsampling_layer.1.bias"))
rename_keys.append((f"backbone.norm{i}.weight", f"backbone.hidden_states_norms.stage{i+1}.weight"))
rename_keys.append((f"backbone.norm{i}.bias", f"backbone.hidden_states_norms.stage{i+1}.bias"))
rename_keys.extend(
[
("decode_head.conv_seg.weight", "decode_head.classifier.weight"),
("decode_head.conv_seg.bias", "decode_head.classifier.bias"),
("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"),
("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"),
]
)
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def convert_upernet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
model_name_to_url = {
"upernet-convnext-tiny": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k/upernet_convnext_tiny_fp16_512x512_160k_ade20k_20220227_124553-cad485de.pth",
"upernet-convnext-small": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k/upernet_convnext_small_fp16_512x512_160k_ade20k_20220227_131208-1b1e394f.pth",
"upernet-convnext-base": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k/upernet_convnext_base_fp16_512x512_160k_ade20k_20220227_181227-02a24fc6.pth",
"upernet-convnext-large": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k/upernet_convnext_large_fp16_640x640_160k_ade20k_20220226_040532-e57aa54d.pth",
"upernet-convnext-xlarge": "https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k/upernet_convnext_xlarge_fp16_640x640_160k_ade20k_20220226_080344-95fc38c2.pth",
}
checkpoint_url = model_name_to_url[model_name]
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"]
config = get_upernet_config(model_name)
model = UperNetForSemanticSegmentation(config)
model.eval()
for key in state_dict.copy().keys():
val = state_dict.pop(key)
if "bn" in key:
key = key.replace("bn", "batch_norm")
state_dict[key] = val
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
model.load_state_dict(state_dict)
url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_ade20k/resolve/main/ADE_val_00000001.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
processor = SegformerImageProcessor()
pixel_values = processor(image, return_tensors="pt").pixel_values
with torch.no_grad():
outputs = model(pixel_values)
if model_name == "upernet-convnext-tiny":
expected_slice = torch.tensor(
[[-8.8110, -8.8110, -8.6521], [-8.8110, -8.8110, -8.6521], [-8.7746, -8.7746, -8.6130]]
)
elif model_name == "upernet-convnext-small":
expected_slice = torch.tensor(
[[-8.8236, -8.8236, -8.6771], [-8.8236, -8.8236, -8.6771], [-8.7638, -8.7638, -8.6240]]
)
elif model_name == "upernet-convnext-base":
expected_slice = torch.tensor(
[[-8.8558, -8.8558, -8.6905], [-8.8558, -8.8558, -8.6905], [-8.7669, -8.7669, -8.6021]]
)
elif model_name == "upernet-convnext-large":
expected_slice = torch.tensor(
[[-8.6660, -8.6660, -8.6210], [-8.6660, -8.6660, -8.6210], [-8.6310, -8.6310, -8.5964]]
)
elif model_name == "upernet-convnext-xlarge":
expected_slice = torch.tensor(
[[-8.4980, -8.4980, -8.3977], [-8.4980, -8.4980, -8.3977], [-8.4379, -8.4379, -8.3412]]
)
print("Logits:", outputs.logits[0, 0, :3, :3])
assert torch.allclose(outputs.logits[0, 0, :3, :3], expected_slice, atol=1e-4)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving processor to {pytorch_dump_folder_path}")
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing model and processor for {model_name} to hub")
model.push_to_hub(f"openmmlab/{model_name}")
processor.push_to_hub(f"openmmlab/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="upernet-convnext-tiny",
type=str,
choices=[f"upernet-convnext-{size}" for size in ["tiny", "small", "base", "large", "xlarge"]],
help="Name of the ConvNext UperNet model you'd like to convert."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_upernet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\upernet\convert_swin_upernet_to_pytorch.py
"""Convert Swin Transformer + UperNet checkpoints from mmsegmentation.
从mmsegmentation转换Swin Transformer + UperNet检查点。
URL: https://github.com/open-mmlab/mmsegmentation/tree/master/configs/swin
"""
import argparse
import json
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import SegformerImageProcessor, SwinConfig, UperNetConfig, UperNetForSemanticSegmentation
def get_upernet_config(model_name):
auxiliary_in_channels = 384
window_size = 7
if "tiny" in model_name:
embed_dim = 96
depths = (2, 2, 6, 2)
num_heads = (3, 6, 12, 24)
elif "small" in model_name:
embed_dim = 96
depths = (2, 2, 18, 2)
num_heads = (3, 6, 12, 24)
elif "base" in model_name:
embed_dim = 128
depths = (2, 2, 18, 2)
num_heads = (4, 8, 16, 32)
window_size = 12
auxiliary_in_channels = 512
elif "large" in model_name:
embed_dim = 192
depths = (2, 2, 18, 2)
num_heads = (6, 12, 24, 48)
window_size = 12
auxiliary_in_channels = 768
num_labels = 150
repo_id = "huggingface/label-files"
filename = "ade20k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
backbone_config = SwinConfig(
embed_dim=embed_dim,
depths=depths,
num_heads=num_heads,
window_size=window_size,
out_features=["stage1", "stage2", "stage3", "stage4"],
)
config = UperNetConfig(
backbone_config=backbone_config,
auxiliary_in_channels=auxiliary_in_channels,
num_labels=num_labels,
id2label=id2label,
label2id=label2id,
)
return config
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("backbone.patch_embed.projection.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("backbone.patch_embed.projection.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
rename_keys.append(("backbone.patch_embed.norm.weight", "backbone.embeddings.norm.weight"))
rename_keys.append(("backbone.patch_embed.norm.bias", "backbone.embeddings.norm.bias"))
for i in range(len(config.backbone_config.depths)):
for j in range(config.backbone_config.depths[i]):
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.norm1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.weight"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.norm1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.bias"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.attn.w_msa.relative_position_bias_table", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.attn.w_msa.relative_position_index", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.attn.w_msa.proj.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.attn.w_msa.proj.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.norm2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.weight"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.norm2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.bias"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.ffn.layers.0.0.weight", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.ffn.layers.0.0.bias", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.ffn.layers.1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.weight"))
rename_keys.append((f"backbone.stages.{i}.blocks.{j}.ffn.layers.1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.bias"))
if i < 3:
rename_keys.append((f"backbone.stages.{i}.downsample.reduction.weight", f"backbone.encoder.layers.{i}.downsample.reduction.weight"))
rename_keys.append((f"backbone.stages.{i}.downsample.norm.weight", f"backbone.encoder.layers.{i}.downsample.norm.weight"))
rename_keys.append((f"backbone.stages.{i}.downsample.norm.bias", f"backbone.encoder.layers.{i}.downsample.norm.bias"))
rename_keys.append((f"backbone.norm{i}.weight", f"backbone.hidden_states_norms.stage{i+1}.weight"))
rename_keys.append((f"backbone.norm{i}.bias", f"backbone.hidden_states_norms.stage{i+1}.bias"))
rename_keys.extend(
[
("decode_head.conv_seg.weight", "decode_head.classifier.weight"),
("decode_head.conv_seg.bias", "decode_head.classifier.bias"),
("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"),
("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"),
]
)
return rename_keys
rename_keys.extend(
[
("decode_head.conv_seg.weight", "decode_head.classifier.weight"),
("decode_head.conv_seg.bias", "decode_head.classifier.bias"),
("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"),
("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"),
]
)
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_q_k_v(state_dict, backbone_config):
num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))]
for i in range(len(backbone_config.depths)):
dim = num_features[i]
for j in range(backbone_config.depths[i]):
in_proj_weight = state_dict.pop(f"backbone.stages.{i}.blocks.{j}.attn.w_msa.qkv.weight")
in_proj_bias = state_dict.pop(f"backbone.stages.{i}.blocks.{j}.attn.w_msa.qkv.bias")
state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :]
state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim]
state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[
dim : dim * 2, :
]
state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[
dim : dim * 2
]
state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[
-dim :, :
]
state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :]
def correct_unfold_reduction_order(x):
out_channel, in_channel = x.shape
x = x.reshape(out_channel, 4, in_channel // 4)
x = x[:, [0, 2, 1, 3], :].transpose(1, 2).reshape(out_channel, in_channel)
return x
def reverse_correct_unfold_reduction_order(x):
out_channel, in_channel = x.shape
x = x.reshape(out_channel, in_channel // 4, 4)
x = x[:, :, [0, 2, 1, 3]].transpose(1, 2).reshape(out_channel, in_channel)
return x
def correct_unfold_norm_order(x):
in_channel = x.shape[0]
x = x.reshape(4, in_channel // 4)
x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
return x
def reverse_correct_unfold_norm_order(x):
in_channel = x.shape[0]
x = x.reshape(in_channel // 4, 4)
x = x[:, [0, 2, 1, 3]].transpose(0, 1).reshape(in_channel)
return x
def convert_upernet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
pass
model_name_to_url = {
"upernet-swin-tiny": "https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542-e380ad3e.pth",
"upernet-swin-small": "https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192015-ee2fff1c.pth",
"upernet-swin-base": "https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K_20210531_125459-429057bf.pth",
"upernet-swin-large": "https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_large_patch4_window12_512x512_pretrain_384x384_22K_160k_ade20k/upernet_swin_large_patch4_window12_512x512_pretrain_384x384_22K_160k_ade20k_20220318_091743-9ba68901.pth",
}
checkpoint_url = model_name_to_url[model_name]
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", file_name=model_name)["state_dict"]
for name, param in state_dict.items():
print(name, param.shape)
config = get_upernet_config(model_name)
model = UperNetForSemanticSegmentation(config)
model.eval()
for key in state_dict.copy().keys():
val = state_dict.pop(key)
if "bn" in key:
key = key.replace("bn", "batch_norm")
state_dict[key] = val
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, config.backbone_config)
for key, value in state_dict.items():
if "downsample" in key:
if "reduction" in key:
state_dict[key] = reverse_correct_unfold_reduction_order(value)
if "norm" in key:
state_dict[key] = reverse_correct_unfold_norm_order(value)
model.load_state_dict(state_dict)
url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_ade20k/resolve/main/ADE_val_00000001.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
processor = SegformerImageProcessor()
pixel_values = processor(image, return_tensors="pt").pixel_values
with torch.no_grad():
outputs = model(pixel_values)
logits = outputs.logits
print(logits.shape)
print("First values of logits:", logits[0, 0, :3, :3])
if model_name == "upernet-swin-tiny":
expected_slice = torch.tensor(
[[-7.5958, -7.5958, -7.4302], [-7.5958, -7.5958, -7.4302], [-7.4797, -7.4797, -7.3068]]
)
elif model_name == "upernet-swin-small":
expected_slice = torch.tensor(
[[-7.1921, -7.1921, -6.9532], [-7.1921, -7.1921, -6.9532], [-7.0908, -7.0908, -6.8534]]
)
elif model_name == "upernet-swin-base":
expected_slice = torch.tensor(
[[-6.5851, -6.5851, -6.4330], [-6.5851, -6.5851, -6.4330], [-6.4763, -6.4763, -6.3254]]
)
elif model_name == "upernet-swin-large":
expected_slice = torch.tensor(
[[-7.5297, -7.5297, -7.3802], [-7.5297, -7.5297, -7.3802], [-7.4044, -7.4044, -7.2586]]
)
print("Logits:", outputs.logits[0, 0, :3, :3])
assert torch.allclose(outputs.logits[0, 0, :3, :3], expected_slice, atol=1e-4)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving processor to {pytorch_dump_folder_path}")
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing model and processor for {model_name} to hub")
model.push_to_hub(f"openmmlab/{model_name}")
processor.push_to_hub(f"openmmlab/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="upernet-swin-tiny",
type=str,
choices=[f"upernet-swin-{size}" for size in ["tiny", "small", "base", "large"]],
help="Name of the Swin + UperNet model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_upernet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\upernet\modeling_upernet.py
""" PyTorch UperNet model. Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation."""
from typing import List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from ...modeling_outputs import SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...utils.backbone_utils import load_backbone
from .configuration_upernet import UperNetConfig
UPERNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openmmlab/upernet-convnext-tiny",
]
_CONFIG_FOR_DOC = "UperNetConfig"
class UperNetConvModule(nn.Module):
"""
A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, int]],
padding: Union[int, Tuple[int, int], str] = 0,
bias: bool = False,
dilation: Union[int, Tuple[int, int]] = 1,
) -> None:
super().__init__()
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
padding=padding,
bias=bias,
dilation=dilation,
)
self.batch_norm = nn.BatchNorm2d(out_channels)
self.activation = nn.ReLU()
def forward(self, input: torch.Tensor) -> torch.Tensor:
output = self.conv(input)
output = self.batch_norm(output)
output = self.activation(output)
return output
class UperNetPyramidPoolingBlock(nn.Module):
def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
super().__init__()
self.layers = [
nn.AdaptiveAvgPool2d(pool_scale),
UperNetConvModule(in_channels, channels, kernel_size=1),
]
for i, layer in enumerate(self.layers):
self.add_module(str(i), layer)
def forward(self, input: torch.Tensor) -> torch.Tensor:
hidden_state = input
for layer in self.layers:
hidden_state = layer(hidden_state)
return hidden_state
class UperNetPyramidPoolingModule(nn.Module):
"""
Pyramid Pooling Module (PPM) used in PSPNet.
Args:
pool_scales (`Tuple[int]`):
Pooling scales used in Pooling Pyramid Module.
in_channels (`int`):
Input channels.
channels (`int`):
Channels after modules, before conv_seg.
align_corners (`bool`):
align_corners argument of F.interpolate.
"""
def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
super().__init__()
self.pool_scales = pool_scales
self.align_corners = align_corners
self.in_channels = in_channels
self.channels = channels
self.blocks = []
for i, pool_scale in enumerate(pool_scales):
block = UperNetPyramidPoolingBlock(pool_scale=pool_scale, in_channels=in_channels, channels=channels)
self.blocks.append(block)
self.add_module(str(i), block)
def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
ppm_outs = []
for ppm in self.blocks:
ppm_out = ppm(x)
upsampled_ppm_out = nn.functional.interpolate(
ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
)
ppm_outs.append(upsampled_ppm_out)
return ppm_outs
class UperNetHead(nn.Module):
"""
Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
[UPerNet](https://arxiv.org/abs/1807.10221).
"""
def __init__(self, config, in_channels):
super().__init__()
self.config = config
self.pool_scales = config.pool_scales
self.in_channels = in_channels
self.channels = config.hidden_size
self.align_corners = False
self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)
self.psp_modules = UperNetPyramidPoolingModule(
self.pool_scales,
self.in_channels[-1],
self.channels,
align_corners=self.align_corners,
)
self.bottleneck = UperNetConvModule(
self.in_channels[-1] + len(self.pool_scales) * self.channels,
self.channels,
kernel_size=3,
padding=1,
)
self.lateral_convs = nn.ModuleList()
self.fpn_convs = nn.ModuleList()
for in_channels in self.in_channels[:-1]:
l_conv = UperNetConvModule(in_channels, self.channels, kernel_size=1)
fpn_conv = UperNetConvModule(self.channels, self.channels, kernel_size=3, padding=1)
self.lateral_convs.append(l_conv)
self.fpn_convs.append(fpn_conv)
self.fpn_bottleneck = UperNetConvModule(
len(self.in_channels) * self.channels,
self.channels,
kernel_size=3,
padding=1,
)
def init_weights(self):
self.apply(self._init_weights)
def _init_weights(self, module):
if isinstance(module, nn.Conv2d):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
def psp_forward(self, inputs):
x = inputs[-1]
psp_outs = [x]
psp_outs.extend(self.psp_modules(x))
psp_outs = torch.cat(psp_outs, dim=1)
output = self.bottleneck(psp_outs)
return output
def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]
laterals.append(self.psp_forward(encoder_hidden_states))
used_backbone_levels = len(laterals)
for i in range(used_backbone_levels - 1, 0, -1):
prev_shape = laterals[i - 1].shape[2:]
laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
)
fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
fpn_outs.append(laterals[-1])
for i in range(used_backbone_levels - 1, 0, -1):
fpn_outs[i] = nn.functional.interpolate(
fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
)
fpn_outs = torch.cat(fpn_outs, dim=1)
output = self.fpn_bottleneck(fpn_outs)
output = self.classifier(output)
return output
class UperNetFCNHead(nn.Module):
"""
Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
[FCNNet](https://arxiv.org/abs/1411.4038>).
Args:
config:
Configuration.
in_channels (int):
Number of input channels.
kernel_size (int):
The kernel size for convs in the head. Default: 3.
dilation (int):
The dilation rate for convs in the head. Default: 1.
"""
def __init__(
self, config, in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1
) -> None:
super().__init__()
self.config = config
self.in_channels = config.auxiliary_in_channels
self.channels = config.auxiliary_channels
self.num_convs = config.auxiliary_num_convs
self.concat_input = config.auxiliary_concat_input
self.in_index = in_index
conv_padding = (kernel_size // 2) * dilation
convs = []
convs.append(
UperNetConvModule(
self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
)
)
for i in range(self.num_convs - 1):
convs.append(
UperNetConvModule(
self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
)
)
if self.num_convs == 0:
self.convs = nn.Identity()
else:
self.convs = nn.Sequential(*convs)
if self.concat_input:
self.conv_cat = UperNetConvModule(
self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
)
self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)
def init_weights(self):
self.apply(self._init_weights)
def _init_weights(self, module):
if isinstance(module, nn.Conv2d):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = encoder_hidden_states[self.in_index]
output = self.convs(hidden_states)
if self.concat_input:
output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
output = self.classifier(output)
return output
def _init_weights(self, module):
if isinstance(module, UperNetPreTrainedModel):
module.backbone.init_weights()
module.decode_head.init_weights()
if module.auxiliary_head is not None:
module.auxiliary_head.init_weights()
def init_weights(self):
"""Initialize the weights"""
self.backbone.init_weights()
self.decode_head.init_weights()
if self.auxiliary_head is not None:
self.auxiliary_head.init_weights()
UPERNET_START_DOCSTRING = r"""
Parameters:
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
config ([`UperNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
UPERNET_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`SegformerImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers in case the backbone has them. See
`attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers of the backbone. See `hidden_states` under
returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"""UperNet framework leveraging any vision backbone e.g. for ADE20k, CityScapes.""",
UPERNET_START_DOCSTRING,
)
class UperNetForSemanticSegmentation(UperNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.backbone = load_backbone(config)
self.decode_head = UperNetHead(config, in_channels=self.backbone.channels)
self.auxiliary_head = UperNetFCNHead(config) if config.use_auxiliary_head else None
self.post_init()
@add_start_docstrings_to_model_forward(UPERNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None,
.\models\upernet\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_upernet": ["UperNetConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_upernet"] = [
"UperNetForSemanticSegmentation",
"UperNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_upernet import UperNetConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_upernet import UperNetForSemanticSegmentation, UperNetPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\videomae\configuration_videomae.py
""" VideoMAE model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"MCG-NJU/videomae-base": "https://huggingface.co/MCG-NJU/videomae-base/resolve/main/config.json",
}
class VideoMAEConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`VideoMAEModel`]. It is used to instantiate a
VideoMAE model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the VideoMAE
[MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
class VisionTransformerConfig:
def __init__(
self,
image_size=224,
patch_size=16,
num_channels=3,
num_frames=16,
tubelet_size=2,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-12,
qkv_bias=True,
use_mean_pooling=True,
decoder_num_attention_heads=6,
decoder_hidden_size=384,
decoder_num_hidden_layers=4,
decoder_intermediate_size=1536,
norm_pix_loss=True
):
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_frames = num_frames
self.tubelet_size = tubelet_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.use_mean_pooling = use_mean_pooling
self.decoder_num_attention_heads = decoder_num_attention_heads
self.decoder_hidden_size = decoder_hidden_size
self.decoder_num_hidden_layers = decoder_num_hidden_layers
self.decoder_intermediate_size = decoder_intermediate_size
self.norm_pix_loss = norm_pix_loss
>>> from transformers import VideoMAEConfig, VideoMAEModel
>>> configuration = VideoMAEConfig()
>>> model = VideoMAEModel(configuration)
>>> configuration = model.config
.\models\videomae\convert_videomae_to_pytorch.py
"""Convert VideoMAE checkpoints from the original repository: https://github.com/MCG-NJU/VideoMAE"""
import argparse
import json
import gdown
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import (
VideoMAEConfig,
VideoMAEForPreTraining,
VideoMAEForVideoClassification,
VideoMAEImageProcessor,
)
def get_videomae_config(model_name):
config = VideoMAEConfig()
set_architecture_configs(model_name, config)
if "finetuned" not in model_name:
config.use_mean_pooling = False
if "finetuned" in model_name:
repo_id = "huggingface/label-files"
if "kinetics" in model_name:
config.num_labels = 400
filename = "kinetics400-id2label.json"
elif "ssv2" in model_name:
config.num_labels = 174
filename = "something-something-v2-id2label.json"
else:
raise ValueError("Model name should either contain 'kinetics' or 'ssv2' in case it's fine-tuned.")
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
def set_architecture_configs(model_name, config):
if "small" in model_name:
config.hidden_size = 384
config.intermediate_size = 1536
config.num_hidden_layers = 12
config.num_attention_heads = 16
config.decoder_num_hidden_layers = 12
config.decoder_num_attention_heads = 3
config.decoder_hidden_size = 192
config.decoder_intermediate_size = 768
elif "large" in model_name:
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_hidden_layers = 24
config.num_attention_heads = 16
config.decoder_num_hidden_layers = 12
config.decoder_num_attention_heads = 8
config.decoder_hidden_size = 512
config.decoder_intermediate_size = 2048
elif "huge" in model_name:
config.hidden_size = 1280
config.intermediate_size = 5120
config.num_hidden_layers = 32
config.num_attention_heads = 16
config.decoder_num_hidden_layers = 12
config.decoder_num_attention_heads = 8
config.decoder_hidden_size = 640
config.decoder_intermediate_size = 2560
elif "base" not in model_name:
raise ValueError('Model name should include either "small", "base", "large", or "huge"')
def rename_key(name):
if "encoder." in name:
name = name.replace("encoder.", "")
if "cls_token" in name:
name = name.replace("cls_token", "videomae.embeddings.cls_token")
if "decoder_pos_embed" in name:
name = name.replace("decoder_pos_embed", "decoder.decoder_pos_embed")
if "pos_embed" in name and "decoder" not in name:
name = name.replace("pos_embed", "videomae.embeddings.position_embeddings")
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "videomae.embeddings.patch_embeddings.projection")
if "patch_embed.norm" in name:
name = name.replace("patch_embed.norm", "videomae.embeddings.norm")
if "decoder.blocks" in name:
name = name.replace("decoder.blocks", "decoder.decoder_layers")
if "blocks" in name:
name = name.replace("blocks", "videomae.encoder.layer")
if "attn.proj" in name:
name = name.replace("attn.proj", "attention.output.dense")
if "attn" in name and "bias" not in name:
name = name.replace("attn", "attention.self")
if "attn" in name:
name = name.replace("attn", "attention.attention")
if "norm1" in name:
name = name.replace("norm1", "layernorm_before")
if "norm2" in name:
name = name.replace("norm2", "layernorm_after")
if "mlp.fc1" in name:
name = name.replace("mlp.fc1", "intermediate.dense")
if "mlp.fc2" in name:
name = name.replace("mlp.fc2", "output.dense")
if "decoder_embed" in name:
name = name.replace("decoder_embed", "decoder.decoder_embed")
if "decoder_norm" in name:
name = name.replace("decoder_norm", "decoder.decoder_norm")
if "decoder_pred" in name:
name = name.replace("decoder_pred", "decoder.decoder_pred")
if "norm.weight" in name and "decoder" not in name and "fc" not in name:
name = name.replace("norm.weight", "videomae.layernorm.weight")
if "norm.bias" in name and "decoder" not in name and "fc" not in name:
name = name.replace("norm.bias", "videomae.layernorm.bias")
if "head" in name and "decoder" not in name:
name = name.replace("head", "classifier")
return name
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if key.startswith("encoder."):
key = key.replace("encoder.", "")
if "qkv" in key:
key_split = key.split(".")
if key.startswith("decoder.blocks"):
dim = config.decoder_hidden_size
layer_num = int(key_split[2])
prefix = "decoder.decoder_layers."
if "weight" in key:
orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :]
orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :]
orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :]
else:
dim = config.hidden_size
layer_num = int(key_split[1])
prefix = "videomae.encoder.layer."
if "weight" in key:
orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :]
orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :]
orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :]
else:
orig_state_dict[rename_key(key)] = val
return orig_state_dict
def prepare_video():
file = hf_hub_download(
repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset"
)
video = np.load(file)
return list(video)
def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_name, push_to_hub):
config = get_videomae_config(model_name)
if "finetuned" in model_name:
model = VideoMAEForVideoClassification(config)
else:
model = VideoMAEForPreTraining(config)
output = "pytorch_model.bin"
gdown.cached_download(checkpoint_url, output, quiet=False)
files = torch.load(output, map_location="cpu")
if "model" in files:
state_dict = files["model"]
else:
state_dict = files["module"]
new_state_dict = convert_state_dict(state_dict, config)
model.load_state_dict(new_state_dict)
model.eval()
image_processor = VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
video = prepare_video()
inputs = image_processor(video, return_tensors="pt")
if "finetuned" not in model_name:
local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
inputs["bool_masked_pos"] = torch.load(local_path)
outputs = model(**inputs)
logits = outputs.logits
model_names = [
"videomae-small-finetuned-kinetics",
"videomae-small-finetuned-ssv2",
"videomae-base-short",
"videomae-base-short-finetuned-kinetics",
"videomae-base",
"videomae-base-finetuned-kinetics",
"videomae-large",
"videomae-large-finetuned-kinetics",
"videomae-huge-finetuned-kinetics",
"videomae-base-short-ssv2",
"videomae-base-short-finetuned-ssv2",
"videomae-base-ssv2",
"videomae-base-finetuned-ssv2",
]
if model_name == "videomae-small-finetuned-kinetics":
expected_shape = torch.Size([1, 400])
expected_slice = torch.tensor([-0.9291, -0.4061, -0.9307])
elif model_name == "videomae-small-finetuned-ssv2":
expected_shape = torch.Size([1, 174])
expected_slice = torch.tensor([0.2671, -0.4689, -0.8235])
elif model_name == "videomae-base":
expected_shape = torch.Size([1, 1408, 1536])
expected_slice = torch.tensor([[0.7739, 0.7968, 0.7089], [0.6701, 0.7487, 0.6209], [0.4287, 0.5158, 0.4773]])
elif model_name == "videomae-base-short":
expected_shape = torch.Size([1, 1408, 1536])
expected_slice = torch.tensor([[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]])
expected_loss = torch.tensor([0.5142]) if config.norm_pix_loss else torch.tensor([0.6469])
elif model_name == "videomae-large":
expected_shape = torch.Size([1, 1408, 1536])
expected_slice = torch.tensor([[0.7149, 0.7997, 0.6966], [0.6768, 0.7869, 0.6948], [0.5139, 0.6221, 0.5605]])
elif model_name == "videomae-large-finetuned-kinetics":
expected_shape = torch.Size([1, 400])
expected_slice = torch.tensor([0.0771, 0.0011, -0.3625])
elif model_name == "videomae-huge-finetuned-kinetics":
expected_shape = torch.Size([1, 400])
expected_slice = torch.tensor([0.2433, 0.1632, -0.4894])
elif model_name == "videomae-base-short-finetuned-kinetics":
expected_shape = torch.Size([1, 400])
expected_slice = torch.tensor([0.6588, 0.0990, -0.2493])
elif model_name == "videomae-base-finetuned-kinetics":
expected_shape = torch.Size([1, 400])
expected_slice = torch.tensor([0.3669, -0.0688, -0.2421])
elif model_name == "videomae-base-short-ssv2":
expected_shape = torch.Size([1, 1408, 1536])
expected_slice = torch.tensor([[0.4712, 0.5296, 0.5786], [0.2278, 0.2729, 0.4026], [0.0352, 0.0730, 0.2506]])
elif model_name == "videomae-base-short-finetuned-ssv2":
expected_shape = torch.Size([1, 174])
expected_slice = torch.tensor([-0.0537, -0.1539, -0.3266])
elif model_name == "videomae-base-ssv2":
expected_shape = torch.Size([1, 1408, 1536])
expected_slice = torch.tensor([[0.8131, 0.8727, 0.8546], [0.7366, 0.9377, 0.8870], [0.5935, 0.8874, 0.8564]])
elif model_name == "videomae-base-finetuned-ssv2":
expected_shape = torch.Size([1, 174])
expected_slice = torch.tensor([0.1961, -0.8337, -0.6389])
else:
raise ValueError(f"Model name not supported. Should be one of {model_names}")
assert logits.shape == expected_shape
if "finetuned" in model_name:
assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4)
else:
print("Logits:", logits[0, :3, :3])
assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4)
print("Logits ok!")
if model_name == "videomae-base-short":
loss = outputs.loss
assert torch.allclose(loss, expected_loss, atol=1e-4)
print("Loss ok!")
if pytorch_dump_folder_path is not None:
print(f"Saving model and image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
model.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print("Pushing to the hub...")
model.push_to_hub(model_name, organization="nielsr")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_url",
default="https://drive.google.com/u/1/uc?id=1tEhLyskjb755TJ65ptsrafUG2llSwQE1&export=download&confirm=t&uuid=aa3276eb-fb7e-482a-adec-dc7171df14c4",
type=str,
help=(
"URL of the original PyTorch checkpoint (on Google Drive) you'd like to convert. Should be a direct"
" download link."
),
)
parser.add_argument(
"--pytorch_dump_folder_path",
default="/Users/nielsrogge/Documents/VideoMAE/Test",
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument("--model_name", default="videomae-base", type=str, help="Name of the model.")
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_videomae_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub)
.\models\videomae\feature_extraction_videomae.py
"""
Feature extractor class for VideoMAE.
"""
import warnings
from ...utils import logging
from .image_processing_videomae import VideoMAEImageProcessor
logger = logging.get_logger(__name__)
class VideoMAEFeatureExtractor(VideoMAEImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class VideoMAEFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
" Please use VideoMAEImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\videomae\image_processing_videomae.py
"""Image processor class for VideoMAE."""
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
def make_batched(videos) -> List[List[ImageInput]]:
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
return [videos]
elif is_valid_image(videos):
return [[videos]]
raise ValueError(f"Could not make batched video from {videos}")
class VideoMAEImageProcessor(BaseImageProcessor):
r"""
Constructs a VideoMAE image processor.
# 定义函数参数说明文档,描述了预处理图像的可选参数及其默认值和用途
Args:
do_resize (`bool`, *optional*, defaults to `True`):
是否调整图像的(高度,宽度)尺寸至指定 `size`。可以被 `preprocess` 方法中的 `do_resize` 参数覆盖。
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
调整后图像的尺寸。图像的最短边将被调整至 `size["shortest_edge"]`,同时保持原始图像的长宽比。
可以被 `preprocess` 方法中的 `size` 参数覆盖。
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
调整图像尺寸时使用的重采样滤波器。可以被 `preprocess` 方法中的 `resample` 参数覆盖。
do_center_crop (`bool`, *optional*, defaults to `True`):
是否对图像进行中心裁剪至指定 `crop_size`。可以被 `preprocess` 方法中的 `do_center_crop` 参数覆盖。
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
应用中心裁剪后的图像尺寸。可以被 `preprocess` 方法中的 `crop_size` 参数覆盖。
do_rescale (`bool`, *optional*, defaults to `True`):
是否按照指定的缩放因子 `rescale_factor` 进行图像缩放。可以被 `preprocess` 方法中的 `do_rescale` 参数覆盖。
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
如果进行图像缩放,定义要使用的缩放因子。可以被 `preprocess` 方法中的 `rescale_factor` 参数覆盖。
do_normalize (`bool`, *optional*, defaults to `True`):
是否对图像进行归一化。可以被 `preprocess` 方法中的 `do_normalize` 参数覆盖。
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
归一化时使用的均值。这是一个浮点数或与图像通道数相同长度的浮点数列表。
可以被 `preprocess` 方法中的 `image_mean` 参数覆盖。
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
归一化时使用的标准差。这是一个浮点数或与图像通道数相同长度的浮点数列表。
可以被 `preprocess` 方法中的 `image_std` 参数覆盖。
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, param_name="crop_size")
self.do_resize = do_resize
self.size = size
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self._valid_processor_keys = [
"videos",
"do_resize",
"size",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
) -> np.ndarray:
"""
Resize an image.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
shortest edge of length `s` while keeping the aspect ratio of the original image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" in size:
output_size = get_resize_output_image_size(
image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
)
elif "height" in size and "width" in size:
output_size = (size["height"], size["width"])
else:
raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def _preprocess_image(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Preprocesses a single image.
"""
validate_preprocess_arguments(
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_resize=do_resize,
size=size,
resample=resample,
)
image = to_numpy_array(image)
if is_scaled_image(image) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image)
if do_resize:
image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
if do_center_crop:
image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
if do_normalize:
image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
return image
def preprocess(
self,
videos: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
"""
Ultimate method to preprocess images or videos with flexible options.
"""
.\models\videomae\modeling_videomae.py
""" PyTorch VideoMAE (masked autoencoder) model."""
import collections.abc
import math
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Set, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .configuration_videomae import VideoMAEConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "VideoMAEConfig"
_CHECKPOINT_FOR_DOC = "MCG-NJU/videomae-base"
VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST = [
"MCG-NJU/videomae-base",
]
@dataclass
class VideoMAEDecoderOutput(ModelOutput):
"""
VideoMAEDecoder 的输出类,可能包含隐藏状态和注意力权重。
Args:
logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
像素重构的 logits。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, 返回时 `output_hidden_states=True` 或 `config.output_hidden_states=True`):
一个元组,包含 `torch.FloatTensor`(嵌入层输出 + 每个层的输出)的形状为 `(batch_size, sequence_length, hidden_size)`。
模型每一层的隐藏状态以及初始嵌入层的输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, 返回时 `output_attentions=True` 或 `config.output_attentions=True`):
一个元组,包含 `torch.FloatTensor`(每个层的注意力权重)的形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
经过注意力 softmax 后的注意力权重,用于计算自注意力头的加权平均值。
"""
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class VideoMAEForPreTrainingOutput(ModelOutput):
"""
Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.
Args:
loss (`torch.FloatTensor` of shape `(1,)`):
Pixel reconstruction loss.
logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
Pixel reconstruction logits.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
def get_sinusoid_encoding_table(n_position, d_hid):
"""
Sinusoid position encoding table.
Args:
n_position (int): Number of positions to encode.
d_hid (int): Hidden dimension size.
Returns:
torch.FloatTensor: Sinusoid position encoding table of shape `(1, n_position, d_hid)`.
"""
def get_position_angle_vec(position):
return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
return torch.FloatTensor(sinusoid_table).unsqueeze(0)
class VideoMAEEmbeddings(nn.Module):
"""
Construct the patch and position embeddings for VideoMAE model.
Args:
config (object): Configuration object containing model settings.
Attributes:
patch_embeddings (VideoMAEPatchEmbeddings): Patch embeddings module.
num_patches (int): Number of patches in the input.
position_embeddings (torch.FloatTensor): Sinusoid position embeddings tensor.
config (object): Configuration object.
"""
def __init__(self, config):
super().__init__()
self.patch_embeddings = VideoMAEPatchEmbeddings(config)
self.num_patches = self.patch_embeddings.num_patches
self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size)
self.config = config
def forward(self, pixel_values, bool_masked_pos):
embeddings = self.patch_embeddings(pixel_values)
embeddings = embeddings + self.position_embeddings.type_as(embeddings).to(embeddings.device).clone().detach()
if bool_masked_pos is not None:
batch_size, _, num_channels = embeddings.shape
embeddings = embeddings[~bool_masked_pos]
embeddings = embeddings.reshape(batch_size, -1, num_channels)
return embeddings
class VideoMAEPatchEmbeddings(nn.Module):
"""
Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.
The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
patch_size).
"""
def __init__(self, config):
super().__init__()
image_size = config.image_size
patch_size = config.patch_size
num_channels = config.num_channels
hidden_size = config.hidden_size
num_frames = config.num_frames
tubelet_size = config.tubelet_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
self.image_size = image_size
self.patch_size = patch_size
self.tubelet_size = int(tubelet_size)
num_patches = (
(image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
)
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv3d(
in_channels=num_channels,
out_channels=hidden_size,
kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
stride=(self.tubelet_size, patch_size[0], patch_size[1]),
)
def forward(self, pixel_values):
batch_size, num_frames, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
pixel_values = pixel_values.permute(0, 2, 1, 3, 4)
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
return embeddings
def __init__(self, config: VideoMAEConfig) -> None:
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
if config.qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(self.all_head_size))
self.v_bias = nn.Parameter(torch.zeros(self.all_head_size))
else:
self.q_bias = None
self.v_bias = None
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
):
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None
keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias)
values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias)
queries = nn.functional.linear(input=hidden_states, weight=self.query.weight, bias=self.q_bias)
key_layer = self.transpose_for_scores(keys)
value_layer = self.transpose_for_scores(values)
query_layer = self.transpose_for_scores(queries)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class VideoMAESelfOutput(nn.Module):
"""
The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: VideoMAEConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class VideoMAEAttention(nn.Module):
def __init__(self, config: VideoMAEConfig) -> None:
super().__init__()
self.attention = VideoMAESelfAttention(config)
self.output = VideoMAESelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads: Set[int]) -> None:
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, head_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class VideoMAEIntermediate(nn.Module):
def __init__(self, config: VideoMAEConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class VideoMAEOutput(nn.Module):
def __init__(self, config: VideoMAEConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class VideoMAELayer(nn.Module):
"""This corresponds to the Block class in the timm implementation."""
def __init__(self, config: VideoMAEConfig) -> None:
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = VideoMAEAttention(config)
self.intermediate = VideoMAEIntermediate(config)
self.output = VideoMAEOutput(config)
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states),
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
hidden_states = attention_output + hidden_states
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
layer_output = self.output(layer_output, hidden_states)
outputs = (layer_output,) + outputs
return outputs
class VideoMAEEncoder(nn.Module):
def __init__(self, config: VideoMAEConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([VideoMAELayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
@add_start_docstrings(
"The bare VideoMAE Model transformer outputting raw hidden-states without any specific head on top.",
VIDEOMAE_START_DOCSTRING,
)
模型类的装饰器,用于为 `VideoMAEModel` 添加文档字符串,并且包括了模型的描述信息和参数说明。
class VideoMAEModel(VideoMAEPreTrainedModel):
定义了 `VideoMAEModel` 类,它继承自 `VideoMAEPreTrainedModel` 类,是视频多模态自编码器(VideoMAE)的模型类。
VIDEOMAE_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`VideoMAEConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
`VIDEOMAE_START_DOCSTRING` 是一个原始字符串,用于描述 `VideoMAEModel` 类的基本信息和参数说明。它介绍了模型是如何作为 PyTorch 的 `torch.nn.Module` 子类来使用的,并提供了初始化参数的说明。
VIDEOMAE_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`VideoMAEImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
`VIDEOMAE_INPUTS_DOCSTRING` 是一个原始字符串,用于描述 `VideoMAEModel` 类的输入参数。它详细说明了模型接受的各种输入参数,包括像素值、头部掩码、是否返回注意力张量和隐藏状态等。
这些注释和文档字符串为 `VideoMAEModel` 类提供了清晰的描述和参数说明,帮助用户了解如何使用和配置该模型。
def __init__(self, config):
super().__init__(config)
self.config = config
self.embeddings = VideoMAEEmbeddings(config)
self.encoder = VideoMAEEncoder(config)
if config.use_mean_pooling:
self.layernorm = None
else:
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
bool_masked_pos: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
class VideoMAEDecoder(nn.Module):
def __init__(self, config, num_patches):
super().__init__()
decoder_num_labels = config.num_channels * config.tubelet_size * config.patch_size**2
decoder_config = deepcopy(config)
decoder_config.hidden_size = config.decoder_hidden_size
decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
decoder_config.num_attention_heads = config.decoder_num_attention_heads
decoder_config.intermediate_size = config.decoder_intermediate_size
self.decoder_layers = nn.ModuleList(
[VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
)
self.norm = nn.LayerNorm(config.decoder_hidden_size)
self.head = (
nn.Linear(config.decoder_hidden_size, decoder_num_labels) if decoder_num_labels > 0 else nn.Identity()
)
self.gradient_checkpointing = False
self.config = config
def forward(
self,
hidden_states,
return_token_num,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.decoder_layers):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
None,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, head_mask=None, output_attentions=output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if return_token_num > 0:
hidden_states = hidden_states[:, -return_token_num:]
hidden_states = self.norm(hidden_states)
logits = self.head(hidden_states)
if not return_dict:
return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
return VideoMAEDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)
@add_start_docstrings(
"The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.",
VIDEOMAE_START_DOCSTRING,
)
class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.videomae = VideoMAEModel(config)
self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=False)
self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
self.position_embeddings = get_sinusoid_encoding_table(
self.videomae.embeddings.num_patches, config.decoder_hidden_size
)
self.decoder = VideoMAEDecoder(config, num_patches=self.videomae.embeddings.num_patches)
self.post_init()
@add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=VideoMAEForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
bool_masked_pos: torch.BoolTensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
states of all tokens) e.g. for ImageNet.""",
VIDEOMAE_START_DOCSTRING,
)
class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.videomae = VideoMAEModel(config)
self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.post_init()
@add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\videomae\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_videomae": ["VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "VideoMAEConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_videomae"] = [
"VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST",
"VideoMAEForPreTraining",
"VideoMAEModel",
"VideoMAEPreTrainedModel",
"VideoMAEForVideoClassification",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_videomae"] = ["VideoMAEFeatureExtractor"]
_import_structure["image_processing_videomae"] = ["VideoMAEImageProcessor"]
if TYPE_CHECKING:
from .configuration_videomae import VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP, VideoMAEConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_videomae import (
VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST,
VideoMAEForPreTraining,
VideoMAEForVideoClassification,
VideoMAEModel,
VideoMAEPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_videomae import VideoMAEFeatureExtractor
from .image_processing_videomae import VideoMAEImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\vilt\configuration_vilt.py
""" VilT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
VILT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"dandelin/vilt-b32-mlm": "https://huggingface.co/dandelin/vilt-b32-mlm/blob/main/config.json"
}
class ViltConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ViLTModel`]. It is used to instantiate an ViLT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the ViLT
[dandelin/vilt-b32-mlm](https://huggingface.co/dandelin/vilt-b32-mlm) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import ViLTModel, ViLTConfig
>>> # Initializing a ViLT dandelin/vilt-b32-mlm style configuration
>>> configuration = ViLTConfig()
>>> # Initializing a model from the dandelin/vilt-b32-mlm style configuration
>>> model = ViLTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
model_type = "vilt"
def __init__(
self,
vocab_size=30522,
type_vocab_size=2,
modality_type_vocab_size=2,
max_position_embeddings=40,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-12,
image_size=384,
patch_size=32,
num_channels=3,
qkv_bias=True,
max_image_length=-1,
tie_word_embeddings=False,
num_images=-1,
**kwargs,
):
# 调用父类的构造函数,初始化模型参数和超参数
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
# 设置模型的词汇表大小
self.vocab_size = vocab_size
# 设置模型的类型词汇表大小
self.type_vocab_size = type_vocab_size
# 设置模型的模态类型词汇表大小
self.modality_type_vocab_size = modality_type_vocab_size
# 设置模型的最大位置嵌入长度
self.max_position_embeddings = max_position_embeddings
# 设置模型的隐藏层大小
self.hidden_size = hidden_size
# 设置模型的隐藏层数量
self.num_hidden_layers = num_hidden_layers
# 设置模型的注意力头数量
self.num_attention_heads = num_attention_heads
# 设置模型的中间层大小
self.intermediate_size = intermediate_size
# 设置模型的隐藏层激活函数类型
self.hidden_act = hidden_act
# 设置模型的隐藏层的丢弃率
self.hidden_dropout_prob = hidden_dropout_prob
# 设置模型的注意力机制的概率丢弃率
self.attention_probs_dropout_prob = attention_probs_dropout_prob
# 设置模型的初始化范围
self.initializer_range = initializer_range
# 设置模型的层归一化 epsilon 参数
self.layer_norm_eps = layer_norm_eps
# 设置模型的图像输入大小
self.image_size = image_size
# 设置模型的图像块的大小
self.patch_size = patch_size
# 设置模型的图像通道数量
self.num_channels = num_channels
# 设置模型的注意力中的查询、键、值是否包含偏置
self.qkv_bias = qkv_bias
# 设置模型的最大图像长度
self.max_image_length = max_image_length
# 设置模型处理的图像数量
self.num_images = num_images
.\models\vilt\convert_vilt_original_to_pytorch.py
"""Convert ViLT checkpoints from the original Github repository."""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
BertTokenizer,
ViltConfig,
ViltForImageAndTextRetrieval,
ViltForImagesAndTextClassification,
ViltForMaskedLM,
ViltForQuestionAnswering,
ViltImageProcessor,
ViltProcessor,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config, vqa_model=False, nlvr_model=False, irtr_model=False):
rename_keys = []
for i in range(config.num_hidden_layers):
rename_keys.append((f"transformer.blocks.{i}.norm1.weight", f"vilt.encoder.layer.{i}.layernorm_before.weight"))
rename_keys.append((f"transformer.blocks.{i}.norm1.bias", f"vilt.encoder.layer.{i}.layernorm_before.bias"))
rename_keys.append(
(f"transformer.blocks.{i}.attn.proj.weight", f"vilt.encoder.layer.{i}.attention.output.dense.weight")
)
rename_keys.append(
(f"transformer.blocks.{i}.attn.proj.bias", f"vilt.encoder.layer.{i}.attention.output.dense.bias")
)
rename_keys.append((f"transformer.blocks.{i}.norm2.weight", f"vilt.encoder.layer.{i}.layernorm_after.weight"))
rename_keys.append((f"transformer.blocks.{i}.norm2.bias", f"vilt.encoder.layer.{i}.layernorm_after.bias"))
rename_keys.append(
(f"transformer.blocks.{i}.mlp.fc1.weight", f"vilt.encoder.layer.{i}.intermediate.dense.weight")
)
rename_keys.append(
(f"transformer.blocks.{i}.mlp.fc1.bias", f"vilt.encoder.layer.{i}.intermediate.dense.bias")
)
rename_keys.append((f"transformer.blocks.{i}.mlp.fc2.weight", f"vilt.encoder.layer.{i}.output.dense.weight"))
rename_keys.append((f"transformer.blocks.{i}.mlp.fc2.bias", f"vilt.encoder.layer.{i}.output.dense.bias"))
rename_keys.extend(
[
("text_embeddings.word_embeddings.weight", "vilt.embeddings.text_embeddings.word_embeddings.weight"),
("text_embeddings.position_embeddings.weight", "vilt.embeddings.text_embeddings.position_embeddings.weight"),
("text_embeddings.position_ids", "vilt.embeddings.text_embeddings.position_ids"),
("text_embeddings.token_type_embeddings.weight", "vilt.embeddings.text_embeddings.token_type_embeddings.weight"),
("text_embeddings.LayerNorm.weight", "vilt.embeddings.text_embeddings.LayerNorm.weight"),
("text_embeddings.LayerNorm.bias", "vilt.embeddings.text_embeddings.LayerNorm.bias"),
("transformer.cls_token", "vilt.embeddings.cls_token"),
("transformer.patch_embed.proj.weight", "vilt.embeddings.patch_embeddings.projection.weight"),
("transformer.patch_embed.proj.bias", "vilt.embeddings.patch_embeddings.projection.bias"),
("transformer.pos_embed", "vilt.embeddings.position_embeddings"),
("token_type_embeddings.weight", "vilt.embeddings.token_type_embeddings.weight"),
]
)
rename_keys.extend(
[
("transformer.norm.weight", "vilt.layernorm.weight"),
("transformer.norm.bias", "vilt.layernorm.bias"),
("pooler.dense.weight", "vilt.pooler.dense.weight"),
("pooler.dense.bias", "vilt.pooler.dense.bias"),
]
)
if vqa_model:
rename_keys.extend(
[
("vqa_classifier.0.weight", "classifier.0.weight"),
("vqa_classifier.0.bias", "classifier.0.bias"),
("vqa_classifier.1.weight", "classifier.1.weight"),
("vqa_classifier.1.bias", "classifier.1.bias"),
("vqa_classifier.3.weight", "classifier.3.weight"),
("vqa_classifier.3.bias", "classifier.3.bias"),
]
)
elif nlvr_model:
rename_keys.extend(
[
("nlvr2_classifier.0.weight", "classifier.0.weight"),
("nlvr2_classifier.0.bias", "classifier.0.bias"),
("nlvr2_classifier.1.weight", "classifier.1.weight"),
("nlvr2_classifier.1.bias", "classifier.1.bias"),
("nlvr2_classifier.3.weight", "classifier.3.weight"),
("nlvr2_classifier.3.bias", "classifier.3.bias"),
]
)
else:
pass
return rename_keys
def read_in_q_k_v(state_dict, config):
for i in range(config.num_hidden_layers):
prefix = "vilt."
in_proj_weight = state_dict.pop(f"transformer.blocks.{i}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"transformer.blocks.{i}.attn.qkv.bias")
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[config.hidden_size : config.hidden_size * 2, :]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[config.hidden_size : config.hidden_size * 2]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
def remove_classification_head_(state_dict):
ignore_keys = ["head.weight", "head.bias"]
for k in ignore_keys:
state_dict.pop(k, None)
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
@torch.no_grad()
def convert_vilt_checkpoint(checkpoint_url, pytorch_dump_folder_path):
"""
复制/粘贴/调整模型的权重到我们的ViLT结构中。
"""
config = ViltConfig(image_size=384, patch_size=32, tie_word_embeddings=False)
mlm_model = False
vqa_model = False
nlvr_model = False
irtr_model = False
if "vqa" in checkpoint_url:
vqa_model = True
config.num_labels = 3129
repo_id = "huggingface/label-files"
filename = "vqa2-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
model = ViltForQuestionAnswering(config)
elif "nlvr" in checkpoint_url:
nlvr_model = True
config.num_labels = 2
config.id2label = {0: "False", 1: "True"}
config.label2id = {v: k for k, v in config.id2label.items()}
config.modality_type_vocab_size = 3
model = ViltForImagesAndTextClassification(config)
elif "irtr" in checkpoint_url:
irtr_model = True
model = ViltForImageAndTextRetrieval(config)
elif "mlm_itm" in checkpoint_url:
mlm_model = True
model = ViltForMaskedLM(config)
else:
raise ValueError("Unknown model type")
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"]
rename_keys = create_rename_keys(config, vqa_model, nlvr_model, irtr_model)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, config)
if mlm_model or irtr_model:
ignore_keys = ["itm_score.fc.weight", "itm_score.fc.bias"]
for k in ignore_keys:
state_dict.pop(k, None)
model.eval()
if mlm_model:
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
assert missing_keys == ["mlm_score.decoder.bias"]
else:
model.load_state_dict(state_dict)
image_processor = ViltImageProcessor(size=384)
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
processor = ViltProcessor(image_processor, tokenizer)
if nlvr_model:
image1 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
image2 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
text = (
"The left image contains twice the number of dogs as the right image, and at least two dogs in total are"
" standing."
)
encoding_1 = processor(image1, text, return_tensors="pt")
encoding_2 = processor(image2, text, return_tensors="pt")
outputs = model(
input_ids=encoding_1.input_ids,
pixel_values=encoding_1.pixel_values,
pixel_values_2=encoding_2.pixel_values,
)
else:
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
if mlm_model:
text = "a bunch of [MASK] laying on a [MASK]."
else:
text = "How many cats are there?"
encoding = processor(image, text, return_tensors="pt")
outputs = model(**encoding)
if mlm_model:
expected_shape = torch.Size([1, 11, 30522])
expected_slice = torch.tensor([-12.5061, -12.5123, -12.5174])
assert outputs.logits.shape == expected_shape
assert torch.allclose(outputs.logits[0, 0, :3], expected_slice, atol=1e-4)
predicted_id = outputs.logits[0, 4, :].argmax(-1).item()
assert tokenizer.decode([predicted_id]) == "cats"
elif vqa_model:
expected_shape = torch.Size([1, 3129])
expected_slice = torch.tensor([-15.9495, -18.1472, -10.3041])
assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)
assert outputs.logits.shape == expected_shape
assert torch.allclose(outputs.logits[0, 0, :3], expected_slice, atol=1e-4)
predicted_idx = outputs.logits.argmax(-1).item()
assert model.config.id2label[predicted_idx] == "2"
elif nlvr_model:
expected_shape = torch.Size([1, 2])
expected_slice = torch.tensor([-2.8721, 2.1291])
assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)
assert outputs.logits.shape == expected_shape
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model and processor to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_url",
default="https://github.com/dandelin/ViLT/releases/download/200k/vilt_200k_mlm_itm.ckpt",
type=str,
help="URL of the checkpoint you'd like to convert."
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the output PyTorch model directory."
)
args = parser.parse_args()
convert_vilt_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path)
.\models\vilt\feature_extraction_vilt.py
"""ViLT 的特征提取器类。"""
import warnings
from ...utils import logging
from .image_processing_vilt import ViltImageProcessor
logger = logging.get_logger(__name__)
class ViltFeatureExtractor(ViltImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class ViltFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
" use ViltImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\vilt\image_processing_vilt.py
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import PaddingMode, pad, resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
返回可迭代值中各索引位置的最大值列表。
"""
return [max(values_i) for values_i in zip(*values)]
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
创建图像的像素掩码,其中 1 表示有效像素,0 表示填充像素。
Args:
image (`np.ndarray`):
要创建像素掩码的图像。
output_size (`Tuple[int, int]`):
掩码的输出大小。
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
获取批量图像中所有图像的最大高度和宽度。
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_data_format == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_data_format}")
return (max_height, max_width)
def get_resize_output_image_size(
input_image: np.ndarray,
shorter: int = 800,
longer: int = 1333,
size_divisor: int = 32,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
input_height, input_width = get_image_size(input_image, input_data_format)
min_size, max_size = shorter, longer
scale = min_size / min(input_height, input_width)
if input_height < input_width:
new_height = min_size
new_width = scale * input_width
else:
new_height = scale * input_height
new_width = min_size
if max(new_height, new_width) > max_size:
scale = max_size / max(new_height, new_width)
new_height = scale * new_height
new_width = scale * new_width
new_height, new_width = int(new_height + 0.5), int(new_width + 0.5)
new_height = new_height // size_divisor * size_divisor
new_width = new_width // size_divisor * size_divisor
return new_height, new_width
class ViltImageProcessor(BaseImageProcessor):
r"""
构建 ViLT 图像处理器。
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
size_divisor: int = 32,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: bool = True,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
do_pad = kwargs.pop("pad_and_return_pixel_mask")
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 384}
size = get_size_dict(size, default_to_square=False)
self.do_resize = do_resize
self.size = size
self.size_divisor = size_divisor
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.do_pad = do_pad
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"size_divisor",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"do_pad",
"return_tensors",
"data_format",
"input_data_format",
]
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
image_processor_dict = image_processor_dict.copy()
if "pad_and_return_pixel_mask" in kwargs:
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
return super().from_dict(image_processor_dict, **kwargs)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
size_divisor: int = 32,
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image.
Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the
longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then
resized to the max size while preserving the aspect ratio.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
size_divisor (`int`, defaults to 32):
The image is resized to a size that is a multiple of this value.
resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" not in size:
raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}")
shorter = size["shortest_edge"]
longer = int(1333 / 800 * shorter)
output_size = get_resize_output_image_size(
image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format
)
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = output_size
pad_bottom = output_height - input_height
pad_right = output_width - input_width
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
def pad(
self,
images: List[np.ndarray],
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
in the batch and optionally returns their corresponding pixel mask.
Args:
image (`np.ndarray`):
Image to pad.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
Whether to return a pixel mask.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
padded_images = [
self._pad_image(
image,
pad_size,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
for image in images
]
data = {"pixel_values": padded_images}
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
return BatchFeature(data=data, tensor_type=return_tensors)
def preprocess(
self,
images: ImageInput,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
size_divisor: Optional[int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,