Transformers Source Code Walkthrough (Part 31)
.\models\ctrl\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {
"configuration_ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig"],
"tokenization_ctrl": ["CTRLTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_ctrl"] = [
"CTRL_PRETRAINED_MODEL_ARCHIVE_LIST",
"CTRLForSequenceClassification",
"CTRLLMHeadModel",
"CTRLModel",
"CTRLPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_ctrl"] = [
"TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFCTRLForSequenceClassification",
"TFCTRLLMHeadModel",
"TFCTRLModel",
"TFCTRLPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
from .tokenization_ctrl import CTRLTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_ctrl import (
CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
CTRLForSequenceClassification,
CTRLLMHeadModel,
CTRLModel,
CTRLPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_ctrl import (
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCTRLForSequenceClassification,
TFCTRLLMHeadModel,
TFCTRLModel,
TFCTRLPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
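# Usage sketch: thanks to the _LazyModule registration above, importing the package is cheap and the
# heavy modeling_ctrl / modeling_tf_ctrl submodules are only loaded on first attribute access. The tiny
# config values below are hypothetical and PyTorch is assumed to be installed.
import transformers  # importing the package does not yet import modeling_ctrl

config = transformers.CTRLConfig(n_embd=64, n_layer=2, n_head=2, dff=128)  # hypothetical tiny config
model = transformers.CTRLLMHeadModel(config)  # this attribute access triggers the lazy import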
.\models\cvt\configuration_cvt.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
CVT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/cvt-13": "https://huggingface.co/microsoft/cvt-13/resolve/main/config.json",
}
class CvtConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`CvtModel`]. It is used to instantiate a CvT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a configuration similar to that of the CvT
    [microsoft/cvt-13](https://huggingface.co/microsoft/cvt-13) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:
```
>>> from transformers import CvtConfig, CvtModel
    >>> # Initializing a Cvt msft/cvt style configuration
    >>> configuration = CvtConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = CvtModel(configuration)

    >>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "cvt"
def __init__(
self,
num_channels=3,
patch_sizes=[7, 3, 3],
patch_stride=[4, 2, 2],
patch_padding=[2, 1, 1],
embed_dim=[64, 192, 384],
num_heads=[1, 3, 6],
depth=[1, 2, 10],
mlp_ratio=[4.0, 4.0, 4.0],
attention_drop_rate=[0.0, 0.0, 0.0],
drop_rate=[0.0, 0.0, 0.0],
drop_path_rate=[0.0, 0.0, 0.1],
qkv_bias=[True, True, True],
cls_token=[False, False, True],
qkv_projection_method=["dw_bn", "dw_bn", "dw_bn"],
kernel_qkv=[3, 3, 3],
padding_kv=[1, 1, 1],
stride_kv=[2, 2, 2],
padding_q=[1, 1, 1],
stride_q=[1, 1, 1],
initializer_range=0.02,
layer_norm_eps=1e-12,
**kwargs,
):
super().__init__(**kwargs)
self.num_channels = num_channels
self.patch_sizes = patch_sizes
self.patch_stride = patch_stride
self.patch_padding = patch_padding
self.embed_dim = embed_dim
self.num_heads = num_heads
self.depth = depth
self.mlp_ratio = mlp_ratio
self.attention_drop_rate = attention_drop_rate
self.drop_rate = drop_rate
self.drop_path_rate = drop_path_rate
self.qkv_bias = qkv_bias
self.cls_token = cls_token
self.qkv_projection_method = qkv_projection_method
self.kernel_qkv = kernel_qkv
self.padding_kv = padding_kv
self.stride_kv = stride_kv
self.padding_q = padding_q
self.stride_q = stride_q
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
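# Note that almost every CvtConfig argument is a per-stage list of length three. A short, hypothetical
# sketch of overriding one of them, e.g. to get the deeper CvT-21 layout used by the conversion script below:
from transformers import CvtConfig

config = CvtConfig(depth=[1, 4, 16])  # hypothetical override; the defaults correspond to CvT-13
print(config.depth)      # [1, 4, 16]
print(config.embed_dim)  # [64, 192, 384] -- one entry per stage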
.\models\cvt\convert_cvt_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
from collections import OrderedDict

import torch
from huggingface_hub import cached_download, hf_hub_url

from transformers import AutoImageProcessor, CvtConfig, CvtForImageClassification


def embeddings(idx):
"""
The function helps in renaming embedding layer weights.
Args:
idx: stage number in original model
"""
embed = []
embed.append(
(
f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.weight",
f"stage{idx}.patch_embed.proj.weight",
)
)
embed.append(
(
f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.bias",
f"stage{idx}.patch_embed.proj.bias",
)
)
embed.append(
(
f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.weight",
f"stage{idx}.patch_embed.norm.weight",
)
)
embed.append(
(
f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.bias",
f"stage{idx}.patch_embed.norm.bias",
)
)
return embed
def attention(idx, cnt):
"""
The function helps in renaming attention block layers weights.
Args:
idx: stage number in original model
cnt: count of blocks in each stage
"""
attention_weights = []
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.convolution.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.conv.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.bias",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_mean",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_mean",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_var",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_var",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.num_batches_tracked",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.num_batches_tracked",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.convolution.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.conv.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.bias",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_mean",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_mean",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_var",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_var",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.num_batches_tracked",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.num_batches_tracked",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.convolution.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.conv.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.weight",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.bias",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_mean",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_mean",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_var",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_var",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.num_batches_tracked",
f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.num_batches_tracked",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.weight",
f"stage{idx}.blocks.{cnt}.attn.proj_q.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.bias",
f"stage{idx}.blocks.{cnt}.attn.proj_q.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.weight",
f"stage{idx}.blocks.{cnt}.attn.proj_k.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.bias",
f"stage{idx}.blocks.{cnt}.attn.proj_k.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.weight",
f"stage{idx}.blocks.{cnt}.attn.proj_v.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.bias",
f"stage{idx}.blocks.{cnt}.attn.proj_v.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.weight",
f"stage{idx}.blocks.{cnt}.attn.proj.weight",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.bias",
f"stage{idx}.blocks.{cnt}.attn.proj.bias",
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.weight",
f"stage{idx}.blocks.{cnt}.mlp.fc1.weight"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.bias",
f"stage{idx}.blocks.{cnt}.mlp.fc1.bias"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.weight",
f"stage{idx}.blocks.{cnt}.mlp.fc2.weight"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.bias",
f"stage{idx}.blocks.{cnt}.mlp.fc2.bias"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.weight",
f"stage{idx}.blocks.{cnt}.norm1.weight"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.bias",
f"stage{idx}.blocks.{cnt}.norm1.bias"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.weight",
f"stage{idx}.blocks.{cnt}.norm2.weight"
)
)
attention_weights.append(
(
f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.bias",
f"stage{idx}.blocks.{cnt}.norm2.bias"
)
)
return attention_weights
def cls_token(idx):
token = []
token.append((f"cvt.encoder.stages.{idx}.cls_token", "stage2.cls_token"))
return token
def final():
head = []
head.append(("layernorm.weight", "norm.weight"))
head.append(("layernorm.bias", "norm.bias"))
head.append(("classifier.weight", "head.weight"))
head.append(("classifier.bias", "head.bias"))
return head
def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder):
img_labels_file = "imagenet-1k-id2label.json"
num_labels = 1000
repo_id = "huggingface/label-files"
id2label = json.load(open(cached_download(hf_hub_url(repo_id, img_labels_file, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
config = CvtConfig(num_labels=num_labels, id2label=id2label, label2id=label2id)
if cvt_model.rsplit("/", 1)[-1][4:6] == "13":
config.depth = [1, 2, 10]
elif cvt_model.rsplit("/", 1)[-1][4:6] == "21":
config.depth = [1, 4, 16]
else:
config.depth = [2, 2, 20]
config.num_heads = [3, 12, 16]
config.embed_dim = [192, 768, 1024]
model = CvtForImageClassification(config)
image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k")
image_processor.size["shortest_edge"] = image_size
original_weights = torch.load(cvt_file_name, map_location=torch.device("cpu"))
huggingface_weights = OrderedDict()
list_of_state_dict = []
for idx in range(len(config.depth)):
if config.cls_token[idx]:
list_of_state_dict = list_of_state_dict + cls_token(idx)
list_of_state_dict = list_of_state_dict + embeddings(idx)
for cnt in range(config.depth[idx]):
list_of_state_dict = list_of_state_dict + attention(idx, cnt)
list_of_state_dict = list_of_state_dict + final()
for i in range(len(list_of_state_dict)):
huggingface_weights[list_of_state_dict[i][0]] = original_weights[list_of_state_dict[i][1]]
model.load_state_dict(huggingface_weights)
model.save_pretrained(pytorch_dump_folder)
image_processor.save_pretrained(pytorch_dump_folder)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cvt_model",
        default="cvt-w24",
        type=str,
        help="Name of the cvt model you'd like to convert.",
    )
    parser.add_argument(
        "--image_size",
        default=384,
        type=int,
        help="Input Image Size",
    )
    parser.add_argument(
        "--cvt_file_name",
        default=r"cvtmodels\CvT-w24-384x384-IN-22k.pth",
        type=str,
        help="Path to the original (non-HF) CvT checkpoint file.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output PyTorch model directory.",
    )
    args = parser.parse_args()
    convert_cvt_checkpoint(args.cvt_model, args.image_size, args.cvt_file_name, args.pytorch_dump_folder_path)
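# The argument parser above is only a thin wrapper; the conversion can also be driven directly from
# Python. A hypothetical invocation, assuming the original Microsoft checkpoint has already been
# downloaded next to the script:
convert_cvt_checkpoint(
    cvt_model="cvt-13",
    image_size=384,
    cvt_file_name="CvT-13-384x384-IN-22k.pth",  # assumed local path to the original checkpoint
    pytorch_dump_folder="./cvt-13-converted",
)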
.\models\cvt\modeling_cvt.py
""" PyTorch CvT model."""
import collections.abc
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput
from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import logging
from .configuration_cvt import CvtConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "CvtConfig"
_CHECKPOINT_FOR_DOC = "microsoft/cvt-13"
_EXPECTED_OUTPUT_SHAPE = [1, 384, 14, 14]
_IMAGE_CLASS_CHECKPOINT = "microsoft/cvt-13"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
CVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/cvt-13",
"microsoft/cvt-13-384",
"microsoft/cvt-13-384-22k",
"microsoft/cvt-21",
"microsoft/cvt-21-384",
"microsoft/cvt-21-384-22k",
]
@dataclass
class BaseModelOutputWithCLSToken(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
Classification token at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
"""
last_hidden_state: torch.FloatTensor = None
cls_token_value: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
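# Quick sanity sketch of the stochastic-depth behaviour implemented above: whole samples are either
# zeroed or rescaled by 1 / keep_prob, so the output equals the input in expectation. The shapes here
# are arbitrary.
x = torch.ones(4, 3, 8, 8)
out = drop_path(x, drop_prob=0.2, training=True)
print(out.view(4, -1).amax(dim=1))  # each entry is either 0.0 or 1 / 0.8 == 1.25, varying run to run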
class CvtDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class CvtEmbeddings(nn.Module):
"""
Construct the CvT embeddings.
"""
def __init__(self, patch_size, num_channels, embed_dim, stride, padding, dropout_rate):
super().__init__()
self.convolution_embeddings = CvtConvEmbeddings(
patch_size=patch_size, num_channels=num_channels, embed_dim=embed_dim, stride=stride, padding=padding
)
self.dropout = nn.Dropout(dropout_rate)
def forward(self, pixel_values):
hidden_state = self.convolution_embeddings(pixel_values)
hidden_state = self.dropout(hidden_state)
return hidden_state
class CvtConvEmbeddings(nn.Module):
"""
Image to Conv Embedding.
"""
def __init__(self, patch_size, num_channels, embed_dim, stride, padding):
super().__init__()
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
self.patch_size = patch_size
self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
self.normalization = nn.LayerNorm(embed_dim)
def forward(self, pixel_values):
pixel_values = self.projection(pixel_values)
batch_size, num_channels, height, width = pixel_values.shape
hidden_size = height * width
pixel_values = pixel_values.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
if self.normalization:
pixel_values = self.normalization(pixel_values)
pixel_values = pixel_values.permute(0, 2, 1).view(batch_size, num_channels, height, width)
return pixel_values
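# Shape walk-through for stage 0 of the default configuration (a sketch assuming a 224x224 input): the
# 7x7 / stride-4 convolution maps (1, 3, 224, 224) to (1, 64, 56, 56); the feature map is flattened to
# (1, 3136, 64) for the LayerNorm and then reshaped back.
embeddings = CvtConvEmbeddings(patch_size=7, num_channels=3, embed_dim=64, stride=4, padding=2)
print(embeddings(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 64, 56, 56])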
class CvtSelfAttentionConvProjection(nn.Module):
def __init__(self, embed_dim, kernel_size, padding, stride):
super().__init__()
self.convolution = nn.Conv2d(
embed_dim,
embed_dim,
kernel_size=kernel_size,
padding=padding,
stride=stride,
bias=False,
groups=embed_dim,
)
self.normalization = nn.BatchNorm2d(embed_dim)
def forward(self, hidden_state):
hidden_state = self.convolution(hidden_state)
hidden_state = self.normalization(hidden_state)
return hidden_state
class CvtSelfAttentionLinearProjection(nn.Module):
def forward(self, hidden_state):
batch_size, num_channels, height, width = hidden_state.shape
hidden_size = height * width
hidden_state = hidden_state.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
return hidden_state
class CvtSelfAttentionProjection(nn.Module):
def __init__(self, embed_dim, kernel_size, padding, stride, projection_method="dw_bn"):
super().__init__()
if projection_method == "dw_bn":
self.convolution_projection = CvtSelfAttentionConvProjection(embed_dim, kernel_size, padding, stride)
self.linear_projection = CvtSelfAttentionLinearProjection()
def forward(self, hidden_state):
hidden_state = self.convolution_projection(hidden_state)
hidden_state = self.linear_projection(hidden_state)
return hidden_state
class CvtSelfAttention(nn.Module):
def __init__(
self,
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
with_cls_token=True,
**kwargs,
):
super().__init__()
self.scale = embed_dim**-0.5
self.with_cls_token = with_cls_token
self.embed_dim = embed_dim
self.num_heads = num_heads
self.convolution_projection_query = CvtSelfAttentionProjection(
embed_dim,
kernel_size,
padding_q,
stride_q,
projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
)
self.convolution_projection_key = CvtSelfAttentionProjection(
embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
)
self.convolution_projection_value = CvtSelfAttentionProjection(
embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
)
self.projection_query = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
self.projection_key = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
self.projection_value = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
self.dropout = nn.Dropout(attention_drop_rate)
def rearrange_for_multi_head_attention(self, hidden_state):
batch_size, hidden_size, _ = hidden_state.shape
head_dim = self.embed_dim // self.num_heads
return hidden_state.view(batch_size, hidden_size, self.num_heads, head_dim).permute(0, 2, 1, 3)
def forward(self, hidden_state, height, width):
if self.with_cls_token:
cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
batch_size, hidden_size, num_channels = hidden_state.shape
hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
key = self.convolution_projection_key(hidden_state)
query = self.convolution_projection_query(hidden_state)
value = self.convolution_projection_value(hidden_state)
if self.with_cls_token:
query = torch.cat((cls_token, query), dim=1)
key = torch.cat((cls_token, key), dim=1)
value = torch.cat((cls_token, value), dim=1)
head_dim = self.embed_dim // self.num_heads
query = self.rearrange_for_multi_head_attention(self.projection_query(query))
key = self.rearrange_for_multi_head_attention(self.projection_key(key))
value = self.rearrange_for_multi_head_attention(self.projection_value(value))
attention_score = torch.einsum("bhlk,bhtk->bhlt", [query, key]) * self.scale
attention_probs = torch.nn.functional.softmax(attention_score, dim=-1)
attention_probs = self.dropout(attention_probs)
context = torch.einsum("bhlt,bhtv->bhlv", [attention_probs, value])
_, _, hidden_size, _ = context.shape
context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, hidden_size, self.num_heads * head_dim)
return context
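# The two einsum calls above are ordinary batched matrix products over the head dimension:
# "bhlk,bhtk->bhlt" is query @ key.transpose(-1, -2) and "bhlt,bhtv->bhlv" is attention_probs @ value.
# A small standalone equivalence check with arbitrary shapes:
q = torch.randn(2, 4, 10, 16)  # (batch, heads, tokens, head_dim)
k = torch.randn(2, 4, 10, 16)
print(torch.allclose(torch.einsum("bhlk,bhtk->bhlt", [q, k]), q @ k.transpose(-1, -2), atol=1e-6))  # True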
class CvtSelfOutput(nn.Module):
"""
The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, embed_dim, drop_rate):
super().__init__()
self.dense = nn.Linear(embed_dim, embed_dim)
self.dropout = nn.Dropout(drop_rate)
def forward(self, hidden_state, input_tensor):
hidden_state = self.dense(hidden_state)
hidden_state = self.dropout(hidden_state)
return hidden_state
class CvtAttention(nn.Module):
def __init__(
self,
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
drop_rate,
with_cls_token=True,
):
super().__init__()
self.attention = CvtSelfAttention(
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
with_cls_token,
)
self.output = CvtSelfOutput(embed_dim, drop_rate)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_state, height, width):
self_output = self.attention(hidden_state, height, width)
attention_output = self.output(self_output, hidden_state)
return attention_output
class CvtIntermediate(nn.Module):
def __init__(self, embed_dim, mlp_ratio):
super().__init__()
self.dense = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
self.activation = nn.GELU()
def forward(self, hidden_state):
hidden_state = self.dense(hidden_state)
hidden_state = self.activation(hidden_state)
return hidden_state
class CvtOutput(nn.Module):
def __init__(self, embed_dim, mlp_ratio, drop_rate):
super().__init__()
self.dense = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
self.dropout = nn.Dropout(drop_rate)
def forward(self, hidden_state, input_tensor):
hidden_state = self.dense(hidden_state)
hidden_state = self.dropout(hidden_state)
hidden_state = hidden_state + input_tensor
return hidden_state
class CvtLayer(nn.Module):
"""
CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
"""
def __init__(
self,
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
drop_rate,
mlp_ratio,
drop_path_rate,
with_cls_token=True,
):
super().__init__()
self.attention = CvtAttention(
num_heads,
embed_dim,
kernel_size,
padding_q,
padding_kv,
stride_q,
stride_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
drop_rate,
with_cls_token,
)
self.intermediate = CvtIntermediate(embed_dim, mlp_ratio)
self.output = CvtOutput(embed_dim, mlp_ratio, drop_rate)
self.drop_path = CvtDropPath(drop_prob=drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.layernorm_before = nn.LayerNorm(embed_dim)
self.layernorm_after = nn.LayerNorm(embed_dim)
def forward(self, hidden_state, height, width):
self_attention_output = self.attention(
self.layernorm_before(hidden_state),
height,
width,
)
attention_output = self_attention_output
attention_output = self.drop_path(attention_output)
hidden_state = attention_output + hidden_state
layer_output = self.layernorm_after(hidden_state)
layer_output = self.intermediate(layer_output)
layer_output = self.output(layer_output, hidden_state)
layer_output = self.drop_path(layer_output)
return layer_output
class CvtStage(nn.Module):
    """
    CvT stage (encoder block). Each stage consists of a Convolutional Token Embedding layer followed by a stack of
    Convolutional Transformer Blocks (`CvtLayer`). The classification token is added only in the last stage.
    """

    def __init__(self, config, stage):
super().__init__()
self.config = config
self.stage = stage
if self.config.cls_token[self.stage]:
self.cls_token = nn.Parameter(torch.randn(1, 1, self.config.embed_dim[-1]))
self.embedding = CvtEmbeddings(
patch_size=config.patch_sizes[self.stage],
stride=config.patch_stride[self.stage],
num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
embed_dim=config.embed_dim[self.stage],
padding=config.patch_padding[self.stage],
dropout_rate=config.drop_rate[self.stage],
)
drop_path_rates = [x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage])]
self.layers = nn.Sequential(
*[
CvtLayer(
num_heads=config.num_heads[self.stage],
embed_dim=config.embed_dim[self.stage],
kernel_size=config.kernel_qkv[self.stage],
padding_q=config.padding_q[self.stage],
padding_kv=config.padding_kv[self.stage],
stride_kv=config.stride_kv[self.stage],
stride_q=config.stride_q[self.stage],
qkv_projection_method=config.qkv_projection_method[self.stage],
qkv_bias=config.qkv_bias[self.stage],
attention_drop_rate=config.attention_drop_rate[self.stage],
drop_rate=config.drop_rate[self.stage],
drop_path_rate=drop_path_rates[self.stage],
mlp_ratio=config.mlp_ratio[self.stage],
with_cls_token=config.cls_token[self.stage],
)
for _ in range(config.depth[self.stage])
]
)
def forward(self, hidden_state):
cls_token = None
hidden_state = self.embedding(hidden_state)
batch_size, num_channels, height, width = hidden_state.shape
hidden_state = hidden_state.view(batch_size, num_channels, height * width).permute(0, 2, 1)
if self.config.cls_token[self.stage]:
cls_token = self.cls_token.expand(batch_size, -1, -1)
hidden_state = torch.cat((cls_token, hidden_state), dim=1)
for layer in self.layers:
layer_outputs = layer(hidden_state, height, width)
hidden_state = layer_outputs
if self.config.cls_token[self.stage]:
cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
return hidden_state, cls_token
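# With the default configuration each stage shrinks the spatial resolution while widening the channels:
# roughly (1, 64, 56, 56) after stage 0, (1, 192, 28, 28) after stage 1 and (1, 384, 14, 14) after
# stage 2, and only the last stage carries a classification token. A minimal sketch for stage 0,
# assuming a 224x224 input:
config = CvtConfig()
hidden_state, cls_token = CvtStage(config, 0)(torch.randn(1, 3, 224, 224))
print(hidden_state.shape, cls_token)  # torch.Size([1, 64, 56, 56]) None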
"""
# CVTEncoder 类,用于实现一个可变形视觉 Transformer 编码器模型
def __init__(self, config):
# 初始化函数,继承自 nn.Module
super().__init__()
# 保存模型配置信息
self.config = config
# 初始化模型的多个编码阶段
self.stages = nn.ModuleList([])
# 根据配置中的深度信息,逐个创建并添加 CvtStage 实例到 stages 中
for stage_idx in range(len(config.depth)):
self.stages.append(CvtStage(config, stage_idx))
def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
# 初始化隐藏状态和额外输出
all_hidden_states = () if output_hidden_states else None
hidden_state = pixel_values
# 初始化分类标记
cls_token = None
# 遍历每个阶段的模块
for _, (stage_module) in enumerate(self.stages):
# 通过阶段模块处理隐藏状态,同时获取分类标记
hidden_state, cls_token = stage_module(hidden_state)
# 如果需要输出所有隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
# 如果不返回字典形式的输出,则返回非空的隐藏状态、分类标记和所有隐藏状态
if not return_dict:
return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
# 返回包含分类标记、最终隐藏状态和所有隐藏状态的 BaseModelOutputWithCLSToken 对象
return BaseModelOutputWithCLSToken(
last_hidden_state=hidden_state,
cls_token_value=cls_token,
hidden_states=all_hidden_states,
)
"""
CVT_PRETRAINED_MODEL_DOCSTRING = r"""
This model is an abstract class for handling weights initialization and providing a simple interface for downloading
and loading pretrained models.
Configuration:
config_class (`CvtConfig`): The configuration class holding all parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. To load weights, use the `~PreTrainedModel.from_pretrained` method.
Attributes:
base_model_prefix (`str`): Prefix applied to base model attributes.
main_input_name (`str`): Name of the main input attribute for the model, typically 'pixel_values'.
"""
CVT_INIT_WEIGHTS_DOCSTRING = r"""
Initialize the weights of the model module.
Args:
module (`nn.Module`): The PyTorch module for which weights need to be initialized.
"""
CVT_INIT_WEIGHTS_FUNCTION_DOCSTRING = r"""
Initialize the weights of the provided module.
Args:
module (`nn.Module`): The PyTorch module for which weights need to be initialized.
"""
CVT_INIT_WEIGHTS_LINEAR_CONV2D_DOCSTRING = r"""
Initialize the weights of a Linear or Conv2d module.
Args:
module (`nn.Module`): The PyTorch Linear or Conv2d module.
"""
CVT_INIT_WEIGHTS_LAYERNORM_DOCSTRING = r"""
Initialize the weights of a LayerNorm module.
Args:
module (`nn.Module`): The PyTorch LayerNorm module.
"""
CVT_INIT_WEIGHTS_CVTSTAGE_DOCSTRING = r"""
Initialize the weights of a CvtStage module.
Args:
module (`CvtStage`): The CvtStage module.
"""
CVT_START_DOCSTRING,
CVT_INPUTS_DOCSTRING,
CVT_PRETRAINED_MODEL_DOCSTRING,
CVT_INIT_WEIGHTS_DOCSTRING,
CVT_INIT_WEIGHTS_FUNCTION_DOCSTRING,
CVT_INIT_WEIGHTS_LINEAR_CONV2D_DOCSTRING,
CVT_INIT_WEIGHTS_LAYERNORM_DOCSTRING,
CVT_INIT_WEIGHTS_CVTSTAGE_DOCSTRING```
"""
# CVTEncoder 类,用于实现一个可变形视觉 Transformer 编码器模型
def __init__(self, config):
# 初始化函数,继承自 nn.Module
super().__init__()
# 保存模型配置信息
self.config = config
# 初始化模型的多个编码阶段
self.stages = nn.ModuleList([])
# 根据配置中的深度信息,逐个创建并添加 CvtStage 实例到 stages 中
for stage_idx in range(len(config.depth)):
self.stages.append(CvtStage(config, stage_idx))
def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
# 初始化隐藏状态和额外输出
all_hidden_states = () if output_hidden_states else None
hidden_state = pixel_values
# 初始化分类标记
cls_token = None
# 遍历每个阶段的模块
for _, (stage_module) in enumerate(self.stages):
# 通过阶段模块处理隐藏状态,同时获取分类标记
hidden_state, cls_token = stage_module(hidden_state)
# 如果需要输出所有隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
# 如果不返回字典形式的输出,则返回非空的隐藏状态、分类标记和所有隐藏状态
if not return_dict:
return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
# 返回包含分类标记、最终隐藏状态和所有隐藏状态的 BaseModelOutputWithCLSToken 对象
return BaseModelOutputWithCLSToken(
last_hidden_state=hidden_state,
cls_token_value=cls_token,
hidden_states=all_hidden_states,
)
"""
CVT_PRETRAINED_MODEL_DOCSTRING = r"""
This model is an abstract class for handling weights initialization and providing a simple interface for downloading
and loading pretrained models.
Configuration:
config_class (`CvtConfig`): The configuration class holding all parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. To load weights, use the `~PreTrainedModel.from_pretrained` method.
Attributes:
base_model_prefix (`str`): Prefix applied to base model attributes.
main_input_name (`str`): Name of the main input attribute for the model, typically 'pixel_values'.
"""
CVT_INIT_WEIGHTS_DOCSTRING = r"""
Initialize the weights of the model module.
Args:
module (`nn.Module`): The PyTorch module for which weights need to be initialized.
"""
CVT_INIT_WEIGHTS_FUNCTION_DOCSTRING = r"""
Initialize the weights of the provided module.
Args:
module (`nn.Module`): The PyTorch module for which weights need to be initialized.
"""
CVT_INIT_WEIGHTS_LINEAR_CONV2D_DOCSTRING = r"""
Initialize the weights of a Linear or Conv2d module.
Args:
module (`nn.Module`): The PyTorch Linear or Conv2d module.
"""
CVT_INIT_WEIGHTS_LAYERNORM_DOCSTRING = r"""
Initialize the weights of a LayerNorm module.
Args:
module (`nn.Module`): The PyTorch LayerNorm module.
"""
CVT_INIT_WEIGHTS_CVTSTAGE_DOCSTRING = r"""
Initialize the weights of a CvtStage module.
Args:
module (`CvtStage`): The CvtStage module.
"""
CVT_START_DOCSTRING,
CVT_INPUTS_DOCSTRING,
CVT_PRETRAINED_MODEL_DOCSTRING,
CVT_INIT_WEIGHTS_DOCSTRING,
CVT_INIT_WEIGHTS_FUNCTION_DOCSTRING,
CVT_INIT_WEIGHTS_LINEAR_CONV2D_DOCSTRING,
CVT_INIT_WEIGHTS_LAYERNORM_DOCSTRING,
CVT_INIT_WEIGHTS_CVTSTAGE_DOCSTRING
CVT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "Cvt Model transformer outputting raw hidden-states without any specific head on top.",
    CVT_START_DOCSTRING,
)
class CvtModel(CvtPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.encoder = CvtEncoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithCLSToken,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithCLSToken]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
encoder_outputs = self.encoder(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutputWithCLSToken(
last_hidden_state=sequence_output,
cls_token_value=encoder_outputs.cls_token_value,
hidden_states=encoder_outputs.hidden_states,
)
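# Minimal end-to-end usage sketch for the bare model, assuming network access to download the
# microsoft/cvt-13 checkpoint and the usual COCO test image; the output shape matches
# _EXPECTED_OUTPUT_SHAPE defined above.
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, CvtModel

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = CvtModel.from_pretrained("microsoft/cvt-13")
with torch.no_grad():
    outputs = model(**processor(images=image, return_tensors="pt"))
print(outputs.last_hidden_state.shape)  # torch.Size([1, 384, 14, 14])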
"""
Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
"""
class CvtForImageClassification(CvtPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.cvt = CvtModel(config, add_pooling_layer=False)
self.layernorm = nn.LayerNorm(config.embed_dim[-1])
self.classifier = (
nn.Linear(config.embed_dim[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.cvt(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
cls_token = outputs[1]
if self.config.cls_token[-1]:
sequence_output = self.layernorm(cls_token)
else:
batch_size, num_channels, height, width = sequence_output.shape
sequence_output = sequence_output.view(batch_size, num_channels, height * width).permute(0, 2, 1)
sequence_output = self.layernorm(sequence_output)
sequence_output_mean = sequence_output.mean(dim=1)
logits = self.classifier(sequence_output_mean)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.config.num_labels == 1:
self.config.problem_type = "regression"
elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.config.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
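# The classification head in use (again a sketch assuming network access): with the microsoft/cvt-13
# checkpoint the predicted ImageNet class for the COCO cats image is the documented "tabby, tabby cat".
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, CvtForImageClassification

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = CvtForImageClassification.from_pretrained("microsoft/cvt-13")
with torch.no_grad():
    logits = model(**processor(images=image, return_tensors="pt")).logits
print(model.config.id2label[logits.argmax(-1).item()])  # "tabby, tabby cat"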
.\models\cvt\modeling_tf_cvt.py
from __future__ import annotations

import collections.abc
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_cvt import CvtConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "CvtConfig"
TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/cvt-13",
"microsoft/cvt-13-384",
"microsoft/cvt-13-384-22k",
"microsoft/cvt-21",
"microsoft/cvt-21-384",
"microsoft/cvt-21-384-22k",
]
@dataclass
class TFBaseModelOutputWithCLSToken(ModelOutput):
"""
    Base class for model's outputs.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
            the initial embedding outputs.
"""
last_hidden_state: tf.Tensor = None
cls_token_value: tf.Tensor = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
class TFCvtDropPath(keras.layers.Layer):
"""在残差块的主路径上对每个样本进行辍学(随机深度)。
参考:(1) github.com:rwightman/pytorch-image-models
"""
def __init__(self, drop_prob: float, **kwargs):
super().__init__(**kwargs)
self.drop_prob = drop_prob
def call(self, x: tf.Tensor, training=None):
if self.drop_prob == 0.0 or not training:
return x
keep_prob = 1 - self.drop_prob
shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
random_tensor = keep_prob + tf.random.uniform(shape, 0, 1, dtype=self.compute_dtype)
random_tensor = tf.floor(random_tensor)
return (x / keep_prob) * random_tensor
class TFCvtEmbeddings(keras.layers.Layer):
"""Construct the Convolutional Token Embeddings."""
def __init__(
self,
config: CvtConfig,
patch_size: int,
num_channels: int,
embed_dim: int,
stride: int,
padding: int,
dropout_rate: float,
**kwargs,
):
super().__init__(**kwargs)
self.convolution_embeddings = TFCvtConvEmbeddings(
config,
patch_size=patch_size,
num_channels=num_channels,
embed_dim=embed_dim,
stride=stride,
padding=padding,
name="convolution_embeddings",
)
self.dropout = keras.layers.Dropout(dropout_rate)
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution_embeddings(pixel_values)
hidden_state = self.dropout(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_embeddings", None) is not None:
with tf.name_scope(self.convolution_embeddings.name):
self.convolution_embeddings.build(None)
class TFCvtConvEmbeddings(keras.layers.Layer):
"""Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
def __init__(
self,
config: CvtConfig,
patch_size: int,
num_channels: int,
embed_dim: int,
stride: int,
padding: int,
**kwargs,
):
super().__init__(**kwargs)
self.padding = keras.layers.ZeroPadding2D(padding=padding)
self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
self.projection = keras.layers.Conv2D(
filters=embed_dim,
kernel_size=patch_size,
strides=stride,
padding="valid",
data_format="channels_last",
kernel_initializer=get_initializer(config.initializer_range),
name="projection",
)
self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
self.num_channels = num_channels
self.embed_dim = embed_dim
def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
if isinstance(pixel_values, dict):
pixel_values = pixel_values["pixel_values"]
pixel_values = self.projection(self.padding(pixel_values))
batch_size, height, width, num_channels = shape_list(pixel_values)
hidden_size = height * width
pixel_values = tf.reshape(pixel_values, shape=(batch_size, hidden_size, num_channels))
pixel_values = self.normalization(pixel_values)
pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
return pixel_values
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, self.embed_dim])
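# Unlike the PyTorch CvtConvEmbeddings (NCHW), the TF layer works in channels_last (NHWC) layout, so
# the stage-0 output for a 224x224 input is (1, 56, 56, 64) rather than (1, 64, 56, 56). A small
# hypothetical shape check:
config = CvtConfig()
embeddings = TFCvtConvEmbeddings(config, patch_size=7, num_channels=3, embed_dim=64, stride=4, padding=2)
print(embeddings(tf.random.normal((1, 224, 224, 3))).shape)  # (1, 56, 56, 64)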
class TFCvtSelfAttentionConvProjection(keras.layers.Layer):
"""Convolutional projection layer."""
def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs):
super().__init__(**kwargs)
self.padding = keras.layers.ZeroPadding2D(padding=padding)
self.convolution = keras.layers.Conv2D(
filters=embed_dim,
kernel_size=kernel_size,
kernel_initializer=get_initializer(config.initializer_range),
padding="valid",
strides=stride,
use_bias=False,
name="convolution",
groups=embed_dim,
)
self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution(self.padding(hidden_state))
hidden_state = self.normalization(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.embed_dim])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.embed_dim])
class TFCvtSelfAttentionLinearProjection(keras.layers.Layer):
"""Linear projection layer used to flatten tokens into 1D."""
def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
batch_size, height, width, num_channels = shape_list(hidden_state)
hidden_size = height * width
hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
return hidden_state
class TFCvtSelfAttentionProjection(keras.layers.Layer):
"""Convolutional Projection for Attention."""
def __init__(
self,
config: CvtConfig,
embed_dim: int,
kernel_size: int,
stride: int,
padding: int,
projection_method: str = "dw_bn",
**kwargs,
):
super().__init__(**kwargs)
if projection_method == "dw_bn":
self.convolution_projection = TFCvtSelfAttentionConvProjection(
config, embed_dim, kernel_size, stride, padding, name="convolution_projection"
)
self.linear_projection = TFCvtSelfAttentionLinearProjection()
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution_projection(hidden_state, training=training)
hidden_state = self.linear_projection(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution_projection", None) is not None:
with tf.name_scope(self.convolution_projection.name):
self.convolution_projection.build(None)
class TFCvtSelfAttention(keras.layers.Layer):
"""
Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for
query, key, and value embeddings.
"""
def __init__(
self,
config: CvtConfig,
num_heads: int,
embed_dim: int,
kernel_size: int,
stride_q: int,
stride_kv: int,
padding_q: int,
padding_kv: int,
qkv_projection_method: str,
qkv_bias: bool,
attention_drop_rate: float,
with_cls_token: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.scale = embed_dim**-0.5
self.with_cls_token = with_cls_token
self.embed_dim = embed_dim
self.num_heads = num_heads
self.convolution_projection_query = TFCvtSelfAttentionProjection(
config,
embed_dim,
kernel_size,
stride_q,
padding_q,
projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
name="convolution_projection_query",
)
self.convolution_projection_key = TFCvtSelfAttentionProjection(
config,
embed_dim,
kernel_size,
stride_kv,
padding_kv,
projection_method=qkv_projection_method,
name="convolution_projection_key",
)
self.convolution_projection_value = TFCvtSelfAttentionProjection(
config,
embed_dim,
kernel_size,
stride_kv,
padding_kv,
projection_method=qkv_projection_method,
name="convolution_projection_value",
)
self.projection_query = keras.layers.Dense(
units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias,
bias_initializer="zeros",
name="projection_query",
)
self.projection_key = keras.layers.Dense(
units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias,
bias_initializer="zeros",
name="projection_key",
)
self.projection_value = keras.layers.Dense(
units=embed_dim,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=qkv_bias,
bias_initializer="zeros",
name="projection_value",
)
self.dropout = keras.layers.Dropout(attention_drop_rate)
def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor:
batch_size, hidden_size, _ = shape_list(hidden_state)
head_dim = self.embed_dim // self.num_heads
hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, self.num_heads, head_dim))
hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3))
return hidden_state
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
if self.with_cls_token:
cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
batch_size, hidden_size, num_channels = shape_list(hidden_state)
hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
key = self.convolution_projection_key(hidden_state, training=training)
query = self.convolution_projection_query(hidden_state, training=training)
value = self.convolution_projection_value(hidden_state, training=training)
if self.with_cls_token:
query = tf.concat((cls_token, query), axis=1)
key = tf.concat((cls_token, key), axis=1)
value = tf.concat((cls_token, value), axis=1)
head_dim = self.embed_dim // self.num_heads
query = self.rearrange_for_multi_head_attention(self.projection_query(query))
key = self.rearrange_for_multi_head_attention(self.projection_key(key))
value = self.rearrange_for_multi_head_attention(self.projection_value(value))
attention_score = tf.matmul(query, key, transpose_b=True) * self.scale
attention_probs = stable_softmax(logits=attention_score, axis=-1)
attention_probs = self.dropout(attention_probs, training=training)
context = tf.matmul(attention_probs, value)
_, _, hidden_size, _ = shape_list(context)
context = tf.transpose(context, perm=(0, 2, 1, 3))
context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
return context
    def build(self, input_shape=None):
        if self.built:
return
self.built = True
if getattr(self, "convolution_projection_query", None) is not None:
with tf.name_scope(self.convolution_projection_query.name):
self.convolution_projection_query.build(None)
if getattr(self, "convolution_projection_key", None) is not None:
with tf.name_scope(self.convolution_projection_key.name):
self.convolution_projection_key.build(None)
if getattr(self, "convolution_projection_value", None) is not None:
with tf.name_scope(self.convolution_projection_value.name):
self.convolution_projection_value.build(None)
if getattr(self, "projection_query", None) is not None:
with tf.name_scope(self.projection_query.name):
self.projection_query.build([None, None, self.embed_dim])
if getattr(self, "projection_key", None) is not None:
with tf.name_scope(self.projection_key.name):
self.projection_key.build([None, None, self.embed_dim])
if getattr(self, "projection_value", None) is not None:
with tf.name_scope(self.projection_value.name):
self.projection_value.build([None, None, self.embed_dim])
class TFCvtSelfOutput(keras.layers.Layer):
"""Output of the Attention layer ."""
def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = keras.layers.Dropout(drop_rate)
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.dense(inputs=hidden_state)
hidden_state = self.dropout(inputs=hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.embed_dim])
class TFCvtAttention(keras.layers.Layer):
"""Attention layer. First chunk of the convolutional transformer block."""
def __init__(
self,
config: CvtConfig,
num_heads: int,
embed_dim: int,
kernel_size: int,
stride_q: int,
stride_kv: int,
padding_q: int,
padding_kv: int,
qkv_projection_method: str,
qkv_bias: bool,
attention_drop_rate: float,
drop_rate: float,
with_cls_token: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self.attention = TFCvtSelfAttention(
config,
num_heads,
embed_dim,
kernel_size,
stride_q,
stride_kv,
padding_q,
padding_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
with_cls_token,
name="attention",
)
self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output")
def prune_heads(self, heads):
raise NotImplementedError
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False):
self_output = self.attention(hidden_state, height, width, training=training)
attention_output = self.dense_output(self_output, training=training)
return attention_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFCvtIntermediate(keras.layers.Layer):
"""Intermediate dense layer. Second chunk of the convolutional transformer block."""
def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=int(embed_dim * mlp_ratio),
kernel_initializer=get_initializer(config.initializer_range),
activation="gelu",
name="dense",
)
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
hidden_state = self.dense(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.embed_dim])
class TFCvtOutput(keras.layers.Layer):
    """
    Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
    """

    def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: float, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.dropout = keras.layers.Dropout(drop_rate)
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio

    def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_state = self.dense(inputs=hidden_state)
        hidden_state = self.dropout(inputs=hidden_state, training=training)
        hidden_state = hidden_state + input_tensor
        return hidden_state

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])


class TFCvtLayer(keras.layers.Layer):
    """
    Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps).
    It consists of 3 chunks: an attention layer, an intermediate dense layer and an output layer.
    """

    def __init__(
        self,
        config: CvtConfig,
        num_heads: int,
        embed_dim: int,
        kernel_size: int,
        stride_q: int,
        stride_kv: int,
        padding_q: int,
        padding_kv: int,
        qkv_projection_method: str,
        qkv_bias: bool,
        attention_drop_rate: float,
        drop_rate: float,
        mlp_ratio: float,
        drop_path_rate: float,
        with_cls_token: bool = True,
        **kwargs,
    ):
super().__init__(**kwargs)
self.attention = TFCvtAttention(
config,
num_heads,
embed_dim,
kernel_size,
stride_q,
stride_kv,
padding_q,
padding_kv,
qkv_projection_method,
qkv_bias,
attention_drop_rate,
drop_rate,
with_cls_token,
name="attention",
)
self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate")
self.dense_output = TFCvtOutput(config, embed_dim, mlp_ratio, drop_rate, name="output")
self.drop_path = (
TFCvtDropPath(drop_path_rate, name="drop_path")
if drop_path_rate > 0.0
else keras.layers.Activation("linear", name="drop_path")
)
self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
self.embed_dim = embed_dim
def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training)
attention_output = self.drop_path(attention_output, training=training)
hidden_state = attention_output + hidden_state
layer_output = self.layernorm_after(hidden_state)
layer_output = self.intermediate(layer_output)
layer_output = self.dense_output(layer_output, hidden_state)
layer_output = self.drop_path(layer_output, training=training)
return layer_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.embed_dim])
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.embed_dim])
"""
Cvt stage (encoder block). Each stage has 2 parts :
- (1) A Convolutional Token Embedding layer
- (2) A Convolutional Transformer Block (layer).
The classification token is added only in the last stage.
Args:
config ([`CvtConfig`]): Model configuration class.
stage (`int`): Stage number.
"""
def __init__(self, config: CvtConfig, stage: int, **kwargs):
super().__init__(**kwargs)
self.config = config
self.stage = stage
if self.config.cls_token[self.stage]:
self.cls_token = self.add_weight(
shape=(1, 1, self.config.embed_dim[-1]),
initializer=get_initializer(self.config.initializer_range),
trainable=True,
name="cvt.encoder.stages.2.cls_token",
)
self.embedding = TFCvtEmbeddings(
self.config,
patch_size=config.patch_sizes[self.stage],
num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
stride=config.patch_stride[self.stage],
embed_dim=config.embed_dim[self.stage],
padding=config.patch_padding[self.stage],
dropout_rate=config.drop_rate[self.stage],
name="embedding",
)
drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[stage])
drop_path_rates = [x.numpy().item() for x in drop_path_rates]
self.layers = [
TFCvtLayer(
config,
num_heads=config.num_heads[self.stage],
embed_dim=config.embed_dim[self.stage],
kernel_size=config.kernel_qkv[self.stage],
stride_q=config.stride_q[self.stage],
stride_kv=config.stride_kv[self.stage],
padding_q=config.padding_q[self.stage],
padding_kv=config.padding_kv[self.stage],
qkv_projection_method=config.qkv_projection_method[self.stage],
qkv_bias=config.qkv_bias[self.stage],
attention_drop_rate=config.attention_drop_rate[self.stage],
drop_rate=config.drop_rate[self.stage],
mlp_ratio=config.mlp_ratio[self.stage],
drop_path_rate=drop_path_rates[self.stage],
with_cls_token=config.cls_token[self.stage],
name=f"layers.{j}",
)
for j in range(config.depth[self.stage])
]
def call(self, hidden_state: tf.Tensor, training: bool = False):
cls_token = None
hidden_state = self.embedding(hidden_state, training)
batch_size, height, width, num_channels = shape_list(hidden_state)
hidden_size = height * width
hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
if self.config.cls_token[self.stage]:
cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
hidden_state = tf.concat((cls_token, hidden_state), axis=1)
for layer in self.layers:
layer_outputs = layer(hidden_state, height, width, training=training)
hidden_state = layer_outputs
if self.config.cls_token[self.stage]:
cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
return hidden_state, cls_token
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding", None) is not None:
with tf.name_scope(self.embedding.name):
self.embedding.build(None)
if getattr(self, "layers", None) is not None:
for layer in self.layers:
with tf.name_scope(layer.name):
layer.build(None)
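上面 TFCvtStage 中的 `drop_path_rates` 由 `tf.linspace` 在 0 到 `config.drop_path_rate[stage]` 之间等距取 `depth` 个值;注意随后构造 `TFCvtLayer` 时索引的是 `drop_path_rates[self.stage]`。下面是一个最小示意(数值为假设的示例,并非真实配置),仅演示这段取值逻辑:
```
import tensorflow as tf

depth, stage_drop_path_rate, stage = 10, 0.1, 2  # 假设:第 3 个 stage,depth=10,drop_path_rate=0.1
drop_path_rates = [x.numpy().item() for x in tf.linspace(0.0, stage_drop_path_rate, depth)]
print([round(r, 3) for r in drop_path_rates])  # 从 0.0 线性递增到 0.1 的 10 个值
print(drop_path_rates[stage])                  # 按上面代码的写法,本 stage 的每一层都取这个值
```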
class TFCvtEncoder(keras.layers.Layer):
"""
Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
(depth) being 1, 2 and 10.
Args:
config ([`CvtConfig`]): Model configuration class.
"""
config_class = CvtConfig
def __init__(self, config: CvtConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.stages = [
TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(len(config.depth))
]
def call(
self,
pixel_values: TFModelInputType,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
all_hidden_states = () if output_hidden_states else None
hidden_state = pixel_values
hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1))
cls_token = None
for _, (stage_module) in enumerate(self.stages):
hidden_state, cls_token = stage_module(hidden_state, training=training)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2))
if output_hidden_states:
all_hidden_states = tuple([tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in all_hidden_states])
if not return_dict:
return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
return TFBaseModelOutputWithCLSToken(
last_hidden_state=hidden_state,
cls_token_value=cls_token,
hidden_states=all_hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stages", None) is not None:
for layer in self.stages:
with tf.name_scope(layer.name):
layer.build(None)
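TFCvtEncoder 依次调用三个 stage:每个 stage 先用卷积 token embedding 按 `patch_stride` 下采样,再经过若干个卷积 Transformer block。下面以 CvT-13 的默认配置为例做一个粗略估算(假设 `patch_stride=[4, 2, 2]`、输入边长为 224,这里忽略 padding/kernel 的细节),各 stage 输出特征图的边长如下:
```
input_size, strides = 224, [4, 2, 2]  # 假设的默认输入尺寸与各 stage 的 patch_stride
sizes = []
for s in strides:
    input_size //= s
    sizes.append(input_size)
print(sizes)  # [56, 28, 14]:空间分辨率逐级减小,而通道数(embed_dim)逐级增大
```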
@keras_serializable
class TFCvtMainLayer(keras.layers.Layer):
"""Construct the Cvt model."""
config_class = CvtConfig
def __init__(self, config: CvtConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.encoder = TFCvtEncoder(config, name="encoder")
@unpack_inputs
def call(
self,
pixel_values: TFModelInputType | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
encoder_outputs = self.encoder(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return TFBaseModelOutputWithCLSToken(
last_hidden_state=sequence_output,
cls_token_value=encoder_outputs.cls_token_value,
hidden_states=encoder_outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
class TFCvtPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = CvtConfig
base_model_prefix = "cvt"
main_input_name = "pixel_values"
TFCVT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments.
This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
tensors in the first argument of the model call function: `model(inputs)`.
</Tip>
Args:
config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
TFCVT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@add_start_docstrings(
"The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
TFCVT_START_DOCSTRING,
)
class TFCvtModel(TFCvtPreTrainedModel):
def __init__(self, config: CvtConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.cvt = TFCvtMainLayer(config, name="cvt")
@unpack_inputs
@add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
r"""
模型的前向传播方法,接收输入像素值和一些可选参数,返回模型输出。
Args:
pixel_values (tf.Tensor | None): 输入像素值的张量,可以为 None。
output_hidden_states (Optional[bool]): 是否输出隐藏状态,默认为 None。
return_dict (Optional[bool]): 是否返回字典格式的输出,默认为 None。
training (Optional[bool]): 是否在训练模式下,默认为 False。
Returns:
Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]: 根据 return_dict 参数返回不同类型的输出。
Examples:
```
>>> from transformers import AutoImageProcessor, TFCvtModel
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
>>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")
>>> inputs = image_processor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```"""
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
outputs = self.cvt(
pixel_values=pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
if not return_dict:
return (outputs[0],) + outputs[1:]
return TFBaseModelOutputWithCLSToken(
last_hidden_state=outputs.last_hidden_state,
cls_token_value=outputs.cls_token_value,
hidden_states=outputs.hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "cvt", None) is not None:
with tf.name_scope(self.cvt.name):
self.cvt.build(None)
"""
Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
"""
TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: CvtConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.cvt = TFCvtMainLayer(config, name="cvt")
self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=True,
bias_initializer="zeros",
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]:
outputs = self.cvt(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
cls_token = outputs[1]
if self.config.cls_token[-1]:
sequence_output = self.layernorm(cls_token)
else:
batch_size, num_channels, height, width = shape_list(sequence_output)
sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width))
sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1))
sequence_output = self.layernorm(sequence_output)
sequence_output_mean = tf.reduce_mean(sequence_output, axis=1)
logits = self.classifier(sequence_output_mean)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "cvt", None) is not None:
with tf.name_scope(self.cvt.name):
self.cvt.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.embed_dim[-1]])
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.embed_dim[-1]])
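与上面 `TFCvtModel` 文档字符串中的示例类似,下面给出一个示意性的图像分类用法,假设使用在 ImageNet-1k 上微调过的 `microsoft/cvt-13` 检查点;按前面 `call` 的逻辑,模型取最后一个 stage 的 cls token,经 LayerNorm 后送入线性分类头:
```
import tensorflow as tf
import requests
from PIL import Image
from transformers import AutoImageProcessor, TFCvtForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")

inputs = image_processor(images=image, return_tensors="tf")
logits = model(**inputs).logits                # 形状为 (batch_size, num_labels)
predicted_label = int(tf.math.argmax(logits, axis=-1))
print(model.config.id2label[predicted_label])  # 预期输出某个 ImageNet 类别名
```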
.\models\cvt\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {"configuration_cvt": ["CVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CvtConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_cvt"] = [
"CVT_PRETRAINED_MODEL_ARCHIVE_LIST",
"CvtForImageClassification",
"CvtModel",
"CvtPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_cvt"] = [
"TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFCvtForImageClassification",
"TFCvtModel",
"TFCvtPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_cvt import CVT_PRETRAINED_CONFIG_ARCHIVE_MAP, CvtConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_cvt import (
CVT_PRETRAINED_MODEL_ARCHIVE_LIST,
CvtForImageClassification,
CvtModel,
CvtPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_cvt import (
TF_CVT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCvtForImageClassification,
TFCvtModel,
TFCvtPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\data2vec\configuration_data2vec_audio.py
""" Data2VecText configuration"""
import math
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/data2vec-base-960h": "https://huggingface.co/facebook/data2vec-audio-base-960h/resolve/main/config.json",
}
class Data2VecAudioConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Data2VecAudioModel`]. It is used to instantiate
an Data2VecAudio model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Data2VecAudio
[facebook/data2vec-audio-base-960h](https://huggingface.co/facebook/data2vec-audio-base-960h) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import Data2VecAudioConfig, Data2VecAudioModel
>>> # Initializing a Data2VecAudio facebook/data2vec-audio-base-960h style configuration
>>> configuration = Data2VecAudioConfig()
>>> # Initializing a model (with random weights) from the facebook/data2vec-audio-base-960h style configuration
>>> model = Data2VecAudioModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "data2vec-audio"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
feat_extract_activation="gelu",
conv_dim=(512, 512, 512, 512, 512, 512, 512),
conv_stride=(5, 2, 2, 2, 2, 2, 2),
conv_kernel=(10, 3, 3, 3, 3, 2, 2),
conv_bias=False,
num_conv_pos_embedding_groups=16,
conv_pos_kernel_size=19,
num_conv_pos_embeddings=5,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="sum",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
tdnn_dim=(512, 512, 512, 512, 1500),
tdnn_kernel=(5, 3, 3, 1, 1),
tdnn_dilation=(1, 2, 3, 1, 1),
xvector_output_dim=512,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
add_adapter=False,
adapter_kernel_size=3,
adapter_stride=2,
num_adapter_layers=3,
output_hidden_size=None,
**kwargs,
):
@property
def inputs_to_logits_ratio(self):
return math.prod(self.conv_stride)
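`inputs_to_logits_ratio` 返回所有卷积步幅的乘积,表示多少个原始音频采样点对应一帧编码器输出(也即一帧 CTC logits)。按上面默认的 `conv_stride=(5, 2, 2, 2, 2, 2, 2)` 做一个简单的数值演算(假设音频采样率为 16kHz):
```
import math

conv_stride = (5, 2, 2, 2, 2, 2, 2)  # 默认配置中的卷积步幅
ratio = math.prod(conv_stride)
print(ratio)                          # 320:每 320 个采样点产生 1 帧输出
print(ratio / 16000 * 1000)           # 20.0:在 16kHz 采样率下约等于每帧 20 毫秒
```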
.\models\data2vec\configuration_data2vec_text.py
""" Data2VecText configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/data2vec-text-base": "https://huggingface.co/data2vec/resolve/main/config.json",
}
class Data2VecTextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Data2VecTextModel`] and [`Data2VecTextModel`]. It
is used to instantiate a Data2VecText model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Data2VecText
[facebook/data2vec-text-base](https://huggingface.co/facebook/data2vec-text-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "data2vec-text"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
class Data2VecTextOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
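`Data2VecTextOnnxConfig.inputs` 根据任务类型给出导出 ONNX 时各输入张量的动态维度。下面是一个最小示意(输出内容按上面的实现推断):
```
from transformers import Data2VecTextConfig
from transformers.models.data2vec.configuration_data2vec_text import Data2VecTextOnnxConfig

onnx_config = Data2VecTextOnnxConfig(Data2VecTextConfig())
print(onnx_config.inputs)
# 预期形如:OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#                        ('attention_mask', {0: 'batch', 1: 'sequence'})])
```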
.\models\data2vec\configuration_data2vec_vision.py
""" Data2VecVision 模型配置 """
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/data2vec-vision-base-ft": (
"https://huggingface.co/facebook/data2vec-vision-base-ft/resolve/main/config.json"
),
}
class Data2VecVisionConfig(PretrainedConfig):
r"""
这是用于存储 [`Data2VecVisionModel`] 配置的配置类。根据指定的参数实例化 Data2VecVision 模型,定义模型架构。
使用默认值实例化配置将产生类似于 Data2VecVision [facebook/data2vec-vision-base](https://huggingface.co/facebook/data2vec-vision-base) 架构的配置。
示例:
```
>>> from transformers import Data2VecVisionConfig, Data2VecVisionModel
>>> # 初始化一个 Data2VecVision data2vec_vision-base-patch16-224-in22k 风格的配置
>>> configuration = Data2VecVisionConfig()
>>> # 从上述配置初始化一个(带有随机权重)模型
>>> model = Data2VecVisionModel(configuration)
>>> # 访问模型配置
>>> configuration = model.config
```
"""
model_type = "data2vec-vision"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-12,
image_size=224,
patch_size=16,
num_channels=3,
use_mask_token=False,
use_absolute_position_embeddings=False,
use_relative_position_bias=False,
use_shared_relative_position_bias=False,
layer_scale_init_value=0.1,
drop_path_rate=0.1,
use_mean_pooling=True,
out_indices=[3, 5, 7, 11],
pool_scales=[1, 2, 3, 6],
use_auxiliary_head=True,
auxiliary_loss_weight=0.4,
auxiliary_channels=256,
auxiliary_num_convs=1,
auxiliary_concat_input=False,
semantic_loss_ignore_index=255,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.use_mask_token = use_mask_token
self.use_absolute_position_embeddings = use_absolute_position_embeddings
self.use_relative_position_bias = use_relative_position_bias
self.use_shared_relative_position_bias = use_shared_relative_position_bias
self.layer_scale_init_value = layer_scale_init_value
self.drop_path_rate = drop_path_rate
self.use_mean_pooling = use_mean_pooling
self.out_indices = out_indices
self.pool_scales = pool_scales
self.use_auxiliary_head = use_auxiliary_head
self.auxiliary_loss_weight = auxiliary_loss_weight
self.auxiliary_channels = auxiliary_channels
self.auxiliary_num_convs = auxiliary_num_convs
self.auxiliary_concat_input = auxiliary_concat_input
self.semantic_loss_ignore_index = semantic_loss_ignore_index
class Data2VecVisionOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
.\models\data2vec\convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py
"""Convert Wav2Vec2 checkpoint."""
import argparse
import os
from functools import reduce
import fairseq
import torch
from datasets import load_dataset
from transformers import Wav2Vec2Processor, logging
from transformers.models.data2vec.configuration_data2vec_audio import Data2VecAudioConfig
from transformers.models.data2vec.data2vec_audio import Data2VecAudioModel as Dummy
from transformers.models.data2vec.modeling_data2vec_audio import Data2VecAudioForCTC, Data2VecAudioModel
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection.projection",
"models.0.layer_norm": "feature_projection.layer_norm",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "feature_projection.layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
TOP_LEVEL_KEYS = [
"lm_head",
]
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
if hf_shape != value.shape:
raise ValueError(
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_headless):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
if not is_headless:
feature_extractor = hf_model.data2vec_audio.feature_extractor
pos_conv_embedding = hf_model.data2vec_audio.encoder.pos_conv_embed
else:
feature_extractor = hf_model.feature_extractor
pos_conv_embedding = hf_model.encoder.pos_conv_embed
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
)
is_used = True
elif "pos_conv" in name:
load_pos_conv_layer(
name,
value,
pos_conv_embedding,
unused_weights,
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
if not is_headless:
mapped_key = "data2vec_audio." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "bias" in name:
weight_type = "bias"
elif "weight" in name:
weight_type = "weight"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
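上面 `recursively_load_weights` 的核心是 `MAPPING` 中带 `*` 通配符的键名替换:先取 fairseq 权重名中位于映射键之前、倒数第二段的数字作为层号,再替换 `*`。下面用一个假设的权重名演示这段字符串处理逻辑:
```
name = "w2v_encoder.w2v_model.encoder.layers.3.self_attn.k_proj.weight"  # 假设的 fairseq 权重名
key, mapped_key = "self_attn.k_proj", "data2vec_audio.encoder.layers.*.attention.k_proj"

layer_index = name.split(key)[0].split(".")[-2]  # -> "3"
print(mapped_key.replace("*", layer_index))      # data2vec_audio.encoder.layers.3.attention.k_proj
```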
def access_by_string(module, path):
names = path.split(".")
return reduce(getattr, names, module)
def set_weights(full_name, module, fsq_value, hf_weight_path):
hf_weight = access_by_string(module, hf_weight_path)
hf_value = hf_weight.data
if fsq_value.shape != hf_value.shape:
raise ValueError(f"{full_name} has size {fsq_value.shape}, but {hf_value.shape} was found.")
hf_weight.data = fsq_value
logger.info(f"{full_name} was correctly initialized from {hf_weight_path}.")
def load_conv_layer(full_name, value, feature_extractor, unused_weights):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
weight_type = name.split(".")[-1]
if type_id == 0:
layer_type = "conv"
elif type_id == 2:
layer_type = "layer_norm"
else:
unused_weights.append(full_name)
return
set_weights(full_name, feature_extractor, value, f"conv_layers.{layer_id}.{layer_type}.{weight_type}")
def load_pos_conv_layer(full_name, value, pos_conv_embeddings, unused_weights):
name = full_name.split("pos_conv.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
weight_type = name.split(".")[-1]
if type_id != 0:
unused_weights.append(full_name)
return
else:
layer_type = "conv"
set_weights(full_name, pos_conv_embeddings, value, f"layers.{layer_id}.{layer_type}.{weight_type}")
@torch.no_grad()
def convert_wav2vec2_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
"""
将模型的权重复制/粘贴/调整为transformers设计。
"""
if config_path is not None:
config = Data2VecAudioConfig.from_pretrained(config_path)
else:
config = Data2VecAudioConfig()
if not is_finetuned:
hf_wav2vec = Data2VecAudioModel(config)
data2vec_checkpoint_dir = os.path.dirname(checkpoint_path)
state_dict = torch.load(checkpoint_path)
state_dict["model"]["final_proj.weight"] = state_dict["model"].pop("final_proj.0.weight")
state_dict["model"]["final_proj.bias"] = state_dict["model"].pop("final_proj.0.bias")
converted_ckpt = os.path.join(data2vec_checkpoint_dir, "converted.pt")
torch.save(state_dict, converted_ckpt)
else:
hf_wav2vec = Data2VecAudioForCTC(config)
converted_ckpt = checkpoint_path
def load_data2vec(path):
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([path])
return model[0].eval()
model = load_data2vec(converted_ckpt)
recursively_load_weights(model, hf_wav2vec, not is_finetuned)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
input_audio = [x["array"] for x in ds[:4]["audio"]]
inputs = processor(input_audio, return_tensors="pt", padding=True)
input_values = inputs.input_values
attention_mask = inputs.attention_mask
hf_wav2vec.eval()
model.eval()
if is_finetuned:
their_output = model(source=input_values, padding_mask=(1 - attention_mask), mask=False, features_only=True)["encoder_out"].transpose(0, 1)
our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["logits"]
pred_ids = torch.argmax(our_output, dim=-1)
output_string = processor.batch_decode(pred_ids)
print(f"Expected Output: {ds[:4]['text']}, Pred: {output_string}")
else:
their_output = model(
source=input_values,
padding_mask=(1 - attention_mask),
mask=False,
features_only=True
)["layer_results"][-1][0].transpose(0, 1)
our_output = hf_wav2vec(input_values, attention_mask=attention_mask)["last_hidden_state"]
print(our_output.shape, their_output.shape)
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
if is_finetuned:
processor.save_pretrained(pytorch_dump_folder_path)
else:
processor.feature_extractor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_wav2vec2_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
)
.\models\data2vec\convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py
import argparse
import os
import pathlib
import fairseq
import torch
from fairseq.modules import TransformerSentenceEncoderLayer
from packaging import version
from transformers import (
Data2VecTextConfig,
Data2VecTextForMaskedLM,
Data2VecTextForSequenceClassification,
Data2VecTextModel,
)
from transformers.models.bert.modeling_bert import (
BertIntermediate,
BertLayer,
BertOutput,
BertSelfAttention,
BertSelfOutput,
)
from transformers.utils import logging
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
raise Exception("requires fairseq >= 0.9.0")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "Hello world! cécé herlolip"
def convert_data2vec_checkpoint_to_pytorch(
data2vec_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
"""
将data2vec的检查点权重复制/粘贴/调整到我们的BERT结构中。
"""
data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path)
data2vec = Data2VecTextModel.from_pretrained(
data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name
)
data2vec.eval()
data2vec_model = data2vec.models[0]
data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder
config = Data2VecTextConfig(
vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings,
hidden_size=data2vec_model.args.encoder_embed_dim,
num_hidden_layers=data2vec_model.args.encoder_layers,
num_attention_heads=data2vec_model.args.encoder_attention_heads,
intermediate_size=data2vec_model.args.encoder_ffn_embed_dim,
max_position_embeddings=514,
type_vocab_size=1,
layer_norm_eps=1e-5,
)
if classification_head:
config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0]
print("Our BERT config:", config)
model = Data2VecTextForSequenceClassification(config) if classification_head else Data2VecTextForMaskedLM(config)
model.eval()
model.data2vec_text.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight
model.data2vec_text.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight
model.data2vec_text.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
model.data2vec_text.embeddings.token_type_embeddings.weight
)
model.data2vec_text.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight
model.data2vec_text.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias
if classification_head:
model.classifier.dense.weight = data2vec.model.classification_heads["mnli"].dense.weight
model.classifier.dense.bias = data2vec.model.classification_heads["mnli"].dense.bias
model.classifier.out_proj.weight = data2vec.model.classification_heads["mnli"].out_proj.weight
model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias
else:
model.lm_head.dense.weight = data2vec_model.encoder.lm_head.dense.weight
model.lm_head.dense.bias = data2vec_model.encoder.lm_head.dense.bias
model.lm_head.layer_norm.weight = data2vec_model.encoder.lm_head.layer_norm.weight
model.lm_head.layer_norm.bias = data2vec_model.encoder.lm_head.layer_norm.bias
model.lm_head.decoder.weight = data2vec_model.encoder.lm_head.weight
model.lm_head.decoder.bias = data2vec_model.encoder.lm_head.bias
input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0)
our_output = model(input_ids)[0]
if classification_head:
their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids))
else:
their_output = data2vec_model(input_ids)[0]
print(our_output.shape, their_output.shape)
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--classification_head", action="store_true", help="Whether to convert a final classification head."
)
args = parser.parse_args()
convert_data2vec_checkpoint_to_pytorch(
args.checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
)
这段代码是一个典型的命令行工具的入口点,它使用 argparse 模块解析命令行参数,并调用一个函数来处理这些参数指定的任务。
.\models\data2vec\convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from timm.models import create_model
from transformers import (
BeitImageProcessor,
Data2VecVisionConfig,
Data2VecVisionForImageClassification,
Data2VecVisionModel,
)
def create_rename_keys(config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec."):
prefix = "backbone." if is_semantic else ""
rename_keys = []
for i in range(config.num_hidden_layers):
rename_keys.append(
(f"{prefix}blocks.{i}.norm1.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_before.weight")
)
rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_before.bias"))
rename_keys.append(
(f"{prefix}blocks.{i}.attn.proj.weight", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.weight")
)
rename_keys.append(
(f"{prefix}blocks.{i}.attn.proj.bias", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.bias")
)
rename_keys.append(
(f"{prefix}blocks.{i}.norm2.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_after.weight")
)
rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_after.bias"))
rename_keys.append(
(f"{prefix}blocks.{i}.mlp.fc1.weight", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.weight")
)
rename_keys.append(
(f"{prefix}blocks.{i}.mlp.fc1.bias", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.bias")
)
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"{hf_prefix}encoder.layer.{i}.output.dense.weight"))
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"{hf_prefix}encoder.layer.{i}.output.dense.bias"))
rename_keys.extend(
[
(f"{prefix}cls_token", f"{hf_prefix}embeddings.cls_token"),
(f"{prefix}patch_embed.proj.weight", f"{hf_prefix}embeddings.patch_embeddings.projection.weight"),
(f"{prefix}patch_embed.proj.bias", f"{hf_prefix}embeddings.patch_embeddings.projection.bias"),
]
)
if has_lm_head:
rename_keys.extend(
[
("mask_token", f"{hf_prefix}embeddings.mask_token"),
(
"rel_pos_bias.relative_position_bias_table",
f"{hf_prefix}encoder.relative_position_bias.relative_position_bias_table",
),
(
"rel_pos_bias.relative_position_index",
f"{hf_prefix}encoder.relative_position_bias.relative_position_index",
),
("norm.weight", "layernorm.weight"),
("norm.bias", "layernorm.bias"),
]
)
elif is_semantic:
rename_keys.extend(
[
("decode_head.conv_seg.weight", "decode_head.classifier.weight"),
("decode_head.conv_seg.bias", "decode_head.classifier.bias"),
("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"),
("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"),
]
)
else:
rename_keys.extend(
[
("fc_norm.weight", f"{hf_prefix}pooler.layernorm.weight"),
("fc_norm.bias", f"{hf_prefix}pooler.layernorm.bias"),
("head.weight", "classifier.weight"),
("head.bias", "classifier.bias"),
]
)
return rename_keys
def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec_vision."):
for i in range(config.num_hidden_layers):
prefix = "backbone." if is_semantic else ""
in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight")
q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias")
v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias")
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
: config.hidden_size, :
]
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.bias"] = q_bias
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
config.hidden_size : config.hidden_size * 2, :
]
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
-config.hidden_size :, :
]
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.bias"] = v_bias
gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1")
gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2")
state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_1"] = gamma_1
state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_2"] = gamma_2
if not has_lm_head:
table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table")
index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index")
state_dict[
f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table"
] = table
state_dict[
f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index"
] = index
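`read_in_q_k_v` 把原始检查点里合并存放的 `qkv.weight`(形状为 `(3 * hidden_size, hidden_size)`)按行均分为 query/key/value 三份。下面用随机张量做一个最小示意(`hidden_size` 取示例数值):
```
import torch

hidden_size = 8                                         # 示例数值,真实模型为 768/1024
in_proj_weight = torch.randn(3 * hidden_size, hidden_size)

q_w = in_proj_weight[:hidden_size, :]                   # 前 1/3 行 -> query
k_w = in_proj_weight[hidden_size : hidden_size * 2, :]  # 中间 1/3 行 -> key
v_w = in_proj_weight[-hidden_size:, :]                  # 后 1/3 行 -> value
print(q_w.shape, k_w.shape, v_w.shape)                  # 均为 torch.Size([8, 8])
```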
def get_args():
parser = argparse.ArgumentParser(
"Convert Data2VecVision to HF for image classification and pretraining", add_help=False
)
parser.add_argument("--hf_checkpoint_name", type=str)
parser.add_argument("--input_size", default=224, type=int, help="images input size")
parser.add_argument("--beit_checkpoint", default="", help="beit checkpoint")
return parser.parse_args()
def load_beit_model(args, is_finetuned, is_large):
def load_state_dict(model, state_dict, prefix="", ignore_missing="relative_position_index"):
missing_keys = []
unexpected_keys = []
error_msgs = []
metadata = getattr(state_dict, "_metadata", None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=""):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + ".")
load(model, prefix=prefix)
warn_missing_keys = []
ignore_missing_keys = []
for key in missing_keys:
keep_flag = True
for ignore_key in ignore_missing.split("|"):
if ignore_key in key:
keep_flag = False
break
if keep_flag:
warn_missing_keys.append(key)
else:
ignore_missing_keys.append(key)
missing_keys = warn_missing_keys
if len(missing_keys) > 0:
print(
"Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys
)
)
if len(unexpected_keys) > 0:
print("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys))
if len(ignore_missing_keys) > 0:
print(
"Ignored weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, ignore_missing_keys
)
)
if len(error_msgs) > 0:
print("\n".join(error_msgs))
model_kwargs = {
"pretrained": False,
"use_shared_rel_pos_bias": True,
"use_abs_pos_emb": False,
"init_values": 0.1,
}
if is_finetuned:
model_kwargs.update(
{
"num_classes": 1000,
"use_mean_pooling": True,
"init_scale": 0.001,
"use_rel_pos_bias": True,
}
)
model = create_model(
"beit_large_patch16_224" if is_large else "beit_base_patch16_224",
**model_kwargs,
)
patch_size = model.patch_embed.patch_size
args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1])
checkpoint = torch.load(args.beit_checkpoint, map_location="cpu")
print(f"Load ckpt from {args.beit_checkpoint}")
checkpoint_model = None
for model_key in ("model", "module"):
if model_key in checkpoint:
checkpoint_model = checkpoint[model_key]
print(f"Load state_dict by model_key = {model_key}")
break
all_keys = list(checkpoint_model.keys())
for key in all_keys:
if "relative_position_index" in key:
checkpoint_model.pop(key)
if "relative_position_bias_table" in key:
rel_pos_bias = checkpoint_model[key]
src_num_pos, num_attn_heads = rel_pos_bias.size()
dst_num_pos, _ = model.state_dict()[key].size()
dst_patch_shape = model.patch_embed.patch_shape
if dst_patch_shape[0] != dst_patch_shape[1]:
raise NotImplementedError()
load_state_dict(model, checkpoint_model, prefix="")
return model
def main():
args = get_args()
is_finetuned = "ft1k" in args.hf_checkpoint_name
is_large = "large" in args.hf_checkpoint_name
if is_finetuned:
import modeling_finetune
else:
import modeling_cyclical
config = Data2VecVisionConfig()
if is_finetuned:
config.use_relative_position_bias = True
config.use_shared_relative_position_bias = False
config.use_mean_pooling = True
config.num_labels = 1000
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
else:
config.use_relative_position_bias = False
config.use_shared_relative_position_bias = True
config.use_mean_pooling = False
if is_large:
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_hidden_layers = 24
config.num_attention_heads = 16
orig_model = load_beit_model(args, is_finetuned, is_large)
orig_model.eval()
image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png")
encoding = image_processor(images=image, return_tensors="pt")
pixel_values = encoding["pixel_values"]
orig_args = (pixel_values,) if is_finetuned else (pixel_values, None)
with torch.no_grad():
orig_model_output = orig_model(*orig_args)
if is_finetuned:
hf_model = Data2VecVisionForImageClassification(config)
hf_model.eval()
has_lm_head = False
hf_prefix = "data2vec_vision."
else:
hf_model = Data2VecVisionModel(config)
hf_model.eval()
has_lm_head = True
hf_prefix = ""
rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
state_dict = orig_model.state_dict()
for src, dest in rename_keys:
val = state_dict.pop(src)
state_dict[dest] = val
read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False)
print("HF missing", missing_keys)
print("HF unexpected_keys", unexpected_keys)
with torch.no_grad():
hf_model_output = hf_model(pixel_values)
hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state
max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}")
success = torch.allclose(hf_output, orig_model_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
print(f"Saving to {args.hf_checkpoint_name}")
hf_model.save_pretrained(args.hf_checkpoint_name)
image_processor.save_pretrained(args.hf_checkpoint_name)
if __name__ == "__main__":
main()
.\models\data2vec\modeling_data2vec_audio.py
""" PyTorch Data2VecAudio model. """
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
BaseModelOutput,
CausalLMOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
Wav2Vec2BaseModelOutput,
XVectorOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_peft_available,
logging,
)
from .configuration_data2vec_audio import Data2VecAudioConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 2
_CONFIG_FOR_DOC = "Data2VecAudioConfig"
_CHECKPOINT_FOR_DOC = "facebook/data2vec-audio-base-960h"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]
_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
_CTC_EXPECTED_LOSS = 66.95
DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/data2vec-audio-base",
"facebook/data2vec-audio-base-10m",
"facebook/data2vec-audio-base-100h",
"facebook/data2vec-audio-base-960h",
]
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
计算给定形状的随机掩码间隔。用于实现ASR的数据增强方法[SpecAugment](https://arxiv.org/abs/1904.08779)。
注意:此方法不适合在TPU上运行,应该在训练期间作为预处理步骤在CPU上运行。
"""
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
# 解包 shape 元组,得到 batch_size 和 sequence_length
batch_size, sequence_length = shape
# 检查 mask_length 是否合法(大于0)
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
# 检查 mask_length 是否小于 sequence_length
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
# epsilon 用于概率舍入
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
# 计算应该被mask的span的数量
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
# 确保 num_masked_span 不小于 min_masks
num_masked_span = max(num_masked_span, min_masks)
# 确保 num_masked_span * mask_length 不大于 sequence_length
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
# 确保 num_masked_span 不大于 input_length - (mask_length - 1)
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
# 计算每个样本的输入长度
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
# 创建一个全为 False 的布尔数组作为 spec_aug_mask 的初始值
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
# 保存 spec_aug_mask 的索引
spec_aug_mask_idxs = []
# 计算最大允许的 masked span 数量
max_num_masked_span = compute_num_masked_span(sequence_length)
# 如果 max_num_masked_span 为 0,则直接返回空的 spec_aug_mask
if max_num_masked_span == 0:
return spec_aug_mask
这段代码主要是用于生成一种名为 SpecAugment 的遮罩技术,用于语音识别和其他序列数据处理中。
# 对于每个输入长度计算需要掩码的片段数量
for input_length in input_lengths:
# 计算当前输入长度下的掩码片段数目
num_masked_span = compute_num_masked_span(input_length)
# 随机选择要掩码的索引
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
# 选择第一个样本索引作为填充向量的虚拟索引,以确保所有批次具有相同的维度,因为概率舍入
# 选择第一个样本索引是为了简化向量的填充操作
if len(spec_aug_mask_idx) == 0:
# 当 `input_length` 严格小于 `sequence_length` 时会出现这种情况,
# 此时最后一个标记应该是填充标记,可以用作虚拟掩码 ID
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
# 将虚拟掩码索引添加到掩码索引数组中,确保数组长度达到 `max_num_masked_span`
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
# 将列表转换为 NumPy 数组
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
# 将掩码索引扩展为掩码片段
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# 添加偏移量到起始索引,使索引创建一个掩码片段
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
# 确保索引不超过 `sequence_length - 1`
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
# 使用索引散布掩码
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
# 返回最终的掩码张量
return spec_aug_mask
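下面是对上面 `_compute_mask_indices` 的一个最小调用示意(参数为便于观察的假设值,需在本模块上下文中运行):
```
import numpy as np

np.random.seed(0)  # 仅为演示结果可复现
mask = _compute_mask_indices(shape=(2, 10), mask_prob=0.5, mask_length=2)
print(mask.astype(int))
# 预期得到形如 [[0 1 1 0 0 1 1 0 0 0], ...] 的布尔遮罩:
# 每行约有 int(0.5 * 10 / 2 + ε) 个长度为 2 的连续遮罩片段
```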
# 定义一个用于处理音频转换的卷积层的类,继承自 nn.Module
class Data2VecAudioConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 从配置中获取输入和输出的卷积维度
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个一维卷积层,指定输入和输出维度、卷积核大小、步长和是否使用偏置
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 创建一个用于层归一化的 LayerNorm 层,参数为输出卷积维度,启用元素级仿射变换
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
# 获取指定的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 前向传播函数,接收隐藏状态并返回转换后的隐藏状态
def forward(self, hidden_states):
# 对输入的隐藏状态进行一维卷积处理
hidden_states = self.conv(hidden_states)
# 将卷积输出的维度进行转置,交换倒数第二和倒数第一维度
hidden_states = hidden_states.transpose(-2, -1)
# 对转置后的隐藏状态进行层归一化处理
hidden_states = self.layer_norm(hidden_states)
# 再次对隐藏状态进行维度转置,还原初始维度排列
hidden_states = hidden_states.transpose(-2, -1)
# 使用预定义的激活函数处理隐藏状态并返回
hidden_states = self.activation(hidden_states)
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer 复制而来,修改类名为 Data2VecAudioPadLayer
class Data2VecAudioPadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
# 根据给定的卷积位置嵌入数量计算需要移除的填充数量
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
# 前向传播函数,根据需要移除的填充数量截断隐藏状态的最后一维
def forward(self, hidden_states):
if self.num_pad_remove > 0:
hidden_states = hidden_states[:, :, : -self.num_pad_remove]
return hidden_states
# 定义一个用于位置卷积的类,继承自 nn.Module
class Data2VecAudioPositionalConvLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 创建一个一维卷积层,指定输入和输出维度相同,卷积核大小、填充方式和卷积组数
self.conv = nn.Conv1d(
config.hidden_size,
config.hidden_size,
kernel_size=config.conv_pos_kernel_size,
padding=config.conv_pos_kernel_size // 2,
groups=config.num_conv_pos_embedding_groups,
)
# 创建一个用于处理填充的 Data2VecAudioPadLayer 类的实例
self.padding = Data2VecAudioPadLayer(config.conv_pos_kernel_size)
# 获取指定的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 创建一个 LayerNorm 层,用于层归一化,参数为隐藏大小,禁用元素级仿射变换
self.layer_norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False)
# 前向传播函数,接收隐藏状态并返回转换后的隐藏状态
def forward(self, hidden_states):
# 对输入的隐藏状态进行一维卷积处理
hidden_states = self.conv(hidden_states)
# 使用 padding 层处理卷积输出,截断最后一维的填充
hidden_states = self.padding(hidden_states)
# 将隐藏状态的维度进行转置,交换第一和第二维度
hidden_states = hidden_states.transpose(1, 2)
# 对转置后的隐藏状态进行层归一化处理
hidden_states = self.layer_norm(hidden_states)
# 再次对隐藏状态进行维度转置,还原初始维度排列
hidden_states = hidden_states.transpose(1, 2)
# 使用预定义的激活函数处理隐藏状态并返回
hidden_states = self.activation(hidden_states)
return hidden_states
# 定义一个用于位置卷积嵌入的类,继承自 nn.Module
class Data2VecAudioPositionalConvEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
# 使用列表推导式创建包含多个 Data2VecAudioPositionalConvLayer 实例的 ModuleList
self.layers = nn.ModuleList(
[Data2VecAudioPositionalConvLayer(config) for _ in range(config.num_conv_pos_embeddings)]
)
# 定义一个方法,用于前向传播神经网络模型中的隐藏状态
def forward(self, hidden_states):
# 转置隐藏状态张量的第一维和第二维,以便适配网络层期望的输入格式
hidden_states = hidden_states.transpose(1, 2)
# 依次通过每一层网络层处理隐藏状态张量
for layer in self.layers:
hidden_states = layer(hidden_states)
# 再次转置隐藏状态张量的第一维和第二维,使其恢复原始输入的维度
hidden_states = hidden_states.transpose(1, 2)
# 返回经过所有网络层处理后的隐藏状态张量
return hidden_states
class Data2VecAudioFeatureEncoder(nn.Module):
"""Construct the features from raw audio waveform"""
def __init__(self, config):
super().__init__()
# 初始化卷积层列表,每层都是 Data2VecAudioConvLayer 类的实例,根据配置创建对应数量的层
self.conv_layers = nn.ModuleList(
[Data2VecAudioConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
)
# 梯度检查点功能默认关闭
self.gradient_checkpointing = False
# 默认需要计算梯度
self._requires_grad = True
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder._freeze_parameters 复制而来
def _freeze_parameters(self):
# 将所有参数的梯度计算关闭
for param in self.parameters():
param.requires_grad = False
# 同时将类属性 _requires_grad 设置为 False
self._requires_grad = False
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder.forward 复制而来
def forward(self, input_values):
# 将输入的波形数据增加一个维度,以符合模型的输入要求
hidden_states = input_values[:, None]
# 如果当前模型需要计算梯度并且处于训练状态,则设置 hidden_states 变量需要计算梯度
if self._requires_grad and self.training:
hidden_states.requires_grad = True
# 遍历所有卷积层,逐层计算特征表示
for conv_layer in self.conv_layers:
# 如果开启了梯度检查点并且模型在训练阶段,则使用梯度检查点函数来计算卷积层的输出
if self._requires_grad and self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__,
hidden_states,
)
else:
hidden_states = conv_layer(hidden_states)
# 返回最终的隐藏状态表示
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection 复制而来,修改了类名和部分参数
class Data2VecAudioFeatureProjection(nn.Module):
def __init__(self, config):
super().__init__()
# 层归一化层,用于标准化卷积层的输出
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
# 线性投影层,将卷积层的输出映射到隐藏状态的维度上
self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
# Dropout 层,用于随机丢弃部分神经元,防止过拟合
self.dropout = nn.Dropout(config.feat_proj_dropout)
def forward(self, hidden_states):
# 对隐藏状态进行层归一化处理
norm_hidden_states = self.layer_norm(hidden_states)
# 将归一化后的隐藏状态进行线性投影,映射到目标维度
hidden_states = self.projection(norm_hidden_states)
# 对投影后的隐藏状态进行 Dropout 处理,随机丢弃部分神经元
hidden_states = self.dropout(hidden_states)
# 返回投影后的隐藏状态和未投影的归一化隐藏状态,用于量化操作
return hidden_states, norm_hidden_states
# 从 transformers.models.bart.modeling_bart.BartAttention 复制而来,修改了类名和部分参数
class Data2VecAudioAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[Data2VecAudioConfig] = None,
):
# 调用父类初始化方法
super().__init__()
# 设置注意力机制的嵌入维度
self.embed_dim = embed_dim
# 设置注意力头的数量
self.num_heads = num_heads
# 设置dropout率
self.dropout = dropout
# 计算每个注意力头的维度
self.head_dim = embed_dim // num_heads
# 保存配置信息
self.config = config
# 检查嵌入维度是否能被注意力头的数量整除
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
# 缩放系数,用于缩放注意力得分
self.scaling = self.head_dim**-0.5
# 是否为解码器
self.is_decoder = is_decoder
# 是否使用因果(causal)注意力
self.is_causal = is_causal
# 初始化键(key)的线性投影层
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
# 初始化值(value)的线性投影层
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
# 初始化查询(query)的线性投影层
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
# 初始化输出的线性投影层
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# 重新形状张量,以便为多头注意力做准备
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# The body of the standard multi-head attention forward pass is omitted here.
...
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Data2VecAudio
# 定义一个名为Data2VecAudioFeedForward的神经网络模块,继承自nn.Module
class Data2VecAudioFeedForward(nn.Module):
def __init__(self, config):
super().__init__()
# 中间层的dropout操作,使用配置中的激活函数的dropout率
self.intermediate_dropout = nn.Dropout(config.activation_dropout)
# 中间层的全连接层,输入维度为config.hidden_size,输出维度为config.intermediate_size
self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
# 根据配置选择或者初始化激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# 输出层的全连接层,输入维度为config.intermediate_size,输出维度为config.hidden_size
self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
# 输出层的dropout操作,使用配置中的dropout率
self.output_dropout = nn.Dropout(config.hidden_dropout)
# 定义前向传播函数,接受隐藏状态hidden_states作为输入
def forward(self, hidden_states):
# 中间层的全连接操作
hidden_states = self.intermediate_dense(hidden_states)
# 中间层的激活函数操作
hidden_states = self.intermediate_act_fn(hidden_states)
# 中间层的dropout操作
hidden_states = self.intermediate_dropout(hidden_states)
# 输出层的全连接操作
hidden_states = self.output_dense(hidden_states)
# 输出层的dropout操作
hidden_states = self.output_dropout(hidden_states)
# 返回处理后的隐藏状态作为输出
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer with Wav2Vec2->Data2VecAudio
# 定义一个名为Data2VecAudioEncoderLayer的神经网络模块,继承自nn.Module
class Data2VecAudioEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 使用Data2VecAudioAttention作为注意力机制
self.attention = Data2VecAudioAttention(
embed_dim=config.hidden_size,
num_heads=config.num_attention_heads,
dropout=config.attention_dropout,
is_decoder=False,
)
# dropout操作,使用配置中的隐藏层dropout率
self.dropout = nn.Dropout(config.hidden_dropout)
# LayerNorm操作,输入维度为config.hidden_size,epsilon值为config.layer_norm_eps
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 使用Data2VecAudioFeedForward作为前馈神经网络
self.feed_forward = Data2VecAudioFeedForward(config)
# 最终的LayerNorm操作,输入维度为config.hidden_size,epsilon值为config.layer_norm_eps
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义前向传播函数,接受隐藏状态hidden_states、注意力掩码attention_mask(可选)、是否输出注意力权重output_attentions(可选)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
# 复制隐藏状态用于后续加法残差连接
attn_residual = hidden_states
# 使用注意力机制处理隐藏状态
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
# dropout操作
hidden_states = self.dropout(hidden_states)
# 加法残差连接
hidden_states = attn_residual + hidden_states
# LayerNorm操作
hidden_states = self.layer_norm(hidden_states)
# 前馈神经网络操作
hidden_states = hidden_states + self.feed_forward(hidden_states)
# 最终LayerNorm操作
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
# 如果输出注意力权重,将注意力权重加入输出元组
if output_attentions:
outputs += (attn_weights,)
# 返回输出元组
return outputs
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder with Wav2Vec2->Data2VecAudio
# 定义一个名为Data2VecAudioEncoder的神经网络模块,继承自nn.Module
class Data2VecAudioEncoder(nn.Module):
# 初始化方法,接受一个配置参数对象作为参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 将传入的配置对象保存到实例变量中
self.config = config
# 创建一个位置卷积嵌入对象,并保存到实例变量中
self.pos_conv_embed = Data2VecAudioPositionalConvEmbedding(config)
# 创建一个 LayerNorm 层,并设置其参数
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 创建一个 Dropout 层,并设置其参数
self.dropout = nn.Dropout(config.hidden_dropout)
# 创建一个由多个音频编码器层组成的模块列表,数量由配置中的 num_hidden_layers 决定
self.layers = nn.ModuleList([Data2VecAudioEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# 初始化梯度检查点标志为 False
self.gradient_checkpointing = False
# 前向传播方法,接受输入的隐藏状态张量和其他可选参数
def forward(
self,
hidden_states: torch.tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if attention_mask is not None:
# 确保填充的 token 输出为 0
expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
hidden_states[~expand_attention_mask] = 0
# 扩展 attention_mask
attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
attention_mask = attention_mask.expand(
attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
)
# 计算位置嵌入
position_embeddings = self.pos_conv_embed(hidden_states)
# 将位置嵌入加到隐藏状态上
hidden_states = hidden_states + position_embeddings
# Layer Normalization
hidden_states = self.layer_norm(hidden_states)
# Dropout
hidden_states = self.dropout(hidden_states)
# 检查是否启用了 DeepSpeed Zero3
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
# 遍历每个 Transformer 层
for layer in self.layers:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 添加 LayerDrop(参见 https://arxiv.org/abs/1909.11556 进行描述)
dropout_probability = torch.rand([])
# 判断是否跳过当前层
skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
if not skip_the_layer or deepspeed_zero3_is_enabled:
# 如果启用了梯度检查点和处于训练阶段,则调用梯度检查点函数
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
# 否则直接调用 Transformer 层
layer_outputs = layer(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
hidden_states = layer_outputs[0]
if skip_the_layer:
layer_outputs = (None, None)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 根据 return_dict 决定返回的数据结构
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
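The mask handling above first zeroes hidden states at padded positions and then converts the `(batch, seq_len)` padding mask into a large negative additive bias of shape `(batch, 1, seq_len, seq_len)`. A standalone sketch of that conversion with toy sizes:
```
import torch

hidden_states = torch.randn(2, 4, 8)                          # (batch, seq, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])   # 1 = real token, 0 = padding

# zero out the outputs of padded positions
expand = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]).bool()
hidden_states[~expand] = 0

# build the additive bias: 0 for real tokens, a very large negative value for padding
bias = (1.0 - attention_mask[:, None, None, :].to(hidden_states.dtype)) * torch.finfo(hidden_states.dtype).min
bias = bias.expand(bias.shape[0], 1, bias.shape[-1], bias.shape[-1])
print(bias.shape)                                             # torch.Size([2, 1, 4, 4])
```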
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Adapter复制而来,将Wav2Vec2改为Data2VecAudio
class Data2VecAudioAdapter(nn.Module):
def __init__(self, config):
super().__init__()
# 如果配置中的输出隐藏大小不等于隐藏大小,则可能需要降维特征维度
if config.output_hidden_size != config.hidden_size:
# 创建线性投影层,将隐藏状态维度降至输出隐藏大小
self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
# 创建LayerNorm层,用于规范投影后的隐藏状态
self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
else:
self.proj = self.proj_layer_norm = None
# 创建一个包含多个Data2VecAudioAdapterLayer模块的层列表
self.layers = nn.ModuleList(Data2VecAudioAdapterLayer(config) for _ in range(config.num_adapter_layers))
# 设置层的随机丢弃率
self.layerdrop = config.layerdrop
def forward(self, hidden_states):
# 如果存在投影层和LayerNorm层,则将隐藏状态投影至输出隐藏大小并规范化
if self.proj is not None and self.proj_layer_norm is not None:
hidden_states = self.proj(hidden_states)
hidden_states = self.proj_layer_norm(hidden_states)
# 调换维度顺序,将时间维度置于第二位
hidden_states = hidden_states.transpose(1, 2)
# 对每一层进行循环处理
for layer in self.layers:
# 计算当前层是否被丢弃的概率
layerdrop_prob = np.random.random()
# 如果非训练状态或者未丢弃该层,则通过当前层处理隐藏状态
if not self.training or (layerdrop_prob > self.layerdrop):
hidden_states = layer(hidden_states)
# 恢复原始维度顺序,将时间维度放回第三位
hidden_states = hidden_states.transpose(1, 2)
return hidden_states
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AdapterLayer复制而来,将Wav2Vec2改为Data2VecAudio
class Data2VecAudioAdapterLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 创建一个一维卷积层,用于处理隐藏状态
self.conv = nn.Conv1d(
config.output_hidden_size,
2 * config.output_hidden_size,
config.adapter_kernel_size,
stride=config.adapter_stride,
padding=1,
)
def forward(self, hidden_states):
# 将隐藏状态通过一维卷积层处理
hidden_states = self.conv(hidden_states)
# 对卷积结果进行门控线性单元(GLU)操作
hidden_states = nn.functional.glu(hidden_states, dim=1)
return hidden_states
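`nn.functional.glu` splits the channel dimension in half and gates one half with the sigmoid of the other, which is why the adapter convolution produces `2 * output_hidden_size` channels. A minimal shape check with assumed sizes:
```
import torch
import torch.nn as nn

hidden, seq = 6, 5
conv = nn.Conv1d(hidden, 2 * hidden, kernel_size=3, stride=2, padding=1)

x = torch.randn(1, hidden, seq)          # (batch, channels, time)
y = nn.functional.glu(conv(x), dim=1)    # gating halves the channel dimension again
print(conv(x).shape, y.shape)            # torch.Size([1, 12, 3]) torch.Size([1, 6, 3])
```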
class Data2VecAudioPreTrainedModel(PreTrainedModel):
"""
一个抽象类,处理权重初始化以及下载和加载预训练模型的简单接口。
"""
# 使用Data2VecAudioConfig作为配置类
config_class = Data2VecAudioConfig
# 模型的基本名称前缀
base_model_prefix = "data2vec_audio"
# 主要输入名称
main_input_name = "input_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
# 初始化模型权重的方法,根据模块类型不同采取不同的初始化方式
def _init_weights(self, module):
"""Initialize the weights"""
# 如果模块是 Data2VecAudioFeatureProjection 类型
if isinstance(module, Data2VecAudioFeatureProjection):
# 计算均匀分布的上下界 k
k = math.sqrt(1 / module.projection.in_features)
# 对投影层的权重进行均匀初始化
nn.init.uniform_(module.projection.weight, a=-k, b=k)
# 对投影层的偏置进行均匀初始化
nn.init.uniform_(module.projection.bias, a=-k, b=k)
# 如果模块是 Data2VecAudioPositionalConvLayer 类型
elif isinstance(module, Data2VecAudioPositionalConvLayer):
# 对卷积层的偏置进行常数初始化(设置为0)
nn.init.constant_(module.conv.bias, 0)
# 如果模块是 nn.Linear 类型
elif isinstance(module, nn.Linear):
# 对线性层的权重进行正态分布初始化
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# 如果有偏置项,则将偏置项初始化为零
if module.bias is not None:
module.bias.data.zero_()
# 如果模块是 nn.LayerNorm 或 nn.GroupNorm 类型
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
# 如果有偏置项,则将偏置项初始化为零
if module.bias is not None:
module.bias.data.zero_()
# 如果有权重项,则将权重项初始化为全1
if module.weight is not None:
module.weight.data.fill_(1.0)
# 如果模块是 nn.Conv1d 类型
elif isinstance(module, nn.Conv1d):
# 对卷积层的权重进行 Kaiming 正态分布初始化
nn.init.kaiming_normal_(module.weight)
# 如果有偏置项,则计算均匀分布的上下界 k 并进行均匀初始化
if module.bias is not None:
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-k, b=k)
# 从 Wav2Vec2PreTrainedModel 类的方法 _get_feat_extract_output_lengths 复制而来
def _get_feat_extract_output_lengths(
self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
):
"""
Computes the output length of the convolutional layers
"""
# 根据需要是否添加适配器的标志
add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
# 定义计算卷积层输出长度的内部函数
def _conv_out_length(input_length, kernel_size, stride):
# 使用 PyTorch 文档中描述的公式计算 1D 卷积层输出长度
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
# 根据配置中的卷积核大小和步长循环计算输出长度
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
# 如果需要添加适配器,根据配置中的适配器层数循环计算输出长度
if add_adapter:
for _ in range(self.config.num_adapter_layers):
input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
return input_lengths
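As a standalone illustration of the same formula: assuming wav2vec2-style feature-extractor defaults (`conv_kernel = (10, 3, 3, 3, 3, 2, 2)`, `conv_stride = (5, 2, 2, 2, 2, 2, 2)`; these values are an assumption here, not read from the code above), one second of 16 kHz audio is reduced to 49 frames.
```
def conv_out_length(input_length: int, kernel_size: int, stride: int) -> int:
    # same formula as in _get_feat_extract_output_lengths, 1D conv without padding
    return (input_length - kernel_size) // stride + 1

conv_kernel = (10, 3, 3, 3, 3, 2, 2)   # assumed default kernels
conv_stride = (5, 2, 2, 2, 2, 2, 2)    # assumed default strides

length = 16_000                        # one second of 16 kHz audio
for k, s in zip(conv_kernel, conv_stride):
    length = conv_out_length(length, k, s)
print(length)                          # 49
```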
# 从 Wav2Vec2PreTrainedModel 类的方法 _get_feature_vector_attention_mask 复制而来
def _get_feature_vector_attention_mask(
self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
):
"""
Computes the attention mask for the feature vector
"""
# 计算未填充部分的长度,即 attention_mask.sum(-1),但不是原地操作以便在推理模式下运行。
non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
# 根据非填充长度获取特征提取器的输出长度,可选择添加适配器
output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
output_lengths = output_lengths.to(torch.long)
# 获取批次大小
batch_size = attention_mask.shape[0]
# 重新初始化 attention_mask,确保所有值在输出长度之前都是被注意的
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
# 在输出长度的索引之前的所有位置设置为1,以确保这些位置被注意到
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
# 反转 attention_mask,累积求和,再次反转,并转换为布尔类型
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
# 返回处理后的 attention_mask
return attention_mask
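The flip/cumsum/flip trick turns a single 1 written at index `length - 1` into a mask that is True for every position up to and including that index. A toy illustration:
```
import torch

feature_vector_length = 6
output_lengths = torch.tensor([4, 2])                 # valid frames per batch element

mask = torch.zeros((2, feature_vector_length), dtype=torch.long)
mask[torch.arange(2), output_lengths - 1] = 1         # mark the last valid frame
mask = mask.flip([-1]).cumsum(-1).flip([-1]).bool()   # everything up to that frame becomes True
print(mask.int())
# tensor([[1, 1, 1, 1, 0, 0],
#         [1, 1, 0, 0, 0, 0]])
```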
# DATA2VEC_AUDIO_START_DOCSTRING 的值是一个多行字符串,用于说明 Data2VecAudio 模型的背景和基本信息
DATA2VEC_AUDIO_START_DOCSTRING = r"""
Data2VecAudio was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and
Michael Auli.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving etc.).
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) sub-class. Use it as a regular
PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`Data2VecAudioConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# DATA2VEC_AUDIO_INPUTS_DOCSTRING documents the inputs shared by the forward methods of the Data2VecAudio models
DATA2VEC_AUDIO_INPUTS_DOCSTRING = r"""
Args:
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of the raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
into an array of type *List[float]* or *numpy.ndarray*, e.g. via the soundfile library (*pip install
soundfile*). To prepare the array into *input_values*, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type *torch.FloatTensor*. See [`Wav2Vec2Processor.__call__`] for details.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, 1]`:

- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.

[What are attention masks?](../glossary#attention-mask)

<Tip warning={true}>

`attention_mask` should be passed if the corresponding processor has `config.return_attention_mask == True`,
which is the case for all pre-trained Data2Vec Audio models. Be aware that, even with `attention_mask`,
zero-padded inputs will have slightly different outputs compared to non-padded inputs because there are
several convolutional layers in the positional encodings. For a more detailed explanation, see
[here](https://github.com/huggingface/transformers/issues/25621#issuecomment-1713759349).

</Tip>

output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Data2VecAudio Model transformer outputting raw hidden-states without any specific head on top.",
DATA2VEC_AUDIO_START_DOCSTRING,
)
# 定义 Data2VecAudioModel 类,继承自 Data2VecAudioPreTrainedModel 类,表示数据向量音频模型
class Data2VecAudioModel(Data2VecAudioPreTrainedModel):
def __init__(self, config: Data2VecAudioConfig):
# 调用父类构造函数,初始化模型配置
super().__init__(config)
# 保存模型配置
self.config = config
# 创建音频特征提取器对象
self.feature_extractor = Data2VecAudioFeatureEncoder(config)
# 创建音频特征投影对象
self.feature_projection = Data2VecAudioFeatureProjection(config)
# 如果配置中的掩码时间概率大于 0 或者掩码特征概率大于 0,则需要掩码特征嵌入
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
# 初始化掩码特征嵌入参数
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
# 创建音频编码器对象
self.encoder = Data2VecAudioEncoder(config)
# 如果配置中包含适配器,则创建适配器对象;否则适配器对象为 None
self.adapter = Data2VecAudioAdapter(config) if config.add_adapter else None
# 初始化权重并应用最终处理
self.post_init()
# 冻结特征编码器函数,禁止特征编码器的梯度计算,以防止其在训练过程中更新参数
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.feature_extractor._freeze_parameters()
# Helper that masks the hidden states along the time and/or feature axis (SpecAugment)
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
# Calculate mask indices for time axis based on configuration parameters
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Wav2Vec2BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
# 如果未指定output_attentions,则使用配置中的默认设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果未指定output_hidden_states,则使用配置中的默认设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未指定return_dict,则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 提取输入特征向量
extract_features = self.feature_extractor(input_values)
# 调整特征向量的维度顺序
extract_features = extract_features.transpose(1, 2)
if attention_mask is not None:
# 计算适应特征向量的减少后的attention_mask
attention_mask = self._get_feature_vector_attention_mask(
extract_features.shape[1], attention_mask, add_adapter=False
)
# 对特征向量进行特征投影
hidden_states, extract_features = self.feature_projection(extract_features)
# 在计算中对隐藏状态进行屏蔽处理
hidden_states = self._mask_hidden_states(
hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
)
# 编码器的输出
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取编码器的最后隐藏状态
hidden_states = encoder_outputs[0]
# 如果存在适配器,应用适配器
if self.adapter is not None:
hidden_states = self.adapter(hidden_states)
# 如果不返回字典,则返回元组形式的输出
if not return_dict:
return (hidden_states, extract_features) + encoder_outputs[1:]
# 如果返回字典,则创建Wav2Vec2BaseModelOutput对象并返回
return Wav2Vec2BaseModelOutput(
last_hidden_state=hidden_states,
extract_features=extract_features,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
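A minimal usage sketch of the bare model; the checkpoint name `facebook/data2vec-audio-base-960h` and the random one-second input are assumptions made for illustration:
```
import torch
from transformers import AutoProcessor, Data2VecAudioModel

processor = AutoProcessor.from_pretrained("facebook/data2vec-audio-base-960h")  # assumed checkpoint
model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base-960h")

waveform = torch.randn(16_000)  # one second of fake 16 kHz audio
inputs = processor(waveform.numpy(), sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, ~49, hidden_size)
```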
"""
Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
Inherited documentation and configuration are added from `DATA2VEC_AUDIO_START_DOCSTRING`.
Args:
config (:class:`~transformers.Data2VecAudioConfig`):
The model configuration class that specifies the model's architecture and parameters.
Attributes:
data2vec_audio (:class:`~transformers.Data2VecAudioModel`):
The base Data2VecAudioModel instance.
dropout (:obj:`torch.nn.Dropout`):
Dropout module with a dropout probability as specified in `config.final_dropout`.
lm_head (:obj:`torch.nn.Linear`):
The linear layer for the language modeling head with output size `config.vocab_size`.
Raises:
ValueError: If `config.vocab_size` is not defined in the model configuration.
Notes:
This class extends `Data2VecAudioPreTrainedModel` and adds a language modeling head on top for CTC.
"""
def __init__(self, config):
super().__init__(config)
# Initialize base Data2VecAudioModel and dropout layer
self.data2vec_audio = Data2VecAudioModel(config)
self.dropout = nn.Dropout(config.final_dropout)
# Check if vocab_size is defined in the configuration
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
# Determine the output size of the linear layer based on model configuration
output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
# Initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
Deprecated:
The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5.
Please use the equivalent `freeze_feature_encoder` method instead.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
self.data2vec_audio.feature_extractor._freeze_parameters()
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward with wav2vec2->data2vec_audio
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
"""
Forward pass of the Data2VecAudioForCTC model.
Args:
input_values (torch.Tensor, optional):
Input tensor of shape `(batch_size, sequence_length, feature_dim)` containing audio features.
attention_mask (torch.Tensor, optional):
Mask to avoid performing attention on padding tokens.
output_attentions (bool, optional):
Whether to return attentions weights of all attention layers.
output_hidden_states (bool, optional):
Whether to return hidden states of all layers.
return_dict (bool, optional):
Whether to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
labels (torch.Tensor, optional):
Labels for computing the CTC loss.
Returns:
:class:`~transformers.modeling_outputs.CausalLMOutput`: A :class:`~transformers.modeling_outputs.CausalLMOutput` containing:
- loss (`torch.FloatTensor`, optional):
CTC loss if :obj:`labels` is provided.
- logits (`torch.FloatTensor`):
The logits output tensor of the language modeling head.
Examples:
For examples on usage, please see the documentation and code samples provided.
Warnings:
This method is adapted from `transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward`
with modifications for Data2VecAudio models.
"""
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
# Decide whether to return results as a dictionary based on the provided argument or configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Convert input audio features into vector representations
outputs = self.data2vec_audio(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states and apply dropout regularization
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
# Generate logits from the language model head
logits = self.lm_head(hidden_states)
# Initialize loss as None
loss = None
if labels is not None:
# Check if any label index exceeds the vocabulary size, which is invalid
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
# Retrieve the lengths of input features using attention mask
attention_mask = (
attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# Identify valid labels by creating a mask and calculate their lengths
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# Apply log softmax to logits and transpose for CTC loss calculation
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
# Disable CuDNN optimizations to ensure reproducibility in loss calculation
with torch.backends.cudnn.flags(enabled=False):
# Compute CTC loss using provided parameters
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
# If return_dict is False, return outputs as a tuple
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, return outputs wrapped in CausalLMOutput
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
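An end-to-end inference sketch for the CTC head; the checkpoint name and the random input are illustrative assumptions, and real usage would feed an actual 16 kHz recording:
```
import torch
from transformers import AutoProcessor, Data2VecAudioForCTC

processor = AutoProcessor.from_pretrained("facebook/data2vec-audio-base-960h")  # assumed checkpoint
model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h")

speech = torch.randn(16_000).numpy()  # placeholder for a real 16 kHz waveform
inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits                 # (batch, frames, vocab_size)

predicted_ids = torch.argmax(logits, dim=-1)        # greedy CTC decoding
transcription = processor.batch_decode(predicted_ids)
print(transcription)
```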
@add_start_docstrings(
"""
Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks
like SUPERB Keyword Spotting.
""",
DATA2VEC_AUDIO_START_DOCSTRING,
)
class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 检查配置中是否存在并且允许使用适配器,如果是,则引发值错误
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)"
)
# 初始化 Data2VecAudioModel 模型
self.data2vec_audio = Data2VecAudioModel(config)
# 计算层数:transformer 层数加上输入嵌入层
num_layers = config.num_hidden_layers + 1
# 如果配置中使用加权层求和,则初始化层权重参数
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 线性投影层,将隐藏状态映射到分类器投影尺寸
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
# 分类器层,将投影后的特征映射到类别数量
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
# 初始化权重并进行后处理
self.post_init()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
# 发出警告,方法即将弃用,请改用 `freeze_feature_encoder`
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用 `freeze_feature_encoder` 方法冻结特征编码器的参数
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 冻结特征编码器的参数,禁止其梯度计算
self.data2vec_audio.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 冻结基础模型的参数,禁止其梯度计算,只有分类头部会被更新
for param in self.data2vec_audio.parameters():
param.requires_grad = False
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward 复制,并将 wav2vec2 改为 data2vec_audio
def forward(
self,
input_values: Optional[torch.Tensor], # 输入的张量数据,可选
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码张量,可选
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,可选
labels: Optional[torch.Tensor] = None, # 分类/回归任务的标签张量,可选
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict # 决定是否返回字典格式的输出,根据传入的参数或配置决定
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states # 如果配置指定使用加权层求和,则强制输出隐藏状态
outputs = self.data2vec_audio(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION] # 获取输出中的隐藏状态
hidden_states = torch.stack(hidden_states, dim=1) # 在指定维度上堆叠隐藏状态张量
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) # 对层权重进行softmax归一化
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) # 使用加权求和隐藏状态
else:
hidden_states = outputs[0] # 获取普通的隐藏状态输出
hidden_states = self.projector(hidden_states) # 投影隐藏状态
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1) # 如果没有注意力掩码,则对隐藏状态进行平均池化
else:
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) # 根据注意力掩码生成填充掩码
hidden_states[~padding_mask] = 0.0 # 使用填充掩码将非填充位置置零
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) # 对填充位置求和并进行平均池化
logits = self.classifier(pooled_output) # 使用分类器得到logits
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss() # 使用交叉熵损失函数
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) # 计算损失
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] # 如果不返回字典,则组合输出
return ((loss,) + output) if loss is not None else output # 返回包含损失的输出或者普通输出
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) # 返回字典格式的分类器输出
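The weighted layer-sum branch learns one softmax-normalized weight per layer and mixes all hidden states before pooling. A standalone sketch with toy shapes (a plain mean pooling stands in for the masked pooling above):
```
import torch
import torch.nn as nn

num_layers, batch, seq, hidden = 13, 2, 50, 32
# stand-in for the tuple of per-layer hidden states returned with output_hidden_states=True
hidden_states = torch.stack([torch.randn(batch, seq, hidden) for _ in range(num_layers)], dim=1)

layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
norm_weights = torch.softmax(layer_weights, dim=-1)
mixed = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)   # (batch, seq, hidden)
pooled = mixed.mean(dim=1)                                          # simple mean pooling (no padding here)
print(mixed.shape, pooled.shape)
```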
@add_start_docstrings(
"""
Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization.
""",
DATA2VEC_AUDIO_START_DOCSTRING,
)
class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel):
def __init__(self, config):
super().__init__(config)
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Audio frame classification does not support the use of Data2VecAudio adapters"
" (config.add_adapter=True)"
)
# 初始化 Data2VecAudioModel,并设置层数
self.data2vec_audio = Data2VecAudioModel(config)
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
# 如果设置了使用加权层求和,则初始化层权重
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 初始化分类器线性层
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.num_labels = config.num_labels
# 初始化模型权重
self.init_weights()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用 freeze_feature_encoder 方法来冻结特征编码器的参数
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 冻结特征编码器的参数,使其在训练过程中不会更新
self.data2vec_audio.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 冻结基础模型的参数,使其在训练过程中不会更新,只有分类头会更新
for param in self.data2vec_audio.parameters():
param.requires_grad = False
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.forward 复制,将 wav2vec2 替换为 data2vec_audio
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 初始化是否返回字典,默认为配置中的设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 根据权重层求和的配置决定是否输出隐藏状态
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 调用data2vec_audio方法处理音频输入,获取输出
outputs = self.data2vec_audio(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果配置为使用加权层求和,则对隐藏状态进行加权求和处理
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION] # 获取隐藏状态的起始位置
hidden_states = torch.stack(hidden_states, dim=1) # 在指定维度上堆叠隐藏状态张量
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) # 对层权重进行softmax归一化
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) # 加权求和隐藏状态
else:
hidden_states = outputs[0] # 否则直接使用第一个输出作为隐藏状态
logits = self.classifier(hidden_states) # 使用分类器对隐藏状态进行分类预测
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss() # 使用交叉熵损失函数
# 计算分类器的损失,labels需要转换成合适的形状和类型
loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
# 如果不返回字典形式的输出,则返回元组形式的输出
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] # 组合输出结果
return output
# 返回TokenClassifierOutput对象,包含损失、logits、隐藏状态和注意力
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss
# 定义了一个 AMSoftmaxLoss 类,用于计算带有 AM-Softmax 的损失函数
class AMSoftmaxLoss(nn.Module):
def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
super(AMSoftmaxLoss, self).__init__()
self.scale = scale # 缩放因子,用于调整角度余弦值的范围
self.margin = margin # AM-Softmax 中的 margin 参数
self.num_labels = num_labels # 标签的数量
self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True) # 权重矩阵,初始化为随机值
self.loss = nn.CrossEntropyLoss() # 使用交叉熵作为损失函数
def forward(self, hidden_states, labels):
labels = labels.flatten() # 将标签展平,以便计算损失
weight = nn.functional.normalize(self.weight, dim=0) # 对权重进行 L2 归一化
hidden_states = nn.functional.normalize(hidden_states, dim=1) # 对隐藏状态进行 L2 归一化
cos_theta = torch.mm(hidden_states, weight) # 计算余弦相似度
psi = cos_theta - self.margin # 计算带有 margin 的调整值
onehot = nn.functional.one_hot(labels, self.num_labels) # 将标签转为 one-hot 格式
logits = self.scale * torch.where(onehot.bool(), psi, cos_theta) # 根据标签应用 AM-Softmax 运算得到最终 logits
loss = self.loss(logits, labels) # 计算损失
return loss
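A small numeric sketch of the AM-Softmax margin trick: the cosine similarity of the target class is reduced by `margin` before scaling, which tightens the decision boundary. The numbers are made up for illustration:
```
import torch
import torch.nn as nn

scale, margin = 30.0, 0.4
cos_theta = torch.tensor([[0.7, 0.5, 0.1]])   # cosine similarity to 3 class centers
labels = torch.tensor([0])                    # the true class

psi = cos_theta - margin                      # penalized similarity for the target class
onehot = nn.functional.one_hot(labels, num_classes=3).bool()
logits = scale * torch.where(onehot, psi, cos_theta)
print(logits)                                 # tensor([[ 9., 15.,  3.]]) -> margin shrinks the target logit
loss = nn.CrossEntropyLoss()(logits, labels)
print(loss)
```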
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer
# 定义了 TDNNLayer 类,用于实现时延神经网络层
class TDNNLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] # 输入通道维度
self.out_conv_dim = config.tdnn_dim[layer_id] # 输出通道维度
self.kernel_size = config.tdnn_kernel[layer_id] # 卷积核大小
self.dilation = config.tdnn_dilation[layer_id] # 空洞卷积的扩展率
self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) # 使用线性层作为卷积核
self.activation = nn.ReLU() # 激活函数为 ReLU
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
if is_peft_available():
from peft.tuners.lora import LoraLayer
if isinstance(self.kernel, LoraLayer):
warnings.warn(
"Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
"You should exclude TDNNLayer from LoRA's target modules.",
)
# for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
hidden_states = hidden_states.transpose(1, 2) # 转置张量以适应卷积操作的输入要求
weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2) # 调整权重形状
hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) # 执行卷积操作
hidden_states = hidden_states.transpose(1, 2) # 还原张量形状
hidden_states = self.activation(hidden_states) # 应用 ReLU 激活函数
return hidden_states
@add_start_docstrings(
"""
Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification.
""",
DATA2VEC_AUDIO_START_DOCSTRING,
)
# 定义了 Data2VecAudioForXVector 类,扩展自 Data2VecAudioPreTrainedModel,用于 XVector 特征提取
class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel):
def __init__(self, config):
# 调用父类的初始化方法,传递配置参数
super().__init__(config)
# 创建音频数据转换模型对象
self.data2vec_audio = Data2VecAudioModel(config)
# 计算层数,包括Transformer层和输入嵌入层
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
# 如果配置指定使用加权层求和,则初始化层权重参数
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 创建线性映射层,将隐藏状态映射到TDNN的输入维度
self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
# 创建多个TDNN层的列表
tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
self.tdnn = nn.ModuleList(tdnn_layers)
# 创建特征提取器的线性层,用于生成x-vector的输出维度
self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
# 创建分类器的线性层,将x-vector映射到分类数目的维度
self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
# 初始化AMSoftmax损失函数,使用指定的输出维度和类别数目
self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
# 执行权重初始化的函数
self.init_weights()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 发出警告,表明此函数将被弃用,建议使用freeze_feature_encoder代替
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用freeze_feature_encoder方法,冻结特征编码器的参数
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 冻结特征编码器的参数,使其在训练期间不会更新梯度
self.data2vec_audio.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 遍历数据转换模型的所有参数,并将requires_grad设置为False,以禁用它们的梯度计算
for param in self.data2vec_audio.parameters():
param.requires_grad = False
def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the TDNN layers
"""
def _conv_out_length(input_length, kernel_size, stride):
# 从PyTorch文档中获取的1D卷积层输出长度的计算公式
return (input_length - kernel_size) // stride + 1
# 计算每个TDNN层的输出长度
for kernel_size in self.config.tdnn_kernel:
input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
return input_lengths
@add_start_docstrings_to_model_forward(DATA2VEC_AUDIO_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=XVectorOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
)
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.forward复制而来,修改为使用data2vec_audio
# 定义一个前向传播方法,用于模型推断或训练时的正向处理
def forward(
self,
# 输入数据张量,可以为空
input_values: Optional[torch.Tensor],
# 注意力掩码张量,用于指定模型关注的部分,可以为空
attention_mask: Optional[torch.Tensor] = None,
# 是否输出注意力权重,可选参数,默认为空
output_attentions: Optional[bool] = None,
# 是否输出隐藏状态,可选参数,默认为空
output_hidden_states: Optional[bool] = None,
# 是否返回结果字典形式,可选参数,默认为空
return_dict: Optional[bool] = None,
# 标签数据张量,用于训练时指定真实标签,可以为空
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, XVectorOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 初始化 return_dict,如果未提供则使用 self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果配置中指定了使用加权层求和的隐藏状态,则设置 output_hidden_states 为 True
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 将输入数据通过 data2vec_audio 方法处理,获取模型的输出
outputs = self.data2vec_audio(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果配置中使用了加权层求和的隐藏状态,对隐藏状态进行加权求和操作
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
hidden_states = torch.stack(hidden_states, dim=1)
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
# 否则直接取模型输出的第一个元素作为隐藏状态
hidden_states = outputs[0]
# 将隐藏状态通过 projector 方法进行投影处理
hidden_states = self.projector(hidden_states)
# 对每个 TDNN 层依次处理隐藏状态
for tdnn_layer in self.tdnn:
hidden_states = tdnn_layer(hidden_states)
# 统计池化操作
if attention_mask is None:
# 如果没有提供 attention_mask,则对隐藏状态在第一维度(batch 维度)上进行均值和标准差计算
mean_features = hidden_states.mean(dim=1)
std_features = hidden_states.std(dim=1)
else:
# 根据 attention_mask 计算特征提取的输出长度和 TDNN 层的输出长度
feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
mean_features = []
std_features = []
# 对每个序列长度进行遍历,计算每个序列的均值和标准差
for i, length in enumerate(tdnn_output_lengths):
mean_features.append(hidden_states[i, :length].mean(dim=0))
std_features.append(hidden_states[i, :length].std(dim=0))
mean_features = torch.stack(mean_features)
std_features = torch.stack(std_features)
# 将均值和标准差拼接在一起作为统计池化的结果
statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
# 将统计池化的结果通过 feature_extractor 进行特征提取
output_embeddings = self.feature_extractor(statistic_pooling)
# 将特征提取的结果通过 classifier 进行分类器预测得到 logits
logits = self.classifier(output_embeddings)
# 初始化损失为 None
loss = None
# 如果提供了 labels,则计算损失
if labels is not None:
loss = self.objective(logits, labels)
# 如果不要求返回字典形式的输出,则直接返回元组形式的输出
if not return_dict:
output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# 如果要求返回字典形式的输出,则构建 XVectorOutput 对象并返回
return XVectorOutput(
loss=loss,
logits=logits,
embeddings=output_embeddings,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
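Statistic pooling concatenates the per-utterance mean and standard deviation over time, doubling the feature dimension before the x-vector projection. A toy sketch of the unmasked branch:
```
import torch

batch, frames, dim = 2, 40, 512
tdnn_out = torch.randn(batch, frames, dim)

mean_features = tdnn_out.mean(dim=1)                       # (batch, dim)
std_features = tdnn_out.std(dim=1)                         # (batch, dim)
statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
print(statistic_pooling.shape)                             # torch.Size([2, 1024])
```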