Transformers Source Code Analysis (Part 102)
.\models\segformer\modeling_segformer.py
""" PyTorch SegFormer model."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput, SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_segformer import SegformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SegformerConfig"
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
_EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16]
_IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"nvidia/segformer-b0-finetuned-ade-512-512",
]
class SegFormerImageClassifierOutput(ImageClassifierOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if `config.num_labels==1`) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if `config.num_labels==1`) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer,
            plus one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden
            states (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average
            in the self-attention heads.
    """
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
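# A small sketch of what drop_path does (values chosen for illustration, not from the original file):
# in eval mode it is the identity, while in training mode each sample of the batch is either zeroed out
# or rescaled by 1 / keep_prob, so the expected value of the residual branch is preserved.
import torch

x = torch.ones(4, 3, 8, 8)
print(drop_path(x, drop_prob=0.5, training=False).equal(x))  # True: identity outside training
dropped = drop_path(x, drop_prob=0.5, training=True)
print([dropped[i].max().item() for i in range(4)])           # each sample is all 0.0 or all 2.0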
class SegformerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class SegformerOverlapPatchEmbeddings(nn.Module):
"""Construct the overlapping patch embeddings."""
def __init__(self, patch_size, stride, num_channels, hidden_size):
super().__init__()
self.proj = nn.Conv2d(
num_channels,
hidden_size,
kernel_size=patch_size,
stride=stride,
padding=patch_size // 2,
)
self.layer_norm = nn.LayerNorm(hidden_size)
def forward(self, pixel_values):
embeddings = self.proj(pixel_values)
_, _, height, width = embeddings.shape
embeddings = embeddings.flatten(2).transpose(1, 2)
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
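# A hedged shape check for the overlapping patch embedding, using the first-stage values of the default
# (MiT-b0 style) configuration as an assumption: a 7x7 convolution with stride 4 and padding 3 maps a
# (1, 3, 512, 512) image to a (1, 128*128, 32) token sequence.
import torch

patch_embeddings = SegformerOverlapPatchEmbeddings(patch_size=7, stride=4, num_channels=3, hidden_size=32)
tokens, height, width = patch_embeddings(torch.randn(1, 3, 512, 512))
print(tokens.shape, height, width)  # torch.Size([1, 16384, 32]) 128 128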
class SegformerEfficientSelfAttention(nn.Module):
"""SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
paper](https://arxiv.org/abs/2102.12122)."""
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
super().__init__()
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
f"heads ({self.num_attention_heads})"
)
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(self.hidden_size, self.all_head_size)
self.key = nn.Linear(self.hidden_size, self.all_head_size)
self.value = nn.Linear(self.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.sr_ratio = sequence_reduction_ratio
if sequence_reduction_ratio > 1:
self.sr = nn.Conv2d(
hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
)
self.layer_norm = nn.LayerNorm(hidden_size)
def transpose_for_scores(self, hidden_states):
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
hidden_states = hidden_states.view(new_shape)
return hidden_states.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
height,
width,
output_attentions=False,
):
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.sr_ratio > 1:
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = self.sr(hidden_states)
hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
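# A hedged sketch of the sequence reduction: with sequence_reduction_ratio=8 (the first-stage default is
# assumed here), the keys and values are computed from an 8x spatially downsampled copy of the input, so
# the attention matrix is (seq_len x seq_len/64) instead of (seq_len x seq_len).
import torch
from transformers import SegformerConfig

config = SegformerConfig()
attention = SegformerEfficientSelfAttention(config, hidden_size=32, num_attention_heads=1, sequence_reduction_ratio=8)
context, probs = attention(torch.randn(1, 128 * 128, 32), height=128, width=128, output_attentions=True)
print(context.shape)  # torch.Size([1, 16384, 32])
print(probs.shape)    # torch.Size([1, 1, 16384, 256]): 16384 queries attend to only 256 reduced positions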
class SegformerSelfOutput(nn.Module):
def __init__(self, config, hidden_size):
super().__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class SegformerAttention(nn.Module):
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
super().__init__()
self.self = SegformerEfficientSelfAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
)
self.output = SegformerSelfOutput(config, hidden_size=hidden_size)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, height, width, output_attentions=False):
self_outputs = self.self(hidden_states, height, width, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class SegformerDWConv(nn.Module):
def __init__(self, dim=768):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
def forward(self, hidden_states, height, width):
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
hidden_states = self.dwconv(hidden_states)
hidden_states = hidden_states.flatten(2).transpose(1, 2)
return hidden_states
class SegformerMixFFN(nn.Module):
def __init__(self, config, in_features, hidden_features=None, out_features=None):
super().__init__()
out_features = out_features or in_features
self.dense1 = nn.Linear(in_features, hidden_features)
self.dwconv = SegformerDWConv(hidden_features)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dense2 = nn.Linear(hidden_features, out_features)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, height, width):
hidden_states = self.dense1(hidden_states)
hidden_states = self.dwconv(hidden_states, height, width)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense2(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
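# A hedged sketch of the Mix-FFN (first-stage sizes assumed): the 3x3 depthwise convolution between the two
# linear layers mixes neighbouring tokens and provides positional information, which is why SegFormer needs
# no explicit positional embeddings.
import torch
from transformers import SegformerConfig

config = SegformerConfig()
mix_ffn = SegformerMixFFN(config, in_features=32, hidden_features=4 * 32)
out = mix_ffn(torch.randn(1, 128 * 128, 32), height=128, width=128)
print(out.shape)  # torch.Size([1, 16384, 32]): same shape as the input, ready for the residual connection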
class SegformerLayer(nn.Module):
"""This corresponds to the Block class in the original implementation."""
def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
super().__init__()
self.layer_norm_1 = nn.LayerNorm(hidden_size)
self.attention = SegformerAttention(
config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
)
self.drop_path = SegformerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.layer_norm_2 = nn.LayerNorm(hidden_size)
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
def forward(self, hidden_states, height, width, output_attentions=False):
self_attention_outputs = self.attention(
self.layer_norm_1(hidden_states),
height,
width,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
mlp_output = self.drop_path(mlp_output)
layer_output = mlp_output + hidden_states
outputs = (layer_output,) + outputs
return outputs
class SegformerEncoder(nn.Module):
    def __init__(self, config):
super().__init__()
self.config = config
drop_path_decays = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
embeddings = []
for i in range(config.num_encoder_blocks):
embeddings.append(
SegformerOverlapPatchEmbeddings(
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
)
)
self.patch_embeddings = nn.ModuleList(embeddings)
blocks = []
cur = 0
for i in range(config.num_encoder_blocks):
layers = []
if i != 0:
cur += config.depths[i - 1]
for j in range(config.depths[i]):
layers.append(
SegformerLayer(
config,
hidden_size=config.hidden_sizes[i],
num_attention_heads=config.num_attention_heads[i],
drop_path=drop_path_decays[cur + j],
sequence_reduction_ratio=config.sr_ratios[i],
mlp_ratio=config.mlp_ratios[i],
)
)
blocks.append(nn.ModuleList(layers))
self.block = nn.ModuleList(blocks)
self.layer_norm = nn.ModuleList(
[nn.LayerNorm(config.hidden_sizes[i]) for i in range(config.num_encoder_blocks)]
)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = pixel_values.shape[0]
hidden_states = pixel_values
for idx, x in enumerate(zip(self.patch_embeddings, self.block, self.layer_norm)):
embedding_layer, block_layer, norm_layer = x
hidden_states, height, width = embedding_layer(hidden_states)
for i, blk in enumerate(block_layer):
layer_outputs = blk(hidden_states, height, width, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
hidden_states = norm_layer(hidden_states)
if idx != len(self.patch_embeddings) - 1 or (
idx == len(self.patch_embeddings) - 1 and self.config.reshape_last_stage
):
hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
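# A hedged end-to-end check of the hierarchical encoder, with the default (MiT-b0 style) configuration as
# an assumption: four stages downsample the input by 4x, 8x, 16x and 32x while widening the channels.
import torch
from transformers import SegformerConfig

config = SegformerConfig()
encoder = SegformerEncoder(config)
with torch.no_grad():
    out = encoder(torch.randn(1, 3, 512, 512), output_hidden_states=True)
print([tuple(h.shape) for h in out.hidden_states])
# e.g. [(1, 32, 128, 128), (1, 64, 64, 64), (1, 160, 32, 32), (1, 256, 16, 16)]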
class SegformerPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.

    Attributes:
        config_class (SegformerConfig): Configuration class defining the parameters of the model.
        base_model_prefix (str): Prefix used when naming the base model.
        main_input_name (str): Name of the main input expected by the model.
    """

    config_class = SegformerConfig
    base_model_prefix = "segformer"
    main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights of the given module."""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)

@add_start_docstrings(
    "The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
    SEGFORMER_START_DOCSTRING,
)
class SegformerModel(SegformerPreTrainedModel):
    def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = SegformerEncoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
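# A hedged usage sketch of SegformerModel with the nvidia/mit-b0 checkpoint referenced above; a random
# tensor stands in for a preprocessed image (SegformerImageProcessor would normally produce it), and the
# last hidden state matches the documented shape (1, 256, 16, 16) for a 512x512 input.
import torch
from transformers import SegformerModel

model = SegformerModel.from_pretrained("nvidia/mit-b0")
pixel_values = torch.randn(1, 3, 512, 512)  # stand-in for processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 256, 16, 16])
print(len(outputs.hidden_states))       # 4, one feature map per stage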
@add_start_docstrings(
"""
SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden
states) e.g. for ImageNet.
""",
SEGFORMER_START_DOCSTRING,
)
class SegformerForImageClassification(SegformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.segformer = SegformerModel(config)
self.classifier = nn.Linear(config.hidden_sizes[-1], config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=SegFormerImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SegFormerImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.segformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
batch_size = sequence_output.shape[0]
if self.config.reshape_last_stage:
sequence_output = sequence_output.permute(0, 2, 3, 1)
sequence_output = sequence_output.reshape(batch_size, -1, self.config.hidden_sizes[-1])
sequence_output = sequence_output.mean(dim=1)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SegFormerImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
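# A hedged usage sketch for image classification with the _IMAGE_CLASS_CHECKPOINT above; the final feature
# map is flattened and mean-pooled before the linear classifier, so the logits have shape
# (batch_size, config.num_labels), typically (1, 1000) for the ImageNet head of nvidia/mit-b0.
import torch
from transformers import SegformerForImageClassification

model = SegformerForImageClassification.from_pretrained("nvidia/mit-b0")
pixel_values = torch.randn(1, 3, 512, 512)  # stand-in for a preprocessed image
with torch.no_grad():
    logits = model(pixel_values=pixel_values).logits
print(logits.shape)                                   # e.g. torch.Size([1, 1000])
print(model.config.id2label[int(logits.argmax(-1))])  # predicted class name (arbitrary here, the input is random)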
class SegformerMLP(nn.Module):
"""
Linear Embedding.
"""
def __init__(self, config: SegformerConfig, input_dim):
super().__init__()
self.proj = nn.Linear(input_dim, config.decoder_hidden_size)
def forward(self, hidden_states: torch.Tensor):
hidden_states = hidden_states.flatten(2).transpose(1, 2)
hidden_states = self.proj(hidden_states)
return hidden_states
class SegformerDecodeHead(SegformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
mlps = []
for i in range(config.num_encoder_blocks):
mlp = SegformerMLP(config, input_dim=config.hidden_sizes[i])
mlps.append(mlp)
self.linear_c = nn.ModuleList(mlps)
self.linear_fuse = nn.Conv2d(
in_channels=config.decoder_hidden_size * config.num_encoder_blocks,
out_channels=config.decoder_hidden_size,
kernel_size=1,
bias=False,
)
self.batch_norm = nn.BatchNorm2d(config.decoder_hidden_size)
self.activation = nn.ReLU()
self.dropout = nn.Dropout(config.classifier_dropout_prob)
self.classifier = nn.Conv2d(config.decoder_hidden_size, config.num_labels, kernel_size=1)
self.config = config
def forward(self, encoder_hidden_states: torch.FloatTensor) -> torch.Tensor:
batch_size = encoder_hidden_states[-1].shape[0]
all_hidden_states = ()
for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c):
if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3:
height = width = int(math.sqrt(encoder_hidden_state.shape[-1]))
encoder_hidden_state = (
encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
)
height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3]
encoder_hidden_state = mlp(encoder_hidden_state)
encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1)
encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width)
encoder_hidden_state = nn.functional.interpolate(
encoder_hidden_state, size=encoder_hidden_states[0].size()[2:], mode="bilinear", align_corners=False
)
all_hidden_states += (encoder_hidden_state,)
hidden_states = self.linear_fuse(torch.cat(all_hidden_states[::-1], dim=1))
hidden_states = self.batch_norm(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
return logits
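# A hedged sketch of the all-MLP decode head (default config values and 150 ADE20k-style labels assumed):
# each stage's feature map is projected to decoder_hidden_size, upsampled to the stage-1 resolution,
# concatenated and fused by a 1x1 convolution before the per-pixel classifier.
import torch
from transformers import SegformerConfig

config = SegformerConfig(num_labels=150)
decode_head = SegformerDecodeHead(config)
features = [
    torch.randn(1, 32, 128, 128),
    torch.randn(1, 64, 64, 64),
    torch.randn(1, 160, 32, 32),
    torch.randn(1, 256, 16, 16),
]
with torch.no_grad():
    print(decode_head(features).shape)  # torch.Size([1, 150, 128, 128]): logits at 1/4 of the input resolution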
@add_start_docstrings(
"""SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""",
SEGFORMER_START_DOCSTRING,
)
class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.segformer = SegformerModel(config)
self.decode_head = SegformerDecodeHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\segformer\modeling_tf_segformer.py
from __future__ import annotations
import math
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...file_utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ...modeling_tf_outputs import TFBaseModelOutput, TFSemanticSegmenterOutput, TFSequenceClassifierOutput
from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import logging
from .configuration_segformer import SegformerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SegformerConfig"
_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
_EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16]
_IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"nvidia/segformer-b0-finetuned-ade-512-512",
]
class TFSegformerDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
(1) github.com:rwightman/pytorch-image-models
"""
def __init__(self, drop_path: float, **kwargs):
super().__init__(**kwargs)
self.drop_path = drop_path
def call(self, x: tf.Tensor, training=None):
if training:
keep_prob = 1 - self.drop_path
shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
random_tensor = tf.floor(random_tensor)
return (x / keep_prob) * random_tensor
return x
class TFSegformerOverlapPatchEmbeddings(keras.layers.Layer):
"""Construct the overlapping patch embeddings."""
def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs):
super().__init__(**kwargs)
self.padding = keras.layers.ZeroPadding2D(padding=patch_size // 2)
self.proj = keras.layers.Conv2D(
filters=hidden_size, kernel_size=patch_size, strides=stride, padding="VALID", name="proj"
)
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm")
self.num_channels = num_channels
self.hidden_size = hidden_size
def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]:
embeddings = self.proj(self.padding(pixel_values))
height = shape_list(embeddings)[1]
width = shape_list(embeddings)[2]
hidden_dim = shape_list(embeddings)[3]
embeddings = tf.reshape(embeddings, (-1, height * width, hidden_dim))
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "proj", None) is not None:
with tf.name_scope(self.proj.name):
self.proj.build([None, None, None, self.num_channels])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.hidden_size])
class TFSegformerEfficientSelfAttention(keras.layers.Layer):
    """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the
    [PvT paper](https://arxiv.org/abs/2102.12122)."""

    def call(
        self,
        hidden_states: tf.Tensor,
height: int,
width: int,
output_attentions: bool = False,
training: bool = False,
) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]:
batch_size = shape_list(hidden_states)[0]
num_channels = shape_list(hidden_states)[2]
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.sr_ratio > 1:
hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
hidden_states = self.sr(hidden_states)
hidden_states = tf.reshape(hidden_states, (batch_size, -1, num_channels))
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
scale = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
attention_scores = tf.divide(attention_scores, scale)
attention_probs = stable_softmax(logits=attention_scores, axis=-1)
attention_probs = self.dropout(attention_probs, training=training)
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size))
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.hidden_size])
if getattr(self, "sr", None) is not None:
with tf.name_scope(self.sr.name):
self.sr.build([None, None, None, self.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.hidden_size])
class TFSegformerSelfOutput(keras.layers.Layer):
def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(hidden_size, name="dense")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.hidden_size = hidden_size
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.hidden_size])
class TFSegformerAttention(keras.layers.Layer):
def __init__(
self,
config: SegformerConfig,
hidden_size: int,
num_attention_heads: int,
sequence_reduction_ratio: int,
**kwargs,
):
super().__init__(**kwargs)
self.self = TFSegformerEfficientSelfAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
name="self",
)
self.dense_output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output")
def call(
self, hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False
) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]:
self_outputs = self.self(hidden_states, height, width, output_attentions)
attention_output = self.dense_output(self_outputs[0])
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self", None) is not None:
with tf.name_scope(self.self.name):
self.self.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFSegformerDWConv(keras.layers.Layer):
def __init__(self, dim: int = 768, **kwargs):
super().__init__(**kwargs)
self.depthwise_convolution = keras.layers.Conv2D(
filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv"
)
self.dim = dim
def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor:
batch_size = shape_list(hidden_states)[0]
num_channels = shape_list(hidden_states)[-1]
hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
hidden_states = self.depthwise_convolution(hidden_states)
new_height = shape_list(hidden_states)[1]
new_width = shape_list(hidden_states)[2]
num_channels = shape_list(hidden_states)[3]
hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels))
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "depthwise_convolution", None) is not None:
with tf.name_scope(self.depthwise_convolution.name):
self.depthwise_convolution.build([None, None, None, self.dim])
class TFSegformerMixFFN(keras.layers.Layer):
def __init__(
self,
config: SegformerConfig,
in_features: int,
hidden_features: int = None,
out_features: int = None,
**kwargs,
):
super().__init__(**kwargs)
out_features = out_features or in_features
self.dense1 = keras.layers.Dense(hidden_features, name="dense1")
self.depthwise_convolution = TFSegformerDWConv(hidden_features, name="dwconv")
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.dense2 = keras.layers.Dense(out_features, name="dense2")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.hidden_features = hidden_features
self.in_features = in_features
def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
hidden_states = self.dense1(hidden_states)
hidden_states = self.depthwise_convolution(hidden_states, height, width)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.dense2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense1", None) is not None:
with tf.name_scope(self.dense1.name):
self.dense1.build([None, None, self.in_features])
if getattr(self, "depthwise_convolution", None) is not None:
with tf.name_scope(self.depthwise_convolution.name):
self.depthwise_convolution.build(None)
if getattr(self, "dense2", None) is not None:
with tf.name_scope(self.dense2.name):
self.dense2.build([None, None, self.hidden_features])
class TFSegformerLayer(keras.layers.Layer):
"""This corresponds to the Block class in the original implementation."""
def __init__(
self,
config,
hidden_size: int,
num_attention_heads: int,
drop_path: float,
sequence_reduction_ratio: int,
mlp_ratio: int,
**kwargs,
):
super().__init__(**kwargs)
self.layer_norm_1 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1")
self.attention = TFSegformerAttention(
config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
name="attention",
)
self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else keras.layers.Activation("linear")
self.layer_norm_2 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2")
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp")
self.hidden_size = hidden_size
def call(
self,
hidden_states: tf.Tensor,
height: int,
width: int,
output_attentions: bool = False,
training: bool = False,
) -> Tuple:
self_attention_outputs = self.attention(
self.layer_norm_1(hidden_states),
height,
width,
output_attentions=output_attentions,
training=training,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output, training=training)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
mlp_output = self.drop_path(mlp_output, training=training)
layer_output = mlp_output + hidden_states
outputs = (layer_output,) + outputs
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norm_1", None) is not None:
with tf.name_scope(self.layer_norm_1.name):
self.layer_norm_1.build([None, None, self.hidden_size])
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "layer_norm_2", None) is not None:
with tf.name_scope(self.layer_norm_2.name):
self.layer_norm_2.build([None, None, self.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
class TFSegformerEncoder(keras.layers.Layer):
def __init__(self, config: SegformerConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
drop_path_decays = [x.numpy() for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
embeddings = []
for i in range(config.num_encoder_blocks):
embeddings.append(
TFSegformerOverlapPatchEmbeddings(
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
name=f"patch_embeddings.{i}",
)
)
self.embeddings = embeddings
blocks = []
cur = 0
for i in range(config.num_encoder_blocks):
layers = []
if i != 0:
cur += config.depths[i - 1]
for j in range(config.depths[i]):
layers.append(
TFSegformerLayer(
config,
hidden_size=config.hidden_sizes[i],
num_attention_heads=config.num_attention_heads[i],
drop_path=drop_path_decays[cur + j],
sequence_reduction_ratio=config.sr_ratios[i],
mlp_ratio=config.mlp_ratios[i],
name=f"block.{i}.{j}",
)
)
blocks.append(layers)
self.block = blocks
self.layer_norms = [
keras.layers.LayerNormalization(epsilon=1e-05, name=f"layer_norm.{i}")
for i in range(config.num_encoder_blocks)
]
def call(
self,
pixel_values: tf.Tensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
training: bool = False,
) -> Union[Tuple, TFBaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = shape_list(pixel_values)[0]
hidden_states = pixel_values
for idx, x in enumerate(zip(self.embeddings, self.block, self.layer_norms)):
embedding_layer, block_layer, norm_layer = x
hidden_states, height, width = embedding_layer(hidden_states)
for i, blk in enumerate(block_layer):
layer_outputs = blk(
hidden_states,
height,
width,
output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
hidden_states = norm_layer(hidden_states)
if idx != len(self.embeddings) - 1 or (idx == len(self.embeddings) - 1 and self.config.reshape_last_stage):
num_channels = shape_list(hidden_states)[-1]
hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer_norms", None) is not None:
for layer, shape in zip(self.layer_norms, self.config.hidden_sizes):
with tf.name_scope(layer.name):
layer.build([None, None, shape])
if getattr(self, "block", None) is not None:
for block in self.block:
for layer in block:
with tf.name_scope(layer.name):
layer.build(None)
if getattr(self, "embeddings", None) is not None:
for layer in self.embeddings:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFSegformerMainLayer(keras.layers.Layer):
config_class = SegformerConfig
def __init__(self, config: SegformerConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.encoder = TFSegformerEncoder(config, name="encoder")
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[Tuple, TFBaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
encoder_outputs = self.encoder(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = encoder_outputs[0]
sequence_output = tf.transpose(sequence_output, perm=[0, 3, 1, 2])
if output_hidden_states:
hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]])
if not return_dict:
if tf.greater(len(encoder_outputs[1:]), 0):
transposed_encoder_outputs = tuple(tf.transpose(v, perm=[0, 3, 1, 2]) for v in encoder_outputs[1:][0])
return (sequence_output,) + (transposed_encoder_outputs,)
else:
return (sequence_output,) + encoder_outputs[1:]
return TFBaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
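# A hedged sketch of the channel-format handling (default config assumed): the main layer accepts NCHW
# pixel values like the PyTorch model, transposes them to NHWC for the Keras convolutions, and transposes
# the outputs back to NCHW so both frameworks expose the same shapes.
import tensorflow as tf
from transformers import SegformerConfig

main_layer = TFSegformerMainLayer(SegformerConfig())
outputs = main_layer(tf.random.normal((1, 3, 512, 512)))
print(outputs.last_hidden_state.shape)  # (1, 256, 16, 16)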
class TFSegformerPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SegformerConfig
base_model_prefix = "segformer"
main_input_name = "pixel_values"
@property
def input_signature(self):
return {"pixel_values": tf.TensorSpec(shape=(None, self.config.num_channels, 512, 512), dtype=tf.float32)}
"""
定义了 SEGFORMER_START_DOCSTRING,包含了关于模型继承和参数配置的详细描述文档。
"""
SEGFORMER_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
Parameters:
config ([`SegformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
"""
定义了 SEGFORMER_INPUTS_DOCSTRING,包含了模型输入参数的详细描述文档。
"""
SEGFORMER_INPUTS_DOCSTRING = r"""
Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`SegformerImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
"""
添加了模型描述的文档字符串,并调用了 `add_start_docstrings` 装饰器,将模型简介和参数文档串联起来。
"""
@add_start_docstrings(
"The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
SEGFORMER_START_DOCSTRING,
)
class TFSegformerModel(TFSegformerPreTrainedModel):
"""
TFSegformerModel 类继承自 TFSegformerPreTrainedModel,表示一个基础的 SegFormer 编码器(混合Transformer),
输出未经特定顶部处理的原始隐藏状态。
Args:
config (SegformerConfig): 包含模型所有参数的配置类。使用配置文件初始化时,不会加载与模型关联的权重,只会加载配置。
查看 `~TFPreTrainedModel.from_pretrained` 方法以加载模型权重。
"""
def __init__(self, config: SegformerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.config = config
self.segformer = TFSegformerMainLayer(config, name="segformer")
"""
添加了文档字符串到模型的前向传播方法,描述了输入参数的详细信息。
"""
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def call(
self,
pixel_values: tf.Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[Tuple, TFBaseModelOutput]:
outputs = self.segformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
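# A hedged usage sketch of TFSegformerModel, mirroring the PyTorch example above; `from_pt=True` may be
# needed if the hub repository only ships PyTorch weights for this checkpoint.
import tensorflow as tf
from transformers import TFSegformerModel

model = TFSegformerModel.from_pretrained("nvidia/mit-b0")
outputs = model(pixel_values=tf.random.normal((1, 3, 512, 512)))
print(outputs.last_hidden_state.shape)  # (1, 256, 16, 16)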
"""
SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden
states) e.g. for ImageNet.
"""
class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: SegformerConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.segformer = TFSegformerMainLayer(config, name="segformer")
self.classifier = keras.layers.Dense(config.num_labels, name="classifier")
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def call(
self,
pixel_values: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TFSequenceClassifierOutput]:
outputs = self.segformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
batch_size = shape_list(sequence_output)[0]
sequence_output = tf.transpose(sequence_output, perm=[0, 2, 3, 1])
sequence_output = tf.reshape(sequence_output, (batch_size, -1, self.config.hidden_sizes[-1]))
sequence_output = tf.reduce_mean(sequence_output, axis=1)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
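# A hedged usage sketch of the TF classification head: the final feature map is mean-pooled over the spatial
# positions before the dense classifier, so the logits have shape (batch_size, config.num_labels).
# `from_pt=True` may be needed if the hub repository only ships PyTorch weights.
import tensorflow as tf
from transformers import TFSegformerForImageClassification

model = TFSegformerForImageClassification.from_pretrained("nvidia/mit-b0")
logits = model(pixel_values=tf.random.normal((1, 3, 512, 512))).logits
print(logits.shape)  # e.g. (1, 1000) for the ImageNet head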
"""
    def __init__(self, input_dim: int, config: SegformerConfig, **kwargs):
        super().__init__(**kwargs)  # call the parent constructor, forwarding any extra keyword arguments
        self.proj = keras.layers.Dense(config.decoder_hidden_size, name="proj")  # dense layer used for the projection
        self.input_dim = input_dim  # record the input dimension

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        height = shape_list(hidden_states)[1]  # height of the hidden_states tensor
        width = shape_list(hidden_states)[2]  # width of the hidden_states tensor
        hidden_dim = shape_list(hidden_states)[-1]  # last (hidden) dimension of the hidden_states tensor
        hidden_states = tf.reshape(hidden_states, (-1, height * width, hidden_dim))  # flatten the spatial dimensions
        hidden_states = self.proj(hidden_states)  # apply the projection
        return hidden_states  # return the projected hidden states

    def build(self, input_shape=None):
        if self.built:  # if the layer has already been built, return immediately
            return
        self.built = True  # mark the layer as built
        if getattr(self, "proj", None) is not None:  # if the projection layer exists
            with tf.name_scope(self.proj.name):  # use the projection layer's name as the name scope
                self.proj.build([None, None, self.input_dim])  # build the projection layer with its input dimension
# Decode head for the Segformer model, inheriting from TFSegformerPreTrainedModel.
class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
    def __init__(self, config: SegformerConfig, **kwargs):
        super().__init__(config, **kwargs)
        # one MLP per encoder block, each projecting that stage's features to decoder_hidden_size
        mlps = []
        for i in range(config.num_encoder_blocks):
            mlp = TFSegformerMLP(config=config, input_dim=config.hidden_sizes[i], name=f"linear_c.{i}")
            mlps.append(mlp)
        self.mlps = mlps
        # fusion layer (the ConvModule of the original implementation): a 1x1 convolution over the concatenated features
        self.linear_fuse = keras.layers.Conv2D(
            filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse"
        )
        # batch normalization, ReLU activation and dropout applied after the fusion
        self.batch_norm = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="batch_norm")
        self.activation = keras.layers.Activation("relu")
        self.dropout = keras.layers.Dropout(config.classifier_dropout_prob)
        # final 1x1 convolution producing one output channel per label
        self.classifier = keras.layers.Conv2D(filters=config.num_labels, kernel_size=1, name="classifier")
        self.config = config
    def call(self, encoder_hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        all_hidden_states = ()
        for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.mlps):
            # if the last stage was not reshaped, the hidden state is still (batch, seq_len, channels);
            # recover the spatial layout before processing it
            if self.config.reshape_last_stage is False and len(shape_list(encoder_hidden_state)) == 3:
                height = tf.math.sqrt(tf.cast(shape_list(encoder_hidden_state)[1], tf.float32))
                height = width = tf.cast(height, tf.int32)
                channel_dim = shape_list(encoder_hidden_state)[-1]
                encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim))
            # move the channel dimension to the last axis (NHWC)
            encoder_hidden_state = tf.transpose(encoder_hidden_state, perm=[0, 2, 3, 1])
            height, width = shape_list(encoder_hidden_state)[1:3]
            # project this stage's features to the decoder hidden size
            encoder_hidden_state = mlp(encoder_hidden_state)
            channel_dim = shape_list(encoder_hidden_state)[-1]
            encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim))
            # upsample to the resolution of the first (highest-resolution) stage
            temp_state = tf.transpose(encoder_hidden_states[0], perm=[0, 2, 3, 1])
            upsample_resolution = shape_list(temp_state)[1:-1]
            encoder_hidden_state = tf.image.resize(encoder_hidden_state, size=upsample_resolution, method="bilinear")
            all_hidden_states += (encoder_hidden_state,)
        # concatenate all stages (in reverse order) and fuse them with the 1x1 convolution
        hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=-1))
        hidden_states = self.batch_norm(hidden_states, training=training)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        # logits of shape (batch_size, height/4, width/4, num_labels)
        logits = self.classifier(hidden_states)
        return logits
    def build(self, input_shape=None):
        # avoid building the layer twice
        if self.built:
            return
        self.built = True
        # build the fusion convolution on the concatenated channel dimension
        if getattr(self, "linear_fuse", None) is not None:
            with tf.name_scope(self.linear_fuse.name):
                self.linear_fuse.build(
                    [None, None, None, self.config.decoder_hidden_size * self.config.num_encoder_blocks]
                )
        # build the batch normalization layer
        if getattr(self, "batch_norm", None) is not None:
            with tf.name_scope(self.batch_norm.name):
                self.batch_norm.build([None, None, None, self.config.decoder_hidden_size])
        # build the classifier
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, None, self.config.decoder_hidden_size])
        # build each per-stage MLP (no specific input shape is required)
        if getattr(self, "mlps", None) is not None:
            for layer in self.mlps:
                with tf.name_scope(layer.name):
                    layer.build(None)
# SegFormer Model transformer with an all-MLP decode head on top, e.g. for ADE20k or CityScapes.
# Inherits from TFSegformerPreTrainedModel.
class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
    def __init__(self, config: SegformerConfig, **kwargs):
        super().__init__(config, **kwargs)
        # SegFormer main layer and the all-MLP decode head
        self.segformer = TFSegformerMainLayer(config, name="segformer")
        self.decode_head = TFSegformerDecodeHead(config, name="decode_head")

    def hf_compute_loss(self, logits, labels):
        # upsample the logits to the original image size; `labels` has shape (batch_size, height, width)
        label_interp_shape = shape_list(labels)[1:]
        upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear")
        # per-pixel sparse cross-entropy, masked afterwards
        loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

        def masked_loss(real, pred):
            unmasked_loss = loss_fct(real, pred)
            # mask out pixels whose label equals self.config.semantic_loss_ignore_index
            mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype)
            masked_loss = unmasked_loss * mask
            # average the loss over the unmasked pixels only
            reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask)
            return tf.reshape(reduced_masked_loss, (1,))

        return masked_loss(labels, upsampled_logits)
@unpack_inputs
# Add the standard forward docstring, formatted for "batch_size, sequence_length" inputs
@add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# Document the return type as TFSemanticSegmenterOutput, using the _CONFIG_FOR_DOC config class
@replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Fall back to the config defaults when return_dict / output_hidden_states are not provided
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
outputs = self.segformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=True, # we need the intermediate hidden states
return_dict=return_dict,
)
# The segformer call above runs the encoder on pixel_values and always returns the
# intermediate hidden states, which the decode head consumes below
encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
# Compute the segmentation logits from the encoder hidden states
logits = self.decode_head(encoder_hidden_states)
loss = None
if labels is not None:
if not self.config.num_labels > 1:
raise ValueError("The number of labels should be greater than one")
else:
loss = self.hf_compute_loss(logits=logits, labels=labels)
# Transpose the logits to (batch_size, num_labels, height, width) to keep the API consistent
logits = tf.transpose(logits, perm=[0, 3, 1, 2])
if not return_dict:
if output_hidden_states:
output = (logits,) + outputs[1:]
else:
output = (logits,) + outputs[2:]
# When return_dict is False, return the logits plus either the hidden states or the attentions,
# prepending the loss when it was computed
return ((loss,) + output) if loss is not None else output
return TFSemanticSegmenterOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)
# Build method: creates the sub-layers the first time the model is built
def build(self, input_shape=None):
# Return early if the model has already been built
if self.built:
return
self.built = True
# Build the SegFormer main layer, if present
if getattr(self, "segformer", None) is not None:
with tf.name_scope(self.segformer.name):
self.segformer.build(None)
# Build the decode head, if present
if getattr(self, "decode_head", None) is not None:
with tf.name_scope(self.decode_head.name):
self.decode_head.build(None)
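For reference, a typical usage sketch of the TensorFlow model (assuming TensorFlow is installed and the `nvidia/segformer-b0-finetuned-ade-512-512` checkpoint can be downloaded):

```python
import requests
from PIL import Image
from transformers import SegformerImageProcessor, TFSegformerForSemanticSegmentation

processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="tf")
outputs = model(**inputs)
print(outputs.logits.shape)  # (1, num_labels, height / 4, width / 4)
```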
.\models\segformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_segformer": ["SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegformerConfig", "SegformerOnnxConfig"]
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_segformer"] = ["SegformerFeatureExtractor"]
_import_structure["image_processing_segformer"] = ["SegformerImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_segformer"] = [
"SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"SegformerDecodeHead",
"SegformerForImageClassification",
"SegformerForSemanticSegmentation",
"SegformerLayer",
"SegformerModel",
"SegformerPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_segformer"] = [
"TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFSegformerDecodeHead",
"TFSegformerForImageClassification",
"TFSegformerForSemanticSegmentation",
"TFSegformerModel",
"TFSegformerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig, SegformerOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_segformer import SegformerFeatureExtractor
from .image_processing_segformer import SegformerImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_segformer import (
SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
SegformerDecodeHead,
SegformerForImageClassification,
SegformerForSemanticSegmentation,
SegformerLayer,
SegformerModel,
SegformerPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_segformer import (
TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TFSegformerDecodeHead,
TFSegformerForImageClassification,
TFSegformerForSemanticSegmentation,
TFSegformerModel,
TFSegformerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
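A small sketch of what the lazy module buys: importing the package only registers the import structure above, and a heavy submodule (and its backend, e.g. TensorFlow) is imported only when one of its exported names is first accessed:

```python
# Importing the package is cheap; nothing heavy is loaded yet.
from transformers.models import segformer

config_cls = segformer.SegformerConfig  # triggers the import of configuration_segformer
# Accessing segformer.TFSegformerModel would import modeling_tf_segformer (and TensorFlow)
# only at that point, assuming TensorFlow is installed.
```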
.\models\seggpt\configuration_seggpt.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"BAAI/seggpt-vit-large": "https://huggingface.co/BAAI/seggpt-vit-large/resolve/main/config.json",
}
class SegGptConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SegGptModel`]. It is used to instantiate a SegGPT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the SegGPT
[BAAI/seggpt-vit-large](https://huggingface.co/BAAI/seggpt-vit-large) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If a string,
`"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
image_size (`List[int]`, *optional*, defaults to `[896, 448]`):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries, keys and values.
mlp_dim (`int`, *optional*):
The dimensionality of the MLP layer in the Transformer encoder. If unset, defaults to `hidden_size * 4`.
drop_path_rate (`float`, *optional*, defaults to 0.1):
The drop path rate for the dropout layers.
pretrain_image_size (`int`, *optional*, defaults to 224):
The pretrained size of the absolute position embeddings.
decoder_hidden_size (`int`, *optional*, defaults to 64):
Hidden size for decoder.
use_relative_position_embeddings (`bool`, *optional*, defaults to `True`):
Whether to use relative position embeddings in the attention layers.
merge_index (`int`, *optional*, defaults to 2):
The index of the encoder layer to merge the embeddings.
intermediate_hidden_state_indices (`List[int]`, *optional*, defaults to `[5, 11, 17, 23]`):
The indices of the encoder layers which we store as features for the decoder.
beta (`float`, *optional*, defaults to 0.01):
Regularization factor for SegGptLoss (smooth-l1 loss).
Example:
```
>>> from transformers import SegGptConfig, SegGptModel
>>> # Initializing a SegGPT seggpt-vit-large style configuration
>>> configuration = SegGptConfig()
>>> # Initializing a model (with random weights) from the seggpt-vit-large style configuration
>>> model = SegGptModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
.\models\seggpt\convert_seggpt_to_hf.py
"""Convert SegGPT checkpoints from the original repository.
URL: https://github.com/baaivision/Painter/tree/main/SegGPT
"""
import argparse
import requests
import torch
from PIL import Image
from transformers import SegGptConfig, SegGptForImageSegmentation, SegGptImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("patch_embed.proj.weight", "model.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("patch_embed.proj.bias", "model.embeddings.patch_embeddings.projection.bias"))
rename_keys.append(("mask_token", "model.embeddings.mask_token"))
rename_keys.append(("segment_token_x", "model.embeddings.segment_token_input"))
rename_keys.append(("segment_token_y", "model.embeddings.segment_token_prompt"))
rename_keys.append(("type_token_cls", "model.embeddings.type_token_semantic"))
rename_keys.append(("type_token_ins", "model.embeddings.type_token_instance"))
rename_keys.append(("pos_embed", "model.embeddings.position_embeddings"))
rename_keys.append(("norm.weight", "model.encoder.layernorm.weight"))
rename_keys.append(("norm.bias", "model.encoder.layernorm.bias"))
rename_keys.append(("decoder_embed.weight", "decoder.decoder_embed.weight"))
rename_keys.append(("decoder_embed.bias", "decoder.decoder_embed.bias"))
rename_keys.append(("decoder_pred.0.weight", "decoder.decoder_pred.conv.weight"))
rename_keys.append(("decoder_pred.0.bias", "decoder.decoder_pred.conv.bias"))
rename_keys.append(("decoder_pred.1.weight", "decoder.decoder_pred.layernorm.weight"))
rename_keys.append(("decoder_pred.1.bias", "decoder.decoder_pred.layernorm.bias"))
rename_keys.append(("decoder_pred.3.weight", "decoder.decoder_pred.head.weight"))
rename_keys.append(("decoder_pred.3.bias", "decoder.decoder_pred.head.bias"))
for i in range(config.num_hidden_layers):
rename_keys.append((f"blocks.{i}.attn.qkv.weight", f"model.encoder.layers.{i}.attention.qkv.weight"))
rename_keys.append((f"blocks.{i}.attn.qkv.bias", f"model.encoder.layers.{i}.attention.qkv.bias"))
rename_keys.append((f"blocks.{i}.attn.proj.weight", f"model.encoder.layers.{i}.attention.proj.weight"))
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"model.encoder.layers.{i}.attention.proj.bias"))
rename_keys.append((f"blocks.{i}.attn.rel_pos_h", f"model.encoder.layers.{i}.attention.rel_pos_h"))
rename_keys.append((f"blocks.{i}.attn.rel_pos_w", f"model.encoder.layers.{i}.attention.rel_pos_w"))
rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"model.encoder.layers.{i}.mlp.lin1.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"model.encoder.layers.{i}.mlp.lin1.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"model.encoder.layers.{i}.mlp.lin2.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"model.encoder.layers.{i}.mlp.lin2.bias"))
rename_keys.append((f"blocks.{i}.norm1.weight", f"model.encoder.layers.{i}.layernorm_before.weight"))
rename_keys.append((f"blocks.{i}.norm1.bias", f"model.encoder.layers.{i}.layernorm_before.bias"))
rename_keys.append((f"blocks.{i}.norm2.weight", f"model.encoder.layers.{i}.layernorm_after.weight"))
rename_keys.append((f"blocks.{i}.norm2.bias", f"model.encoder.layers.{i}.layernorm_after.bias"))
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_input():
image_input_url = (
"https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
)
image_prompt_url = (
"https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
)
mask_prompt_url = (
"https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"
)
image_input = Image.open(requests.get(image_input_url, stream=True).raw)
image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw)
return image_input, image_prompt, mask_prompt
@torch.no_grad()
def convert_seggpt_checkpoint(args):
model_name = args.model_name
pytorch_dump_folder_path = args.pytorch_dump_folder_path
verify_logits = args.verify_logits
push_to_hub = args.push_to_hub
config = SegGptConfig()
checkpoint_url = "https://huggingface.co/BAAI/SegGpt/blob/main/seggpt_vit_large.pth"
original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
new_state_dict = original_state_dict.copy()
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(new_state_dict, src, dest)
model = SegGptForImageSegmentation(config)
model.eval()
missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)
input_img, prompt_img, prompt_mask = prepare_input()
image_processor = SegGptImageProcessor()
inputs = image_processor(images=input_img, prompt_images=prompt_img, prompt_masks=prompt_mask, return_tensors="pt")
expected_prompt_pixel_values = torch.tensor(
[
[[-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965]],
[[1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583]],
[[2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088]],
]
)
expected_pixel_values = torch.tensor(
[
[[1.6324, 1.6153, 1.5810], [1.6153, 1.5982, 1.5810], [1.5810, 1.5639, 1.5639]],
[[1.2731, 1.2556, 1.2206], [1.2556, 1.2381, 1.2031], [1.2206, 1.2031, 1.1681]],
[[1.6465, 1.6465, 1.6465], [1.6465, 1.6465, 1.6465], [1.6291, 1.6291, 1.6291]],
]
)
expected_prompt_masks = torch.tensor(
[
[[-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179]],
[[-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357]],
[[-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044]],
]
)
assert torch.allclose(inputs.pixel_values[0, :, :3, :3], expected_pixel_values, atol=1e-4)
assert torch.allclose(inputs.prompt_pixel_values[0, :, :3, :3], expected_prompt_pixel_values, atol=1e-4)
assert torch.allclose(inputs.prompt_masks[0, :, :3, :3], expected_prompt_masks, atol=1e-4)
torch.manual_seed(2)
outputs = model(**inputs)
print(outputs)
if verify_logits:
expected_output = torch.tensor(
[
[[-2.1208, -2.1190, -2.1198], [-2.1237, -2.1228, -2.1227], [-2.1232, -2.1226, -2.1228]],
[[-2.0405, -2.0396, -2.0403], [-2.0434, -2.0434, -2.0433], [-2.0428, -2.0432, -2.0434]],
[[-1.8102, -1.8088, -1.8099], [-1.8131, -1.8126, -1.8129], [-1.8130, -1.8128, -1.8131]],
]
)
assert torch.allclose(outputs.pred_masks[0, :, :3, :3], expected_output, atol=1e-4)
print("Looks good!")
else:
print("Converted without verifying logits")
if pytorch_dump_folder_path is not None:
print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
image_processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing model and processor for {model_name} to hub")
model.push_to_hub(f"EduardoPacheco/{model_name}")
image_processor.push_to_hub(f"EduardoPacheco/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="seggpt-vit-large",
type=str,
choices=["seggpt-vit-large"],
help="Name of the SegGpt model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--verify_logits",
action="store_false",
help="Whether or not to verify the logits against the original implementation.",
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_seggpt_checkpoint(args)
.\models\seggpt\image_processing_seggpt.py
"""Image processor class for SegGPT."""
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_channel_dimension_axis,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import TensorType, is_torch_available, logging, requires_backends
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
def build_palette(num_labels: int) -> List[Tuple[int, int]]:
base = int(num_labels ** (1 / 3)) + 1
margin = 256 // base
color_list = [(0, 0, 0)]
for location in range(num_labels):
num_seq_r = location // base**2
num_seq_g = (location % base**2) // base
num_seq_b = location % base
R = 255 - num_seq_r * margin
G = 255 - num_seq_g * margin
B = 255 - num_seq_b * margin
color_list.append((R, G, B))
return color_list
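A quick check of the palette construction (a sketch; the values follow directly from the formula above):

```python
# For num_labels=2: base = int(2 ** (1 / 3)) + 1 = 2 and margin = 256 // 2 = 128
print(build_palette(2))
# [(0, 0, 0), (255, 255, 255), (255, 255, 127)]
# index 0 is the reserved background colour; class colours count down from white in steps of `margin`
```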
def get_num_channels(image: np.ndarray, input_data_format: ChannelDimension) -> int:
if image.ndim == 2:
return 0
channel_idx = get_channel_dimension_axis(image, input_data_format)
return image.shape[channel_idx]
def mask_to_rgb(
mask: np.ndarray,
palette: Optional[List[Tuple[int, int]]] = None,
input_data_format: Optional[ChannelDimension] = None,
data_format: Optional[ChannelDimension] = None,
) -> np.ndarray:
if input_data_format is None and mask.ndim > 2:
input_data_format = infer_channel_dimension_format(mask)
data_format = data_format if data_format is not None else input_data_format
num_channels = get_num_channels(mask, input_data_format)
if num_channels == 3:
return to_channel_dimension_format(mask, data_format, input_data_format) if data_format is not None else mask
if palette is not None:
height, width = mask.shape
rgb_mask = np.zeros((3, height, width), dtype=np.uint8)
classes_in_mask = np.unique(mask)
for class_idx in classes_in_mask:
rgb_value = palette[class_idx]
class_mask = (mask == class_idx).astype(np.uint8)
class_mask = np.expand_dims(class_mask, axis=-1)
class_rgb_mask = class_mask * np.array(rgb_value)
class_rgb_mask = np.moveaxis(class_rgb_mask, -1, 0)
rgb_mask += class_rgb_mask.astype(np.uint8)
rgb_mask = np.clip(rgb_mask, 0, 255).astype(np.uint8)
else:
rgb_mask = np.repeat(mask[None, ...], 3, axis=0)
return (
to_channel_dimension_format(rgb_mask, data_format, input_data_format) if data_format is not None else rgb_mask
)
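A minimal sketch of the conversion on a 2x2 single-channel mask with a hypothetical two-entry palette:

```python
import numpy as np

mask = np.array([[0, 1], [1, 0]], dtype=np.uint8)
palette = [(0, 0, 0), (255, 255, 127)]  # hypothetical colours for background and class 1
rgb = mask_to_rgb(mask, palette=palette)
print(rgb.shape)  # (3, 2, 2): class-1 pixels become (255, 255, 127), background stays black
```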
r"""
Constructs a SegGpt image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
size (`dict`, *optional*, defaults to `{"height": 448, "width": 448}`):
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 448, "width": 448}
size = get_size_dict(size)
self.do_resize = do_resize
self.do_rescale = do_rescale
self.do_normalize = do_normalize
self.size = size
self.resample = resample
self.rescale_factor = rescale_factor
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
def get_palette(self, num_labels: int) -> List[Tuple[int, int]]:
"""Build a palette to map the prompt mask from a single channel to a 3 channel RGB.
Args:
num_labels (`int`):
Number of classes in the segmentation task (excluding the background).
Returns:
`List[Tuple[int, int]]`: Palette to map the prompt mask from a single channel to a 3 channel RGB.
"""
return build_palette(num_labels)
def mask_to_rgb(
self,
image: np.ndarray,
palette: Optional[List[Tuple[int, int]]] = None,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Convert a mask to RGB format.
Args:
image (`np.ndarray`):
Mask to convert to RGB format. If the mask is already in RGB format, it will be passed through.
palette (`List[Tuple[int, int]]`, *optional*, defaults to `None`):
Palette to use to convert the mask to RGB format. If unset, the mask is duplicated across the channel
dimension.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
Returns:
`np.ndarray`: The mask in RGB format.
"""
return mask_to_rgb(
image,
palette=palette,
data_format=data_format,
input_data_format=input_data_format,
)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def _preprocess_step(
self,
images: ImageInput,
is_mask: bool = False,
do_resize: Optional[bool] = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
num_labels: Optional[int] = None,
**kwargs,
):
pass
def preprocess(
self,
images: Optional[ImageInput] = None,
prompt_images: Optional[ImageInput] = None,
prompt_masks: Optional[ImageInput] = None,
do_resize: Optional[bool] = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
num_labels: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
pass
def post_process_semantic_segmentation(
self, outputs, target_sizes: Optional[List[Tuple[int, int]]] = None, num_labels: Optional[int] = None
):
pass
.\models\seggpt\modeling_seggpt.py
""" PyTorch SegGpt model."""
import collections.abc
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import functional as F
from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_seggpt import SegGptConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "SegGptConfig"
_CHECKPOINT_FOR_DOC = "BAAI/seggpt-vit-large"
_EXPECTED_OUTPUT_SHAPE = [3, 896, 448]
SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"BAAI/seggpt-vit-large",
]
@dataclass
class SegGptEncoderOutput(ModelOutput):
"""
Output type of [`SegGptEncoderOutput`].
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape `(batch_size, patch_height, patch_width, hidden_size)`.
attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
Tuple of *torch.FloatTensor* (one for each layer) of shape
`(batch_size, num_heads, seq_len, seq_len)`.
intermediate_hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.intermediate_hidden_state_indices` is set):
Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
Each element in the Tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
Additionaly, each feature passes through a LayerNorm.
"""
last_hidden_state: torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
intermediate_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class SegGptImageSegmentationOutput(ModelOutput):
"""
Output type of [`SegGptImageSegmentationOutput`].
Args:
loss (`torch.FloatTensor`, `optional`, returned when `labels` is provided):
The loss value.
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
The predicted masks.
hidden_states (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape `(batch_size, patch_height, patch_width, hidden_size)`.
attentions (`Tuple[torch.FloatTensor]`, `optional`, returned when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape
`(batch_size, num_heads, seq_len, seq_len)`.
"""
loss: Optional[torch.FloatTensor] = None
pred_masks: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class SegGptPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values):
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
return embeddings
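With the default SegGptConfig (image_size=[896, 448], patch_size=16, hidden_size=1024), the shapes work out as in this sketch:

```python
import torch

config = SegGptConfig()
patch_embed = SegGptPatchEmbeddings(config)

pixel_values = torch.randn(1, 3, 896, 448)
embeddings = patch_embed(pixel_values)
print(embeddings.shape)  # torch.Size([1, 56, 28, 1024]) -> (batch, patch_height, patch_width, hidden_size)
```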
class SegGptEmbeddings(nn.Module):
"""
Placeholder for SegGptEmbeddings class definition.
"""
Construct the embeddings from patch, position embeddings for input and prompt.
"""
# 定义一个名为SegGptEmbeddings的类,继承自父类nn.Module
def __init__(self, config: SegGptConfig) -> None:
super().__init__()
# 定义用于掩码的张量参数
self.mask_token = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 定义输入分段标记的张量参数
self.segment_token_input = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 定义提示分段标记的张量参数
self.segment_token_prompt = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 定义语义类型标记的张量参数
# token for seg types
self.type_token_semantic = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 定义实例类型标记的张量参数
self.type_token_instance = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
# 初始化图像块嵌入对象
self.patch_embeddings = SegGptPatchEmbeddings(config)
# 计算位置嵌入的数量
num_positions = (config.pretrain_image_size // config.patch_size) ** 2 + 1
# 定义位置嵌入的张量参数
self.position_embeddings = nn.Parameter(torch.randn(1, num_positions, config.hidden_size))
# 定义丢弃层
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# 定义一个插值位置编码的方法
def interpolate_pos_encoding(self, height: int, width: int) -> torch.Tensor:
# 获取位置编码中的图像块位置嵌入
patch_pos_embed = self.position_embeddings[:, 1:]
# 计算图像块的数量
num_patches = patch_pos_embed.shape[1]
# 计算预训练图像块大小的平方根
pretrain_patch_size = int(math.sqrt(num_patches))
# 如果预训练图像块大小与给定的高度或宽度不匹配,则进行插值处理
if pretrain_patch_size != height or pretrain_patch_size != width:
# 使用双三次插值方法对位置编码进行插值
patch_pos_embed = F.interpolate(
patch_pos_embed.reshape(1, pretrain_patch_size, pretrain_patch_size, -1).permute(0, 3, 1, 2),
size=(height, width),
mode="bicubic",
align_corners=False,
)
# 将插值后的位置编码张量进行维度调整,并返回
return patch_pos_embed.permute(0, 2, 3, 1)
else:
# 如果不需要插值,则直接返回原始的位置编码张量
return patch_pos_embed.reshape(1, height, width, -1)
# 定义前向传播方法
def forward(
self,
pixel_values: torch.Tensor,
prompt_pixel_values: torch.Tensor,
bool_masked_pos: Optional[torch.BoolTensor] = None,
embedding_type: Optional[str] = None,
# 继续定义其他参数
) -> torch.Tensor:
# 使用self.patch_embeddings方法将像素值转换为输入嵌入
input_embeddings = self.patch_embeddings(pixel_values)
# 使用self.patch_embeddings方法将提示像素值转换为提示嵌入
prompt_embeddings = self.patch_embeddings(prompt_pixel_values)
# 获取输入嵌入的维度信息
batch_size, patch_height, patch_width, _ = input_embeddings.shape
# 扩展mask_token以匹配输入嵌入的形状
mask_token = self.mask_token.expand(batch_size, patch_height, patch_width, -1)
# 使用bool_masked_pos创建一个掩码,将掩码处的视觉标记替换为mask_token
w = bool_masked_pos.unsqueeze(-1).type_as(mask_token).reshape(-1, patch_height, patch_width, 1)
prompt_embeddings = prompt_embeddings * (1 - w) + mask_token * w
# 如果未指定embedding_type,则默认为"instance"
embedding_type = embedding_type if embedding_type is not None else "instance"
# 添加位置编码到每个标记
pos_embed = self.interpolate_pos_encoding(patch_height, patch_width)
# 添加段标记到输入嵌入和提示嵌入
input_embeddings = input_embeddings + self.segment_token_input
prompt_embeddings = prompt_embeddings + self.segment_token_prompt
# 跳过CLS后,添加位置编码到输入嵌入和提示嵌入
input_embeddings = input_embeddings + pos_embed
prompt_embeddings = prompt_embeddings + pos_embed
# 根据embedding_type选择对应的类型嵌入
if embedding_type == "semantic":
type_embedding = self.type_token_semantic
elif embedding_type == "instance":
type_embedding = self.type_token_instance
else:
raise ValueError(f"Embedding type should be either 'semantic' or 'instance', but got {embedding_type}")
# 添加类型嵌入到输入嵌入和提示嵌入
input_embeddings = input_embeddings + type_embedding
prompt_embeddings = prompt_embeddings + type_embedding
# 将输入嵌入和提示嵌入连接起来形成最终的嵌入张量
embeddings = torch.cat((input_embeddings, prompt_embeddings), dim=0)
# 返回最终的嵌入张量
return embeddings
# Method of SegGptAttention:
def add_decomposed_rel_pos(
self,
attn: torch.Tensor,
query: torch.Tensor,
rel_pos_h: torch.Tensor,
rel_pos_w: torch.Tensor,
q_size: Tuple[int, int],
k_size: Tuple[int, int],
) -> torch.Tensor:
"""
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
Args:
attn (`torch.Tensor`):
attention map.
query (`torch.Tensor`):
query q in the attention layer with shape (batch_size, query_height * query_width, channel).
rel_pos_h (`torch.Tensor`):
relative position embeddings (Lh, channel) for height axis.
rel_pos_w (`torch.Tensor`):
relative position embeddings (Lw, channel) for width axis.
q_size (tuple):
spatial sequence size of query q with (query_height, query_width).
k_size (tuple):
spatial sequence size of key k with (key_height, key_width).
Returns:
attn (`torch.Tensor`):
attention map with added relative positional embeddings.
"""
# 解构 q_size 元组,获取查询张量的高度和宽度
query_height, query_width = q_size
# 解构 k_size 元组,获取键张量的高度和宽度
key_height, key_width = k_size
# 获取高度轴的相对位置编码,形状为 (batch_size, query_height, query_width, key_height, channel)
relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
# 获取宽度轴的相对位置编码,形状为 (batch_size, query_height, query_width, key_width, channel)
relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)
# 获取查询张量的批量大小、高度、宽度和维度
batch_size, _, dim = query.shape
# 将查询张量重塑为四维张量 (batch_size, query_height, query_width, dim)
reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
# 计算高度轴的相对位置编码与查询张量的乘积,形状为 (batch_size, query_height, query_width, key_height)
rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
# 计算宽度轴的相对位置编码与查询张量的乘积,形状为 (batch_size, query_height, query_width, key_width)
rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)
# 将注意力图重塑为五维张量 (batch_size, query_height, query_width, key_height, key_width)
attn = attn.reshape(batch_size, query_height, query_width, key_height, key_width)
# 将注意力图与高度轴和宽度轴的相对位置编码相加
attn = attn + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
# 将注意力图重塑为二维张量 (batch_size, query_height * query_width, key_height * key_width)
attn = attn.reshape(batch_size, query_height * query_width, key_height * key_width)
# 返回添加了相对位置编码的注意力图
return attn
def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor:
# 获取隐藏状态的形状信息,batch_size为批大小,height为高度,width为宽度,_为通道数
batch_size, height, width, _ = hidden_states.shape
# 使用self.qkv对隐藏状态进行qkv计算,结果形状为(3, batch_size, num_attention_heads, height * width, embed_dim)
qkv = (
self.qkv(hidden_states)
.reshape(batch_size, height * width, 3, self.num_attention_heads, -1) # 重塑形状以便后续操作
.permute(2, 0, 3, 1, 4) # 转置以便得到q, k, v分量
)
# 将qkv分解为query, key, value三个部分,形状为(batch_size * num_attention_heads, height * width, embed_dim)
query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)
# 计算注意力权重,形状为(batch_size * num_attention_heads, height * width, height * width)
attn_weights = (query * self.scale) @ key.transpose(-2, -1)
# 如果使用相对位置编码,则对注意力权重进行处理
if self.use_relative_position_embeddings:
attn_weights = self.add_decomposed_rel_pos(
attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
)
# 对注意力权重进行softmax操作,保留query的数据类型
attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
# 如果需要输出注意力权重,则进行特定的形状重塑操作,否则attn_weights_reshaped为None
if output_attentions:
# 这个操作有些笨拙,但是需要确保attn_weights保持其梯度。
# 为了做到这一点,attn_weights必须进行两次重塑,并且在接下来的使用中需要重用它们
attn_weights_reshaped = attn_weights.view(batch_size, self.num_attention_heads, height * width, -1)
attn_weights = attn_weights_reshaped.view(batch_size * self.num_attention_heads, height * width, -1)
else:
attn_weights_reshaped = None
# 计算注意力输出,形状为(batch_size, num_attention_heads, height, width, embed_dim)
attn_output = (attn_weights @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
# 调整输出的形状,使其变为(batch_size, height, width, num_attention_heads * embed_dim)
attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
# 对注意力输出进行投影,形状为(batch_size, height, width, embed_dim)
attn_output = self.proj(attn_output)
# 返回注意力输出和注意力权重的重塑形状(如果需要)
return (attn_output, attn_weights_reshaped)
# 从transformers.models.sam.modeling_sam.SamMLPBlock复制到SegGptMlp
class SegGptMlp(nn.Module):
def __init__(self, config):
super().__init__()
# 创建一个线性层,输入维度是config.hidden_size,输出维度是config.mlp_dim
self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
# 创建另一个线性层,输入维度是config.mlp_dim,输出维度是config.hidden_size
self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
# 选择激活函数,根据config.hidden_act从预定义的ACT2FN字典中选择对应的函数
self.act = ACT2FN[config.hidden_act]
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 对输入的hidden_states应用第一个线性层
hidden_states = self.lin1(hidden_states)
# 应用选择的激活函数
hidden_states = self.act(hidden_states)
# 对应用激活函数后的结果应用第二个线性层
hidden_states = self.lin2(hidden_states)
# 返回处理后的hidden_states作为输出
return hidden_states
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample, applied in the main path of residual blocks.
Comment by Ross Wightman: this is the same as the DropConnect implementation created for EfficientNet-style
networks, but the original name is misleading, as 'Drop Connect' is a different form of dropout from a
separate paper. See the discussion: https://github.com/tensorflow/tpu/issues/494
"""
if drop_prob == 0.0 or not training:
# If drop_prob is 0 or we are not training, return the input unchanged
return input
keep_prob = 1 - drop_prob
# One random value per sample, uniform in [keep_prob, 1 + keep_prob), then floored to 0 or 1
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()  # binarize the random tensor
# Scale the kept samples by 1 / keep_prob so the expected value of the output is unchanged
output = input.div(keep_prob) * random_tensor
return output
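A quick sanity sketch of the behaviour: in eval mode drop_path is the identity, while in training mode each sample is either zeroed or rescaled by 1 / keep_prob so the expectation is preserved:

```python
import torch

x = torch.ones(4, 2, 3)
print(torch.equal(drop_path(x, drop_prob=0.5, training=False), x))  # True: identity outside training

out = drop_path(x, drop_prob=0.5, training=True)
# Each of the 4 samples is now either all zeros or all 2.0 (= 1 / keep_prob), chosen independently
print(out[:, 0, 0])
```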
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->SegGpt
class SegGptDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample, applied in the main path of residual blocks."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Delegate to the drop_path function with this module's drop_prob and the current training flag
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class SegGptLayer(nn.Module):
def __init__(self, config: SegGptConfig, drop_path_rate: float) -> None:
super().__init__()
# 创建一个SegGptAttention对象,使用给定的config
self.attention = SegGptAttention(config)
# 创建一个SegGptMlp对象,使用给定的config
self.mlp = SegGptMlp(config)
# 如果drop_path_rate大于0.0,则创建一个SegGptDropPath对象,否则创建一个恒等映射(nn.Identity())
self.drop_path = SegGptDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
# 创建一个LayerNorm层,输入维度是config.hidden_size,epsilon值是config.layer_norm_eps
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 定义神经网络的前向传播方法,接收多个输入参数并返回一个或两个张量的元组
def forward(
self,
hidden_states: torch.Tensor,
ensemble_cond: int,
feature_ensemble: bool = False,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
# 使用 self.attention 方法进行自注意力计算,先对 hidden_states 进行 layernorm 处理
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states), # 在 SegGpt 中,在进行自注意力计算前先应用 layernorm
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0] # 提取自注意力计算的输出
outputs = self_attention_outputs[1:] # 如果需要输出注意力权重,则将其添加到 outputs 中
# 如果 feature_ensemble 为 True,且满足 ensemble_cond 条件
if feature_ensemble and attention_output.shape[0] // 2 >= ensemble_cond:
# 将 attention_output 拆分为 prompt 和 inputs
prompt, inputs = attention_output.split(attention_output.shape[1] // 2, dim=1)
# 如果 ensemble_cond 等于 2
if ensemble_cond == 2:
num_prompts = attention_output.shape[0] // 2
# 对 inputs 进行形状调整和均值计算
inputs = inputs.reshape(2, num_prompts, -1)
inputs = inputs.mean(dim=1, keepdim=True).expand_as(inputs)
inputs = inputs.reshape(*prompt.shape)
else:
# 对 inputs 进行均值计算和扩展
inputs = inputs.mean(dim=0, keepdim=True).expand_as(inputs)
# 拼接处理后的 prompt 和 inputs,并更新 attention_output
attention_output = torch.cat([prompt, inputs], dim=1)
# 第一个残差连接
hidden_states = self.drop_path(attention_output) + hidden_states
residual = hidden_states # 保存残差连接后的 hidden_states
# 在 self.layernorm_after 后应用 layernorm
hidden_states = self.layernorm_after(hidden_states)
# 通过 MLP 网络进行非线性变换
hidden_states = self.mlp(hidden_states)
# 第二个残差连接
hidden_states = residual + self.drop_path(hidden_states)
outputs = (hidden_states,) + outputs # 更新 outputs,添加最终的 hidden_states
return outputs # 返回前向传播的结果
class SegGptEncoder(nn.Module):
# SegGpt 编码器类,继承自 nn.Module
def __init__(self, config: SegGptConfig) -> None:
super().__init__()
self.config = config
# 生成一个从0到配置的 drop_path_rate 的线性序列,并转换为 Python 列表
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
# 创建包含多个 SegGptLayer 实例的 ModuleList,每个实例使用不同的 drop_path_rate
self.layers = nn.ModuleList([SegGptLayer(config, dpr[i]) for i in range(config.num_hidden_layers)])
# 创建 LayerNorm 层,用于规范化隐藏状态的尺寸,设置 epsilon 为 config.layer_norm_eps
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 是否开启梯度检查点功能,默认为 False
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
feature_ensemble: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, SegGptEncoderOutput]:
# 如果输出隐藏状态,则初始化一个空元组来存储所有的隐藏状态
all_hidden_states = () if output_hidden_states else None
# 如果输出注意力权重,则初始化一个空元组来存储所有的注意力权重
all_self_attentions = () if output_attentions else None
# 用于存储中间隐藏状态的列表
intermediate_hidden_states = []
# 遍历所有层
for i, layer_module in enumerate(self.layers):
# 如果输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 根据当前层的索引判断是否需要多个提示来进行集成
ensemble_cond = 2 if self.config.merge_index > i else 1
# 如果开启梯度检查点功能并且正在训练,则使用梯度检查点函数来执行当前层的调用
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
ensemble_cond,
feature_ensemble,
output_attentions,
)
else:
# 否则直接调用当前层的前向传播方法
layer_outputs = layer_module(hidden_states, ensemble_cond, feature_ensemble, output_attentions)
# 更新隐藏状态为当前层输出的第一个元素
hidden_states = layer_outputs[0]
# 如果当前层的索引等于配置的 merge_index,则执行合并操作
if i == self.config.merge_index:
hidden_states = (
hidden_states[: hidden_states.shape[0] // 2] + hidden_states[hidden_states.shape[0] // 2 :]
) * 0.5
# 如果当前层的索引在配置的 intermediate_hidden_state_indices 中,则将规范化后的隐藏状态添加到中间隐藏状态列表中
if i in self.config.intermediate_hidden_state_indices:
intermediate_hidden_states.append(self.layernorm(hidden_states))
# 如果输出注意力权重,则将当前层的注意力权重添加到 all_self_attentions 中
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# 如果输出隐藏状态,则将最后一个隐藏状态添加到 all_hidden_states 中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不返回字典,则返回一个元组,其中包含所有非空的结果项
if not return_dict:
return tuple(
v
for v in [hidden_states, all_hidden_states, all_self_attentions, intermediate_hidden_states]
if v is not None
)
# 否则返回 SegGptEncoderOutput 对象,包含最后的隐藏状态、所有隐藏状态、所有注意力权重和中间隐藏状态列表
return SegGptEncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
intermediate_hidden_states=intermediate_hidden_states,
)
# Copied from transformers.models.convnext.modeling_convnext.ConvNextLayerNorm with ConvNext->SegGpt
class SegGptLayerNorm(nn.Module):
r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
"""
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
super().__init__()
# Learnable scale and shift parameters
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.eps = eps
self.data_format = data_format
# Only the channels_last and channels_first layouts are supported
if self.data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError(f"Unsupported data format: {self.data_format}")
self.normalized_shape = (normalized_shape,)
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.data_format == "channels_last":
# For channels_last inputs, defer to the built-in functional layer_norm
x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
elif self.data_format == "channels_first":
# For channels_first inputs, normalize over the channel dimension manually
input_dtype = x.dtype
x = x.float()
u = x.mean(1, keepdim=True)  # per-position mean over channels
s = (x - u).pow(2).mean(1, keepdim=True)  # per-position variance over channels
x = (x - u) / torch.sqrt(s + self.eps)  # normalize
x = x.to(dtype=input_dtype)  # restore the original dtype
x = self.weight[:, None, None] * x + self.bias[:, None, None]  # scale and shift
return x
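The channels_first branch should agree with `nn.LayerNorm` applied over a permuted tensor; a small equivalence sketch (both modules start with unit weight and zero bias):

```python
import torch
from torch import nn

x = torch.randn(2, 64, 8, 8)  # (batch, channels, height, width)
seggpt_ln = SegGptLayerNorm(64, eps=1e-6, data_format="channels_first")
reference = nn.LayerNorm(64, eps=1e-6)(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
print(torch.allclose(seggpt_ln(x), reference, atol=1e-5))  # True, up to numerical precision
```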
# 定义一个名为 SegGptDecoderHead 的类,继承自 nn.Module
class SegGptDecoderHead(nn.Module):
# 初始化方法,接收一个 config 参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 定义一个 2D 卷积层,输入和输出通道数都是 config.decoder_hidden_size,卷积核大小为 3x3,填充为 1
self.conv = nn.Conv2d(
config.decoder_hidden_size,
config.decoder_hidden_size,
kernel_size=3,
padding=1,
)
# 初始化一个 SegGptLayerNorm 实例,对输入进行归一化,通道顺序为 "channels_first"
self.layernorm = SegGptLayerNorm(
normalized_shape=config.decoder_hidden_size, eps=config.layer_norm_eps, data_format="channels_first"
)
# 根据配置选择激活函数,ACT2FN 是一个预定义的激活函数字典
self.act_fct = ACT2FN[config.hidden_act]
# 定义一个 1x1 的 2D 卷积层,将隐藏状态映射到 3 个通道,带有偏置
self.head = nn.Conv2d(config.decoder_hidden_size, 3, kernel_size=1, bias=True) # decoder to patch
# 前向传播方法,接收输入 hidden_states
def forward(self, hidden_states: torch.FloatTensor):
# 对隐藏状态进行卷积操作
hidden_states = self.conv(hidden_states)
# 对卷积后的结果进行归一化
hidden_states = self.layernorm(hidden_states)
# 应用预定义的激活函数
hidden_states = self.act_fct(hidden_states)
# 将激活后的结果再次经过一个 1x1 卷积层
hidden_states = self.head(hidden_states)
return hidden_states
# 定义一个名为 SegGptDecoder 的类,继承自 nn.Module
class SegGptDecoder(nn.Module):
# 初始化方法,接收一个 config 参数
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 定义一个线性层,用于将输入维度转换为 config.patch_size^2 * config.decoder_hidden_size 的输出维度
self.decoder_embed = nn.Linear(
config.hidden_size * len(config.intermediate_hidden_state_indices),
config.patch_size**2 * config.decoder_hidden_size,
bias=True,
)
# 初始化一个 SegGptDecoderHead 的实例,作为解码器的预测头部
self.decoder_pred = SegGptDecoderHead(config)
# 记录 patch 的大小
self.patch_size = config.patch_size
# 记录解码器隐藏层的大小
self.decoder_hidden_size = config.decoder_hidden_size
# 记录配置对象
self.config = config
# 定义一个辅助方法,用于重塑隐藏状态的形状
def _reshape_hidden_states(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
# 获取输入的张量形状信息
batch_size, patch_height, patch_width, _ = hidden_states.shape
# 将输入的张量重塑为新的形状
hidden_states = hidden_states.reshape(
batch_size, patch_height, patch_width, self.patch_size, self.patch_size, self.decoder_hidden_size
)
# 对重塑后的张量进行维度排列变换
hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
# 再次重塑为指定形状
hidden_states = hidden_states.reshape(
shape=(batch_size, -1, patch_height * self.patch_size, patch_width * self.patch_size)
)
return hidden_states
# 前向传播方法,接收输入 hidden_states
def forward(self, hidden_states: torch.FloatTensor):
# 将输入的隐藏状态先经过线性层进行维度转换
hidden_states = self.decoder_embed(hidden_states)
# 调用辅助方法重塑隐藏状态的形状
hidden_states = self._reshape_hidden_states(hidden_states)
# 将重塑后的隐藏状态传入解码器的预测头部进行处理
hidden_states = self.decoder_pred(hidden_states)
return hidden_states
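A shape walk-through with the default config (hidden_size=1024, four intermediate feature maps, patch_size=16, decoder_hidden_size=64), sketched on random inputs:

```python
import torch

config = SegGptConfig()
decoder = SegGptDecoder(config)

# Concatenated intermediate hidden states: (batch, patch_height, patch_width, hidden_size * num_features)
features = torch.randn(1, 56, 28, 1024 * 4)
pred = decoder(features)
print(pred.shape)  # torch.Size([1, 3, 896, 448]) -> a full-resolution, 3-channel prediction
```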
# 定义一个名为 SegGptPreTrainedModel 的类,继承自 PreTrainedModel
class SegGptPreTrainedModel(PreTrainedModel):
"""
一个抽象类,处理权重初始化、预训练模型下载和加载的简单接口。
"""
# 类属性:配置类为 SegGptConfig
config_class = SegGptConfig
# 模型基础名称前缀为 "model"
base_model_prefix = "model"
# 主要输入名称为 "pixel_values"
main_input_name = "pixel_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
# 不拆分的模块列表
_no_split_modules = ["SegGptEmbeddings", "SegGptLayer"]
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
# 从配置中获取初始化的标准差
std = self.config.initializer_range
# 如果模块是线性层或卷积层
if isinstance(module, (nn.Linear, nn.Conv2d)):
# 使用截断正态分布初始化权重,先将权重转换为 float32 类型以避免在 half 精度下出现 `trunc_normal_cpu` 未实现的问题,然后再转回原始的 dtype
module.weight.data = nn.init.trunc_normal_(module.weight.data.to(torch.float32), mean=0.0, std=std).to(
module.weight.dtype
)
# 如果存在偏置,则初始化为零
if module.bias is not None:
module.bias.data.zero_()
# 如果模块是 LayerNorm 层
elif isinstance(module, nn.LayerNorm):
# 初始化偏置为零
module.bias.data.zero_()
# 初始化权重为全 1
module.weight.data.fill_(1.0)
# 如果模块是 SegGptAttention 类型
elif isinstance(module, SegGptAttention):
# 使用截断正态分布初始化相对位置编码的水平方向数据
module.rel_pos_h.data = nn.init.trunc_normal_(
module.rel_pos_h.data.to(torch.float32),
mean=0.0,
std=std,
).to(module.rel_pos_h.dtype)
# 使用截断正态分布初始化相对位置编码的垂直方向数据
module.rel_pos_w.data = nn.init.trunc_normal_(
module.rel_pos_w.data.to(torch.float32),
mean=0.0,
std=std,
).to(module.rel_pos_w.dtype)
# 如果模块是 SegGptEmbeddings 类型
elif isinstance(module, SegGptEmbeddings):
# 使用截断正态分布初始化位置嵌入数据
module.position_embeddings.data = nn.init.trunc_normal_(
module.position_embeddings.data.to(torch.float32),
mean=0.0,
std=std,
).to(module.position_embeddings.dtype)
# 初始化其他特殊令牌的数据,使用正态分布初始化
torch.nn.init.normal_(module.mask_token, std=std)
torch.nn.init.normal_(module.segment_token_input, std=std)
torch.nn.init.normal_(module.segment_token_prompt, std=std)
torch.nn.init.normal_(module.type_token_semantic, std=std)
torch.nn.init.normal_(module.type_token_instance, std=std)
"""
This model is a PyTorch `torch.nn.Module` subclass designed for SegGpt model architecture. Use it
like any regular PyTorch Module and refer to the PyTorch documentation for general usage and behavior.
Parameters:
config (`SegGptConfig`): Model configuration class containing all model parameters.
Initializing with a config file loads the configuration settings only, not the model weights.
Use `PreTrainedModel.from_pretrained` to load weights associated with the model.
"""
"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Input pixel values. These are obtained using `AutoImageProcessor`. See `SegGptImageProcessor.__call__`
for detailed information.
prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Prompt-specific pixel values. These are obtained using `AutoImageProcessor`. See `SegGptImageProcessor.__call__`
for detailed information.
prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Mask applied to prompts. This is obtained using `AutoImageProcessor`. See `SegGptImageProcessor.__call__`
for detailed information.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean tensor indicating masked positions (1 for masked, 0 for not masked).
feature_ensemble (`bool`, *optional*):
Indicates whether to use feature ensemble. If `True`, the model uses feature ensemble when multiple prompts
are present. If `False`, it does not. Relevant for few-shot inference on an input image with more than one prompt.
embedding_type (`str`, *optional*):
Type of embedding used for prompts. Can be 'instance' or 'semantic'.
output_attentions (`bool`, *optional*):
Whether to return the attentions tensors of all attention layers. See `attentions` in returned tensors
for more details.
output_hidden_states (`bool`, *optional*):
Whether to return the hidden states of all layers. See `hidden_states` in returned tensors for more details.
return_dict (`bool`, *optional*):
Whether to return a `utils.ModelOutput` instead of a plain tuple.
"""
@add_start_docstrings(
"The bare SegGpt Model transformer outputting raw hidden-states without any specific head on top.",
SEGGPT_START_DOCSTRING,
)
class SegGptModel(SegGptPreTrainedModel):
def __init__(self, config: SegGptConfig):
super().__init__(config)
self.config = config
self.embeddings = SegGptEmbeddings(config)
self.encoder = SegGptEncoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> SegGptPatchEmbeddings:
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
"""
Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
# Access each layer of the encoder and prune specified heads in the attention mechanism
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SEGGPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SegGptEncoderOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,
prompt_pixel_values: torch.Tensor,
prompt_masks: torch.Tensor,
bool_masked_pos: Optional[torch.BoolTensor] = None,
feature_ensemble: Optional[bool] = None,
embedding_type: Optional[str] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Forward pass of the SegGptModel.
pixel_values: torch.Tensor of shape (batch_size, num_channels, height, width)
Tensor containing the input image pixel values.
prompt_pixel_values: torch.Tensor of shape (batch_size, num_channels, height, width)
Tensor containing the prompt image pixel values.
prompt_masks: torch.Tensor of shape (batch_size, num_channels, height, width)
Segmentation mask associated with the prompt image.
bool_masked_pos: Optional[torch.BoolTensor], optional
Boolean mask for masked positions, by default None.
feature_ensemble: Optional[bool], optional
Whether to use feature ensemble, by default None.
embedding_type: Optional[str], optional
Type of embedding used, by default None.
output_attentions: Optional[bool], optional
Whether to output attentions, by default None.
output_hidden_states: Optional[bool], optional
Whether to output hidden states, by default None.
return_dict: Optional[bool], optional
Whether to return a dictionary, by default None.
Returns:
`SegGptEncoderOutput` or `tuple(torch.Tensor)`:
A `SegGptEncoderOutput` (if `return_dict=True`) or a plain tuple of `torch.Tensor`s containing the
encoder's last hidden state and, when requested, the hidden states and attentions.
"""
# Forward pass logic would be implemented here, detailing how inputs are processed
# through the layers of the model to produce the desired outputs.
# Helper that splits an input image tensor into fixed-size patches and flattens each patch
def patchify(tensor: torch.Tensor, patch_size: int) -> torch.Tensor:
# Unpack the batch size, number of channels, height and width of the tensor
batch_size, num_channels, height, width = tensor.shape
# Number of patches along the height and width axes
patch_height = height // patch_size
patch_width = width // patch_size
# Reshape to (batch_size, num_channels, patch_height, patch_size, patch_width, patch_size)
tensor = tensor.reshape(shape=(batch_size, num_channels, patch_height, patch_size, patch_width, patch_size))
# Permute to (batch_size, patch_height, patch_width, patch_size, patch_size, num_channels)
tensor = tensor.permute(0, 2, 4, 3, 5, 1)
# Reshape again to (batch_size, patch_height * patch_width, patch_size**2 * 3)
tensor = tensor.reshape(shape=(batch_size, patch_height * patch_width, patch_size**2 * 3))
return tensor
# Helper that folds flattened patches back into an image of the original height and width
def unpatchify(tensor: torch.Tensor, patch_height: int, patch_width: int) -> torch.Tensor:
# Batch size
batch_size = tensor.shape[0]
# Infer the patch size from the flattened patch dimension (patch_size**2 * 3 values per patch)
patch_size = int((tensor.shape[-1] / 3) ** 0.5)
# Check that the number of patches matches the given patch_height and patch_width
if patch_height * patch_width != tensor.shape[1]:
raise ValueError(f"Number of patches {tensor.shape[1]} does not match patch height and width.")
# Reshape to (batch_size, patch_height, patch_width, patch_size, patch_size, 3)
tensor = tensor.reshape(shape=(batch_size, patch_height, patch_width, patch_size, patch_size, 3))
# Permute to (batch_size, 3, patch_height, patch_size, patch_width, patch_size)
tensor = tensor.permute(0, 5, 1, 3, 2, 4)
# Collapse back to (batch_size, 3, patch_height * patch_size, patch_width * patch_size)
tensor = tensor.reshape(shape=(batch_size, 3, patch_height * patch_size, patch_width * patch_size))
return tensor
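To make the patch layout concrete, here is a minimal round-trip sketch (the shapes are made up for illustration; it only assumes a 3-channel input whose height and width are divisible by `patch_size`):
```
import torch

patch_size = 16
images = torch.randn(2, 3, 64, 48)                      # (batch, channels, height, width)

patches = patchify(images, patch_size)                  # (2, 4 * 3, 16 * 16 * 3) == (2, 12, 768)
restored = unpatchify(patches, 64 // patch_size, 48 // patch_size)

print(patches.shape)                                    # torch.Size([2, 12, 768])
print(torch.allclose(images, restored))                 # True -- the mapping is lossless
```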
# Loss module used for the SegGpt segmentation objective
class SegGptLoss(nn.Module):
def __init__(self, config):
super().__init__()
# Smooth-L1 beta and the patch size used to expand the patch mask to pixels
self.beta = config.beta
self.patch_size = config.patch_size
# Forward pass that computes the loss
def forward(
self,
pixel_values: torch.FloatTensor,
prompt_pixel_values: torch.FloatTensor,
pred_masks: torch.FloatTensor,
labels: torch.FloatTensor,
bool_masked_pos: torch.BoolTensor,
):
"""
计算预测掩码与实际掩码之间的L1损失。
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
合并的像素值,来自提示图像和输入图像。
prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
来自掩码提示的合并像素值。
pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
预测的掩码。
labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
输入图像的实际掩码。
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
布尔掩码位置。指示哪些补丁被掩盖(1),哪些没有(0)。
Returns:
`torch.FloatTensor`: 预测掩码与实际掩码之间的平均L1损失。
"""
# Build a per-pixel mask from the boolean patch positions
mask = bool_masked_pos[:, :, None].repeat(1, 1, self.patch_size**2 * 3)
# Map the patch mask back to the original spatial resolution
mask = unpatchify(mask, pixel_values.shape[1] // self.patch_size, pixel_values.shape[2] // self.patch_size)
# Replace the dummy masks in the mask prompt with the actual label values
prompt_pixel_values = prompt_pixel_values.clone()
prompt_pixel_values[:, :, prompt_pixel_values.shape[2] // 2 :, :] = labels
# Smooth-L1 loss without reduction, then averaged only over the masked pixels
loss = F.smooth_l1_loss(pred_masks, prompt_pixel_values, reduction="none", beta=self.beta)
loss = (loss * mask).sum() / mask.sum()  # mean loss on the removed (masked) patches
return loss
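The masked averaging step can be reproduced in isolation. A minimal sketch with hypothetical shapes, reusing `unpatchify` from above, where only the lower-half patches (the input-image part) contribute to the loss:
```
import torch
import torch.nn.functional as F

patch_size, beta = 16, 0.01
pred = torch.randn(1, 3, 128, 64)                       # predicted masks (prompt half on top, input half below)
target = torch.randn(1, 3, 128, 64)                     # prompt pixel values with labels pasted into the lower half
num_patches = (128 // patch_size) * (64 // patch_size)

bool_masked_pos = torch.zeros(1, num_patches, dtype=torch.bool)
bool_masked_pos[:, num_patches // 2 :] = True           # mask only the lower-half patches

mask = bool_masked_pos[:, :, None].float().repeat(1, 1, patch_size**2 * 3)
mask = unpatchify(mask, 128 // patch_size, 64 // patch_size)  # back to (1, 3, 128, 64)

loss = F.smooth_l1_loss(pred, target, reduction="none", beta=beta)
loss = (loss * mask).sum() / mask.sum()                 # average only over the masked pixels
print(loss.item())
```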
# Class docstring describing SegGptForImageSegmentation and its purpose
@add_start_docstrings(
"SegGpt model with a decoder on top for one-shot image segmentation.",
SEGGPT_START_DOCSTRING,
)
class SegGptForImageSegmentation(SegGptPreTrainedModel):
def __init__(self, config: SegGptConfig):
super().__init__(config)
# Keep the configuration around
self.config = config
# Backbone encoder and the segmentation decoder head
self.model = SegGptModel(config)
self.decoder = SegGptDecoder(config)
# Initialize weights and apply final processing
self.post_init()
# Forward pass of the segmentation model
@add_start_docstrings_to_model_forward(SEGGPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SegGptImageSegmentationOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,  # input image pixel values
prompt_pixel_values: torch.Tensor,  # prompt image pixel values
prompt_masks: torch.Tensor,  # prompt segmentation masks
bool_masked_pos: Optional[torch.BoolTensor] = None,  # optional boolean mask over patch positions
feature_ensemble: Optional[bool] = None,  # optional feature-ensemble flag
embedding_type: Optional[str] = None,  # optional embedding type ('instance' or 'semantic')
labels: Optional[torch.FloatTensor] = None,  # optional ground-truth masks
output_attentions: Optional[bool] = None,  # whether to return attentions
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
.\models\seggpt\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule
from ...utils import is_torch_available, is_vision_available
_import_structure = {
"configuration_seggpt": ["SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegGptConfig", "SegGptOnnxConfig"]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_seggpt"] = [
"SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"SegGptModel",
"SegGptPreTrainedModel",
"SegGptForImageSegmentation",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_seggpt"] = ["SegGptImageProcessor"]
if TYPE_CHECKING:
from .configuration_seggpt import SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, SegGptConfig, SegGptOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_seggpt import (
SEGGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
SegGptForImageSegmentation,
SegGptModel,
SegGptPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_seggpt import SegGptImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\sew\configuration_sew.py
logger = logging.get_logger(__name__)
SEW_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"asapp/sew-tiny-100k": "https://huggingface.co/asapp/sew-tiny-100k/resolve/main/config.json",
}
class SEWConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SEWModel`]. It is used to instantiate a SEW
model according to the specified arguments, defining the model architecture. Instantiating a configuration with
the default arguments will yield a configuration similar to that of the asapp/sew-tiny-100k architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import SEWConfig, SEWModel
>>> # Initializing a SEW asapp/sew-tiny-100k style configuration
>>> configuration = SEWConfig()
>>> # Initializing a model (with random weights) from that configuration
>>> model = SEWModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "sew"
def __init__(
self,
vocab_size=32,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
squeeze_factor=2,
hidden_act="gelu",
hidden_dropout=0.1,
activation_dropout=0.1,
attention_dropout=0.1,
feat_proj_dropout=0.0,
final_dropout=0.1,
layerdrop=0.1,
initializer_range=0.02,
layer_norm_eps=1e-5,
feat_extract_norm="group",
feat_extract_activation="gelu",
conv_dim=(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512),
conv_stride=(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1),
conv_kernel=(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1),
conv_bias=False,
num_conv_pos_embeddings=128,
num_conv_pos_embedding_groups=16,
apply_spec_augment=True,
mask_time_prob=0.05,
mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0,
mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean",
ctc_zero_infinity=False,
use_weighted_layer_sum=False,
classifier_proj_size=256,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.hidden_size = hidden_size
self.feat_extract_norm = feat_extract_norm
self.feat_extract_activation = feat_extract_activation
self.conv_dim = list(conv_dim)
self.conv_stride = list(conv_stride)
self.conv_kernel = list(conv_kernel)
self.conv_bias = conv_bias
self.num_conv_pos_embeddings = num_conv_pos_embeddings
self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
self.num_feat_extract_layers = len(self.conv_dim)
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.squeeze_factor = squeeze_factor
self.hidden_act = hidden_act
self.num_attention_heads = num_attention_heads
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.feat_proj_dropout = feat_proj_dropout
self.final_dropout = final_dropout
self.layerdrop = layerdrop
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.vocab_size = vocab_size
if (
(len(self.conv_stride) != self.num_feat_extract_layers)
or (len(self.conv_kernel) != self.num_feat_extract_layers)
or (len(self.conv_dim) != self.num_feat_extract_layers)
):
raise ValueError(
"Configuration for convolutional layers is incorrect. "
"It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, "
f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) "
f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`."
)
self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
self.use_weighted_layer_sum = use_weighted_layer_sum
self.classifier_proj_size = classifier_proj_size
@property
def inputs_to_logits_ratio(self):
return functools.reduce(operator.mul, self.conv_stride, 1)
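The `inputs_to_logits_ratio` property is simply the product of the convolutional strides, i.e. how many raw audio samples are consumed per extracted feature frame. With the default `conv_stride` shown in `__init__` above:
```
import functools
import operator

conv_stride = (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)   # SEWConfig defaults
print(functools.reduce(operator.mul, conv_stride, 1))    # 320 samples per feature frame
```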
.\models\sew\convert_sew_original_pytorch_checkpoint_to_pytorch.py
"""Convert SEW checkpoint."""
import argparse
import json
import os
import fairseq
import torch
from fairseq.data import Dictionary
from sew_asapp import tasks
from transformers import (
SEWConfig,
SEWForCTC,
SEWModel,
Wav2Vec2CTCTokenizer,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
logging,
)
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
MAPPING = {
"post_extract_proj": "feature_projection",
"encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
"self_attn.k_proj": "encoder.layers.*.attention.k_proj",
"self_attn.v_proj": "encoder.layers.*.attention.v_proj",
"self_attn.q_proj": "encoder.layers.*.attention.q_proj",
"self_attn.out_proj": "encoder.layers.*.attention.out_proj",
"self_attn_layer_norm": "encoder.layers.*.layer_norm",
"fc1": "encoder.layers.*.feed_forward.intermediate_dense",
"fc2": "encoder.layers.*.feed_forward.output_dense",
"final_layer_norm": "encoder.layers.*.final_layer_norm",
"encoder.upsample.0": "encoder.upsample.projection",
"encoder.layer_norm": "encoder.layer_norm",
"w2v_model.layer_norm": "layer_norm",
"w2v_encoder.proj": "lm_head",
"mask_emb": "masked_spec_embed",
}
def set_recursively(hf_pointer, key, value, full_name, weight_type):
for attribute in key.split("."):
hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None:
hf_shape = getattr(hf_pointer, weight_type).shape
else:
hf_shape = hf_pointer.shape
assert hf_shape == value.shape, (
f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
f" {value.shape} for {full_name}"
)
if weight_type == "weight":
hf_pointer.weight.data = value
elif weight_type == "weight_g":
hf_pointer.weight_g.data = value
elif weight_type == "weight_v":
hf_pointer.weight_v.data = value
elif weight_type == "bias":
hf_pointer.bias.data = value
else:
hf_pointer.data = value
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
unused_weights = []
fairseq_dict = fairseq_model.state_dict()
feature_extractor = hf_model.sew.feature_extractor if is_finetuned else hf_model.feature_extractor
for name, value in fairseq_dict.items():
is_used = False
if "conv_layers" in name:
load_conv_layer(
name,
value,
feature_extractor,
unused_weights,
hf_model.config.feat_extract_norm == "group",
)
is_used = True
else:
for key, mapped_key in MAPPING.items():
mapped_key = "sew." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key
if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
is_used = True
if "*" in mapped_key:
layer_index = name.split(key)[0].split(".")[-2]
mapped_key = mapped_key.replace("*", layer_index)
if "weight_g" in name:
weight_type = "weight_g"
elif "weight_v" in name:
weight_type = "weight_v"
elif "weight" in name:
weight_type = "weight"
elif "bias" in name:
weight_type = "bias"
else:
weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type)
continue
if not is_used:
unused_weights.append(name)
logger.warning(f"Unused weights: {unused_weights}")
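To see how the `*` placeholder in `MAPPING` is resolved, here is a small sketch with a hypothetical fairseq parameter name (not taken from a real checkpoint):
```
name = "encoder.layers.3.self_attn.k_proj.weight"
key, mapped_key = "self_attn.k_proj", "encoder.layers.*.attention.k_proj"

layer_index = name.split(key)[0].split(".")[-2]   # "encoder.layers.3." -> "3"
print(mapped_key.replace("*", layer_index))       # encoder.layers.3.attention.k_proj
```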
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
name = full_name.split("conv_layers.")[-1]
items = name.split(".")
layer_id = int(items[0])
type_id = int(items[1])
if type_id == 0:
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.bias.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].conv.weight.data = value
logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
if "bias" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
" found."
)
feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
elif "weight" in name:
assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
f"{full_name} has size {value.shape}, but"
f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
)
feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
else:
unused_weights.append(full_name)
def convert_config(model, is_finetuned):
config = SEWConfig()
if is_finetuned:
fs_config = model.w2v_encoder.w2v_model.cfg
else:
fs_config = model.cfg
config.conv_bias = fs_config.conv_bias
conv_layers = eval(fs_config.conv_feature_layers)
config.conv_dim = [x[0] for x in conv_layers]
config.conv_kernel = [x[1] for x in conv_layers]
config.conv_stride = [x[2] for x in conv_layers]
config.feat_extract_activation = "gelu"
config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
config.final_dropout = 0.0
config.hidden_act = fs_config.activation_fn.name
config.hidden_size = fs_config.encoder_embed_dim
config.initializer_range = 0.02
config.intermediate_size = fs_config.encoder_ffn_embed_dim
config.layer_norm_eps = 1e-5
config.layerdrop = fs_config.encoder_layerdrop
config.num_attention_heads = fs_config.encoder_attention_heads
config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
config.num_conv_pos_embeddings = fs_config.conv_pos
config.num_feat_extract_layers = len(conv_layers)
config.num_hidden_layers = fs_config.encoder_layers
config.squeeze_factor = fs_config.squeeze_factor
if is_finetuned:
fs_config = model.cfg
config.final_dropout = fs_config.final_dropout
config.layerdrop = fs_config.layerdrop
config.activation_dropout = fs_config.activation_dropout
config.apply_spec_augment = fs_config.mask_prob > 0 or fs_config.mask_channel_prob > 0
config.attention_dropout = fs_config.attention_dropout
config.feat_proj_dropout = fs_config.dropout_input
config.hidden_dropout = fs_config.dropout
config.mask_feature_length = fs_config.mask_channel_length
config.mask_feature_prob = fs_config.mask_channel_prob
config.mask_time_length = fs_config.mask_length
config.mask_time_prob = fs_config.mask_prob
config.feature_extractor_type = "Wav2Vec2FeatureExtractor"
config.tokenizer_class = "Wav2Vec2CTCTokenizer"
return config
@torch.no_grad()
def convert_sew_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if is_finetuned:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
)
else:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
if config_path is not None:
config = SEWConfig.from_pretrained(config_path)
else:
config = convert_config(model[0], is_finetuned)
model = model[0].eval()
return_attention_mask = True if config.feat_extract_norm == "layer" else False
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=True,
return_attention_mask=return_attention_mask,
)
if is_finetuned:
if dict_path:
target_dict = Dictionary.load(dict_path)
target_dict.indices[target_dict.bos_word] = target_dict.pad_index
target_dict.indices[target_dict.pad_word] = target_dict.bos_index
config.bos_token_id = target_dict.pad_index
config.pad_token_id = target_dict.bos_index
config.eos_token_id = target_dict.eos_index
config.vocab_size = len(target_dict.symbols)
vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
if not os.path.isdir(pytorch_dump_folder_path):
logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
return
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
json.dump(target_dict.indices, vocab_handle)
tokenizer = Wav2Vec2CTCTokenizer(
vocab_path,
unk_token=target_dict.unk_word,
pad_token=target_dict.pad_word,
bos_token=target_dict.bos_word,
eos_token=target_dict.eos_word,
word_delimiter_token="|",
do_lower_case=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(pytorch_dump_folder_path)
hf_model = SEWForCTC(config)
else:
hf_model = SEWModel(config)
feature_extractor.save_pretrained(pytorch_dump_folder_path)
recursively_load_weights(model, hf_model, is_finetuned)
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--is_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args()
convert_sew_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, args.is_finetuned
)
.\models\sew\modeling_sew.py
""" PyTorch SEW 模型。"""
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_sew import SEWConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "SEWConfig"
_CHECKPOINT_FOR_DOC = "asapp/sew-tiny-100k-ft-ls100h"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 512]
_CTC_EXPECTED_OUTPUT = (
"'MISTER QUILTER IS THE APPOSTILE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPOLLE'"
)
_CTC_EXPECTED_LOSS = 0.42
_SEQ_CLASS_CHECKPOINT = "anton-l/sew-mid-100k-ft-keyword-spotting"
_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'"
_SEQ_CLASS_EXPECTED_LOSS = 9.52
SEW_PRETRAINED_MODEL_ARCHIVE_LIST = [
"asapp/sew-tiny-100k",
"asapp/sew-small-100k",
"asapp/sew-mid-100k",
]
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.LongTensor] = None,
min_masks: int = 0,
) -> np.ndarray:
"""
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method
for ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be
run on CPU as part of the preprocessing during training.
Args:
shape: The shape for which to compute masks. This should be of a tuple of size 2 where
the first element is the batch size and the second element is the length of the axis to span.
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
independently generated mask spans of length `mask_length` is computed by
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask
min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
"""
# 解包形状参数
batch_size, sequence_length = shape
# 检查 mask_length 是否合法
if mask_length < 1:
raise ValueError("`mask_length` has to be bigger than 0.")
# 检查 mask_length 是否小于 sequence_length
if mask_length > sequence_length:
raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
)
# epsilon 用于概率舍入
epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length):
"""Given input length, compute how many spans should be masked"""
# 计算应该屏蔽的 span 的数量
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
# 确保不低于最小屏蔽数
num_masked_span = max(num_masked_span, min_masks)
# 确保 num_masked_span 不超过 sequence_length
if num_masked_span * mask_length > sequence_length:
num_masked_span = sequence_length // mask_length
# 确保 num_masked span 不超过 input_length - (mask_length - 1)
if input_length - (mask_length - 1) < num_masked_span:
num_masked_span = max(input_length - (mask_length - 1), 0)
return num_masked_span
# 计算每个 batch 中的屏蔽 span 的数量
input_lengths = (
attention_mask.sum(-1).detach().tolist()
if attention_mask is not None
else [sequence_length for _ in range(batch_size)]
)
# 创建用于 SpecAugment 的屏蔽 mask
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
spec_aug_mask_idxs = []
# 计算最大可能的屏蔽 span 数量
max_num_masked_span = compute_num_masked_span(sequence_length)
# 如果最大屏蔽 span 数量为 0,则直接返回空的 spec_aug_mask
if max_num_masked_span == 0:
return spec_aug_mask
# 遍历输入长度列表中的每个长度
for input_length in input_lengths:
# 计算当前输入长度下的需要屏蔽的片段数量
num_masked_span = compute_num_masked_span(input_length)
# 随机选择要屏蔽的索引
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
# 选择第一个样本索引作为填充向量的虚拟索引,确保所有批次具有相同的维度
# 这是由于概率舍入导致的维度问题的解决方案
if len(spec_aug_mask_idx) == 0:
# 只有在 `input_length` 严格小于 `sequence_length` 时才会发生这种情况,
# 此时最后一个标记必须是填充标记,可以用作虚拟屏蔽标识符
dummy_mask_idx = sequence_length - 1
else:
dummy_mask_idx = spec_aug_mask_idx[0]
# 将虚拟屏蔽索引扩展到匹配的数量,并添加到屏蔽索引列表中
spec_aug_mask_idx = np.concatenate(
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
)
spec_aug_mask_idxs.append(spec_aug_mask_idx)
# 将屏蔽索引列表转换为 numpy 数组
spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
# 将屏蔽索引扩展为屏蔽段
spec_aug_mask_idxs = np.broadcast_to(
spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
)
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# 添加偏移量到起始索引,以确保索引现在创建一个完整的屏蔽段
offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length
)
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
# 确保屏蔽索引不会超过序列长度
if spec_aug_mask_idxs.max() > sequence_length - 1:
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
# 使用屏蔽索引在 spec_aug_mask 上进行填充操作
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
# 返回填充后的 spec_aug_mask
return spec_aug_mask
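A quick illustration of the output (the arguments are made up; the exact spans depend on the random state):
```
import numpy as np

np.random.seed(0)
mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10, min_masks=2)

print(mask.shape)          # (2, 100), dtype=bool
print(mask.sum(axis=-1))   # masked steps per example: at most 2 * 10 here, fewer if the spans overlap
```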
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->SEW
class SEWNoLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 设置输入卷积维度为上一层的卷积维度或者默认为1(如果是第一层)
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为当前层的卷积维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个卷积层,指定输入和输出维度,卷积核大小,步长和是否有偏置
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 设置激活函数为预定义的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
# 执行卷积操作
hidden_states = self.conv(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->SEW
class SEWLayerNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 设置输入卷积维度为上一层的卷积维度或者默认为1(如果是第一层)
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为当前层的卷积维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个卷积层,指定输入和输出维度,卷积核大小,步长和是否有偏置
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 创建一个LayerNorm层,对输出进行标准化,并可选地进行仿射变换
self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
# 设置激活函数为预定义的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
def forward(self, hidden_states):
# 执行卷积操作
hidden_states = self.conv(hidden_states)
# 将卷积输出的维度转置以便进行LayerNorm操作
hidden_states = hidden_states.transpose(-2, -1)
# 应用LayerNorm进行标准化
hidden_states = self.layer_norm(hidden_states)
# 再次将维度转置回来
hidden_states = hidden_states.transpose(-2, -1)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->SEW
class SEWGroupNormConvLayer(nn.Module):
def __init__(self, config, layer_id=0):
super().__init__()
# 设置输入卷积维度为上一层的卷积维度或者默认为1(如果是第一层)
self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
# 设置输出卷积维度为当前层的卷积维度
self.out_conv_dim = config.conv_dim[layer_id]
# 创建一个卷积层,指定输入和输出维度,卷积核大小,步长和是否有偏置
self.conv = nn.Conv1d(
self.in_conv_dim,
self.out_conv_dim,
kernel_size=config.conv_kernel[layer_id],
stride=config.conv_stride[layer_id],
bias=config.conv_bias,
)
# 设置激活函数为预定义的激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 创建一个GroupNorm层,指定组数和通道数,对输出进行标准化
self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
def forward(self, hidden_states):
# 执行卷积操作
hidden_states = self.conv(hidden_states)
# 应用GroupNorm进行标准化
hidden_states = self.layer_norm(hidden_states)
# 应用激活函数
hidden_states = self.activation(hidden_states)
return hidden_states
class SEWPositionalConvEmbedding(nn.Module):
# 在此处继续实现其他功能
pass
# 初始化函数,用于初始化类的实例
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 创建一个一维卷积层对象
self.conv = nn.Conv1d(
config.hidden_size, # 输入通道数(隐藏大小)
config.hidden_size, # 输出通道数(隐藏大小,保持不变)
kernel_size=config.num_conv_pos_embeddings, # 卷积核大小
padding=config.num_conv_pos_embeddings // 2, # 填充大小
groups=config.num_conv_pos_embedding_groups, # 分组卷积的组数
stride=config.squeeze_factor, # 步长
)
# 如果启用了Deepspeed的zero3模式
if is_deepspeed_zero3_enabled():
import deepspeed
# 使用Deepspeed的分布式参数收集功能
with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
# 对卷积层进行权重归一化,并命名为"weight",dim=2表示在输出通道维度上进行归一化
self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
# 注册卷积层权重的外部参数
deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
else:
# 对卷积层进行权重归一化,并命名为"weight",dim=2表示在输出通道维度上进行归一化
self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
# 创建一个与卷积层同样大小的填充层对象
self.padding = SEWSamePadLayer(config.num_conv_pos_embeddings)
# 根据配置选择激活函数
self.activation = ACT2FN[config.feat_extract_activation]
# 前向传播函数,定义了数据如何通过网络层流动
def forward(self, hidden_states):
# 经过一维卷积层
hidden_states = self.conv(hidden_states)
# 经过填充层
hidden_states = self.padding(hidden_states)
# 经过激活函数
hidden_states = self.activation(hidden_states)
# 返回处理后的隐藏状态
return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer 复制代码,并将 Wav2Vec2 更改为 SEW
class SEWSamePadLayer(nn.Module):
def __init__(self, num_conv_pos_embeddings):
super().__init__()
# 根据卷积位置嵌入的数量确定是否需要移除一个填充元素
self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
def forward(self, hidden_states):
if self.num_pad_remove > 0:
# 如果需要移除填充元素,则截取掉隐藏状态的末尾
hidden_states = hidden_states[:, :, :-self.num_pad_remove]
return hidden_states
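The trimming is needed because a stride-1 1-D convolution over length `L` with kernel size `k` and padding `k // 2` produces `L + 1` frames when `k` is even (and `L` frames when `k` is odd). A small sketch with made-up channel counts:
```
import torch
from torch import nn

k = 128                                              # num_conv_pos_embeddings in the default config
conv = nn.Conv1d(4, 4, kernel_size=k, padding=k // 2)
x = torch.randn(1, 4, 50)

print(conv(x).shape)                                 # torch.Size([1, 4, 51]) -> one frame too long
print(SEWSamePadLayer(k)(conv(x)).shape)             # torch.Size([1, 4, 50]) -> matches the input again
```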
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder 复制代码,并将 Wav2Vec2 更改为 SEW
class SEWFeatureEncoder(nn.Module):
"""从原始音频波形中构造特征"""
def __init__(self, config):
super().__init__()
# 根据配置选择不同的特征提取归一化方式
if config.feat_extract_norm == "group":
# 如果是组归一化,则使用 SEWGroupNormConvLayer 作为第一层,其余层为 SEWNoLayerNormConvLayer
conv_layers = [SEWGroupNormConvLayer(config, layer_id=0)] + [
SEWNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
]
elif config.feat_extract_norm == "layer":
# 如果是层归一化,则所有层均使用 SEWLayerNormConvLayer
conv_layers = [SEWLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
else:
# 如果归一化方式不是 'group' 或 'layer',则抛出异常
raise ValueError(
f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
)
# 将所有的卷积层组成一个模块列表
self.conv_layers = nn.ModuleList(conv_layers)
self.gradient_checkpointing = False
self._requires_grad = True
def _freeze_parameters(self):
# 冻结模型的所有参数,使其不可训练
for param in self.parameters():
param.requires_grad = False
self._requires_grad = False
def forward(self, input_values):
# 将输入的张量扩展维度,增加一个维度,用于卷积操作
hidden_states = input_values[:, None]
# 如果需要计算梯度并且处于训练模式,则将 hidden_states 设置为需要梯度计算
if self._requires_grad and self.training:
hidden_states.requires_grad = True
# 遍历所有的卷积层进行前向传播
for conv_layer in self.conv_layers:
# 如果需要计算梯度、启用了梯度检查点功能并且处于训练模式,则使用梯度检查点函数进行前向传播
if self._requires_grad and self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
conv_layer.__call__,
hidden_states,
)
else:
# 否则直接调用卷积层进行前向传播计算
hidden_states = conv_layer(hidden_states)
# 返回最终的隐藏状态张量
return hidden_states
class SEWFeatureExtractor(SEWFeatureEncoder):
# SEWFeatureExtractor 类继承自 SEWFeatureEncoder 类
def __init__(self, config):
# 初始化函数,接受一个配置参数 config
super().__init__(config)
# 调用父类 SEWFeatureEncoder 的初始化方法
# 发出警告,提示 SEWFeatureExtractor 类已过时,建议使用 SEWFeatureEncoder 类
warnings.warn(
f"The class `{self.__class__.__name__}` has been depreciated "
"and will be removed in Transformers v5. "
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
FutureWarning,
)
# 从 transformers.models.bart.modeling_bart.BartAttention 复制并修改为 SEWAttention
class SEWAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[SEWConfig] = None,
):
# 初始化函数,定义注意力机制的参数
super().__init__()
self.embed_dim = embed_dim # 注意力机制的输入维度
self.num_heads = num_heads # 注意力头的数量
self.dropout = dropout # Dropout 概率
self.head_dim = embed_dim // num_heads # 每个头的维度
self.config = config # SEW 的配置对象
# 检查 embed_dim 是否可以被 num_heads 整除,否则抛出错误
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5 # 缩放因子
self.is_decoder = is_decoder # 是否为解码器
self.is_causal = is_causal # 是否是因果的
# 线性变换层,用于计算 Q、K、V 矩阵
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# 将输入张量 tensor 重新形状为 (bsz, seq_len, num_heads, head_dim) 并转置
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# 前向传播函数,接受多个输入参数并进行注意力计算
pass # 此处未实现具体功能,需要根据具体的注意力机制进行实现
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward 复制并修改为 SEWFeedForward
class SEWFeedForward(nn.Module):
def __init__(self, config):
# 初始化函数,接受一个配置参数 config
super().__init__()
self.intermediate_dropout = nn.Dropout(config.activation_dropout)
# 中间层线性变换,用于激活函数前的线性变换
self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# 输出层线性变换,用于最终输出
self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.output_dropout = nn.Dropout(config.hidden_dropout)
# Forward pass taking the hidden states as input
def forward(self, hidden_states):
# Project the hidden states with the intermediate dense layer
hidden_states = self.intermediate_dense(hidden_states)
# Apply the intermediate activation function
hidden_states = self.intermediate_act_fn(hidden_states)
# Intermediate dropout after the activation
hidden_states = self.intermediate_dropout(hidden_states)
# Project back to the hidden size with the output dense layer
hidden_states = self.output_dense(hidden_states)
# Output dropout before returning
hidden_states = self.output_dropout(hidden_states)
return hidden_states
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer复制而来,将Wav2Vec2替换为SEW
class SEWEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
# 初始化注意力机制,使用SEWAttention类
self.attention = SEWAttention(
embed_dim=config.hidden_size, # 设置嵌入维度为隐藏大小
num_heads=config.num_attention_heads, # 设置注意力头数
dropout=config.attention_dropout, # 设置注意力机制的dropout率
is_decoder=False,
)
# 随机失活层,使用隐藏dropout率
self.dropout = nn.Dropout(config.hidden_dropout)
# 层归一化,使用隐藏大小和层归一化epsilon值
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 前馈神经网络,使用SEWFeedForward类
self.feed_forward = SEWFeedForward(config)
# 最终层归一化,使用隐藏大小和层归一化epsilon值
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
# 注意力残差连接
attn_residual = hidden_states
# 执行注意力计算
hidden_states, attn_weights, _ = self.attention(
hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
)
# 应用dropout
hidden_states = self.dropout(hidden_states)
# 添加注意力残差到隐藏状态
hidden_states = attn_residual + hidden_states
# 应用层归一化
hidden_states = self.layer_norm(hidden_states)
# 添加前馈神经网络输出到隐藏状态
hidden_states = hidden_states + self.feed_forward(hidden_states)
# 最终层归一化
hidden_states = self.final_layer_norm(hidden_states)
# 构建输出元组
outputs = (hidden_states,)
# 如果需要输出注意力权重,添加到输出元组中
if output_attentions:
outputs += (attn_weights,)
return outputs
class SEWEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# SEW位置卷积嵌入
self.pos_conv_embed = SEWPositionalConvEmbedding(config)
# 平均池化层,使用squeeze因子配置
self.pool = nn.AvgPool1d(config.squeeze_factor, config.squeeze_factor)
# 层归一化,使用隐藏大小和层归一化epsilon值
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# 随机失活层,使用隐藏dropout率
self.dropout = nn.Dropout(config.hidden_dropout)
# SEW编码器层列表,根据隐藏层数配置
self.layers = nn.ModuleList([SEWEncoderLayer(config) for _ in range(config.num_hidden_layers)])
# SEW上采样
self.upsample = SEWUpsampling(config)
# 梯度检查点,默认为关闭
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
pass # 此处省略forward方法实现,仅给出了方法签名
class SEWPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = SEWConfig  # configuration class for the SEW model
base_model_prefix = "sew"  # prefix of the base model attributes
main_input_name = "input_values"  # name of the main model input
supports_gradient_checkpointing = True  # gradient checkpointing is supported
def _init_weights(self, module):
"""Initialize the weights""" # 定义一个初始化权重的函数,参数是一个神经网络模块
if isinstance(module, SEWPositionalConvEmbedding):
# 对 SEWPositionalConvEmbedding 类型的模块,使用正态分布初始化卷积层的权重,均值为 0,标准差为根据核大小和输入通道数计算的值
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
# 将卷积层的偏置初始化为 0
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, nn.Linear):
# 对线性层,使用正态分布初始化权重,均值为 0,标准差为配置中的初始化范围
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
# 对 LayerNorm 和 GroupNorm 模块,偏置初始化为 0,权重初始化为 1
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
if is_deepspeed_zero3_enabled():
import deepspeed # 导入 deepspeed 库
# 如果启用了 DeepSpeed 的 Zero-3,且卷积层有 weight_v 和 weight_g 属性,使用 GatheredParameters 初始化权重
if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
nn.init.kaiming_normal_(module.weight.data)
else:
with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
nn.init.kaiming_normal_(module.weight.data)
else:
# 否则使用 He 初始化方法,适用于 ReLU 激活函数
nn.init.kaiming_normal_(module.weight.data)
# 对于 Linear 和 Conv1d 类型的模块,且具有偏置的,偏置初始化为 0
if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None:
module.bias.data.zero_()
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
"""
Computes the output length of the convolutional layers
""" # 定义一个计算卷积层输出长度的函数,输入是一个长度张量或整数
def _conv_out_length(input_length, kernel_size, stride):
# 计算 1D 卷积层输出长度的公式,取自 PyTorch 文档
# input_length 是输入长度,kernel_size 是卷积核大小,stride 是步幅
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
# 遍历配置中的卷积核大小和步幅,依次计算输出长度
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
# 返回计算得到的输出长度
return input_lengths
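With the default kernel and stride schedule from `SEWConfig`, this recurrence yields roughly one feature frame per 320 input samples; a quick sanity check assuming the defaults listed earlier:
```
length = 16000                                       # 1 second of 16 kHz audio
for kernel_size, stride in zip(
    (10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1),        # default conv_kernel
    (5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1),         # default conv_stride
):
    length = (length - kernel_size) // stride + 1
print(length)                                        # 49 feature frames
```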
# 定义一个方法用于生成特征向量的注意力掩码
def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
# 计算输出长度,即根据注意力掩码每个样本的有效长度来确定输出的长度
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# 获取批次大小
batch_size = attention_mask.shape[0]
# 初始化一个全零的注意力掩码张量,形状为(batch_size, feature_vector_length)
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
# 将输出长度前的所有位置设为1,以确保在这些位置之前的所有值都被注意到
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
# 将注意力掩码张量沿最后一个维度翻转,累加并再次翻转,并转换为布尔类型,以生成正确的注意力掩码
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
# 返回生成的注意力掩码张量
return attention_mask
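The flip → cumsum → flip trick marks only the last valid frame of each example and then back-fills everything before it, turning a single 1 into a full prefix mask. A toy illustration with made-up lengths:
```
import torch

attention_mask = torch.zeros(2, 6, dtype=torch.long)
output_lengths = torch.tensor([3, 5])
attention_mask[(torch.arange(2), output_lengths - 1)] = 1
print(attention_mask)
# tensor([[0, 0, 1, 0, 0, 0],
#         [0, 0, 0, 0, 1, 0]])

print(attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool())
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])
```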
"""
SEW was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech
Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger,
Yoav Artzi.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving etc.).
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html) subclass. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`SEWConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare SEW Model transformer outputting raw hidden-states without any specific head on top.",
SEW_START_DOCSTRING,
)
class SEWModel(SEWPreTrainedModel):
# 初始化函数,接收一个SEWConfig类型的配置对象作为参数
def __init__(self, config: SEWConfig):
# 调用父类的初始化方法,传入配置对象作为参数
super().__init__(config)
# 将配置对象保存到实例变量self.config中
self.config = config
# 使用配置对象创建SEWFeatureEncoder类型的特征提取器实例,并保存到self.feature_extractor中
self.feature_extractor = SEWFeatureEncoder(config)
# 创建一个LayerNorm层,用于归一化最后一个卷积层的输出,参数为config.conv_dim[-1],eps为config.layer_norm_eps
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
# 判断是否需要对特征进行投影
self.project_features = config.conv_dim[-1] != config.hidden_size
if self.project_features:
# 如果需要投影,创建一个Linear层,将config.conv_dim[-1]维的特征投影到config.hidden_size维
self.feature_projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
# 创建一个Dropout层,用于特征投影后的dropout,dropout率为config.feat_proj_dropout
self.feature_dropout = nn.Dropout(config.feat_proj_dropout)
# 如果配置中mask_time_prob或mask_feature_prob大于0,创建一个参数化的特征嵌入张量
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
# 使用配置对象创建SEWEncoder类型的编码器实例,并保存到self.encoder中
self.encoder = SEWEncoder(config)
# 初始化权重并进行最终处理
self.post_init()
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states复制而来
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
# compute mask indices for time axis if not provided
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
# apply SpecAugment along time axis using computed mask indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
# expand feature mask indices to match the shape of hidden_states and apply
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
# 如果未提供输出注意力机制,则使用配置中的默认值
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果未提供输出隐藏状态,则使用配置中的默认值
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未提供返回字典选项,则使用配置中的默认值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 提取输入特征
extract_features = self.feature_extractor(input_values)
# 调整特征的维度顺序
extract_features = extract_features.transpose(1, 2)
# 应用层归一化到特征
extract_features = self.layer_norm(extract_features)
# 如果需要对特征进行投影
if self.project_features:
extract_features = self.feature_projection(extract_features)
# 使用特征丢弃层处理特征
hidden_states = self.feature_dropout(extract_features)
# 如果提供了注意力掩码
if attention_mask is not None:
# 计算与特征向量对应的减少注意力掩码
attention_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
# 对隐藏状态进行掩码处理,根据时间索引进行掩码
hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)
# 编码器处理隐藏状态
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 取编码器输出的第一个元素作为隐藏状态
hidden_states = encoder_outputs[0]
# 如果不需要返回字典形式的输出
if not return_dict:
# 返回元组形式的结果,包含隐藏状态和额外的编码器输出
return (hidden_states,) + encoder_outputs[1:]
# 返回基础模型输出对象,包含最后的隐藏状态、所有隐藏状态和注意力权重
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"""SEW Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
SEW_START_DOCSTRING,
)
# The `add_start_docstrings` decorator above documents this as a SEW model with a `language modeling` head on top
# for Connectionist Temporal Classification (CTC); SEW_START_DOCSTRING is the predefined shared model docstring.
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->SEW, wav2vec2->sew, WAV_2_VEC_2->SEW
class SEWForCTC(SEWPreTrainedModel):
def __init__(self, config, target_lang: Optional[str] = None):
super().__init__(config)
# 初始化SEW模型
self.sew = SEWModel(config)
# 使用config中的final_dropout创建一个dropout层
self.dropout = nn.Dropout(config.final_dropout)
self.target_lang = target_lang
# 如果config中未定义语言模型头的词汇量大小,则抛出异常
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `SEWForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
# 根据config的设置选择输出隐藏大小
output_hidden_size = (
config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
)
# 创建一个线性层作为语言模型头,连接隐藏大小和词汇量大小
self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
# 初始化权重并应用最终处理
self.post_init()
def tie_weights(self):
"""
Overrides `~PreTrainedModel.tie_weights` so that adapter weights can be correctly loaded when passing
`target_lang=...` to `from_pretrained(...)`. This method is not supposed to be called by the user and is
prone to be changed in the future.
"""
# Note that `tie_weights` is usually used to tie input and output embedding weights. It is re-purposed here so
# that SEW can load adapter layers without introducing a new API. While slightly hacky, SEW never has to tie
# its input and output embeddings, so re-using this function is fine.
target_lang = self.target_lang
# 如果target_lang不为None,并且config中未定义adapter_attn_dim,则抛出异常
if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
# 如果target_lang为None,并且config中定义了adapter_attn_dim,则记录信息提示,默认将target_lang设置为'eng'
elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
logger.info("By default `target_lang` is set to 'eng'.")
# 如果target_lang不为None,则加载适配器
elif target_lang is not None:
self.load_adapter(target_lang, force_load=True)
# 调用此函数将禁用特征编码器的梯度计算,使其参数在训练过程中不会更新
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 发出警告,提醒方法`freeze_feature_extractor`已过时,并将在 Transformers v5 中移除。请使用等效的`freeze_feature_encoder`方法。
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
# 调用等效的`freeze_feature_encoder`方法,冻结特征编码器的参数
self.freeze_feature_encoder()
# 调用此函数将禁用特征编码器的梯度计算,使其参数在训练过程中不会更新
def freeze_feature_encoder(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
"""
# 冻结特征编码器的参数
self.sew.feature_extractor._freeze_parameters()
# 调用此函数将禁用基础模型的梯度计算,使其参数在训练过程中不会更新。只有分类头部会更新。
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
# 遍历self.sew对象的参数,并设置requires_grad为False,以禁用梯度计算
for param in self.sew.parameters():
param.requires_grad = False
# 使用add_start_docstrings_to_model_forward和add_code_sample_docstrings为模型的forward方法添加文档字符串
@add_start_docstrings_to_model_forward(SEW_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
# 定义模型的forward方法,用于执行前向传播
def forward(
self,
input_values: Optional[torch.Tensor], # 输入值,类型为可选的torch.Tensor
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码,类型为可选的torch.Tensor,默认为None
output_attentions: Optional[bool] = None, # 是否输出注意力权重,类型为可选的bool,默认为None
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,类型为可选的bool,默认为None
return_dict: Optional[bool] = None, # 是否返回字典形式的结果,类型为可选的bool,默认为None
labels: Optional[torch.Tensor] = None, # 标签,类型为可选的torch.Tensor,默认为None
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
# Decide whether to return a dictionary or not based on the provided argument or configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Perform sequence to sequence processing using the model's encoder-decoder structure
outputs = self.sew(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract hidden states from the model's output and apply dropout regularization
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states)
# Generate logits from the processed hidden states using the language model head
logits = self.lm_head(hidden_states)
# Initialize loss as None; compute CTC (Connectionist Temporal Classification) loss if labels are provided
loss = None
if labels is not None:
# Validate label values to ensure they are within the vocabulary size
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
# Calculate input lengths based on the attention mask
attention_mask = (
attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
# Determine which labels are valid and compute target lengths
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
# Apply log softmax to logits and transpose dimensions for CTC loss computation
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
# Compute CTC loss with adjustments for padding and configuration settings
with torch.backends.cudnn.flags(enabled=False):
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
# If return_dict is False, format the output tuple accordingly
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# If return_dict is True, construct and return a CausalLMOutput object with relevant attributes
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
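The label handling and the CTC call above can be exercised on toy tensors. A hedged sketch (every shape and id here is made up; `-100` marks padded label positions exactly as in the forward pass):
```
import torch
from torch import nn

vocab_size, pad_token_id = 32, 0
logits = torch.randn(2, 50, vocab_size)                   # (batch, frames, vocab)
labels = torch.tensor([[7, 8, 9, -100], [4, 5, -100, -100]])

labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)                      # tensor([3, 2])
flattened_targets = labels.masked_select(labels_mask)     # tensor([7, 8, 9, 4, 5])
input_lengths = torch.tensor([50, 50])                    # frames per example

log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
loss = nn.functional.ctc_loss(
    log_probs, flattened_targets, input_lengths, target_lengths,
    blank=pad_token_id, reduction="mean", zero_infinity=False,
)
print(loss.item())
```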
# SEW 模型,顶部带有一个序列分类头(在汇总输出之上的线性层),用于诸如 SUPERB 关键词识别等任务。
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification 复制而来,将 Wav2Vec2 改为 SEW,wav2vec2 改为 sew,WAV_2_VEC_2 改为 SEW。
class SEWForSequenceClassification(SEWPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 如果配置允许添加适配器且为真,则引发值错误,因为序列分类不支持使用 SEW 适配器(config.add_adapter=True)。
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of SEW adapters (config.add_adapter=True)"
)
# 创建 SEW 模型对象
self.sew = SEWModel(config)
# 计算层数:变压器层数 + 输入嵌入层
num_layers = config.num_hidden_layers + 1
# 如果配置使用加权层求和,则初始化层权重参数为均匀值
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
# 线性投影层,将隐藏状态大小映射到分类器投影大小
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
# 分类器层,将分类器投影大小映射到类别数量
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
# 初始化权重并应用最终处理
self.post_init()
# 冻结特征提取器,不再计算特征编码器的梯度,使其在训练期间不更新
def freeze_feature_extractor(self):
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
# 冻结特征编码器,不再计算特征编码器的梯度,使其在训练期间不更新
def freeze_feature_encoder(self):
self.sew.feature_extractor._freeze_parameters()
# 冻结基础模型,不再计算基础模型的梯度,使其在训练期间不更新,仅更新分类头
def freeze_base_model(self):
for param in self.sew.parameters():
param.requires_grad = False
# 在模型前向方法上添加文档字符串注释,详细描述输入和输出
@add_start_docstrings_to_model_forward(SEW_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_SEQ_CLASS_CHECKPOINT,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 设置是否返回字典形式的输出,默认从模型配置中获取
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 根据配置决定是否输出隐藏状态
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
# 调用序列编码器模块进行前向传播
outputs = self.sew(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 如果配置要求使用加权层求和,则进行相应的处理
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
hidden_states = torch.stack(hidden_states, dim=1)
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
hidden_states = outputs[0]
# 将处理后的隐藏状态传递给投影层进行处理
hidden_states = self.projector(hidden_states)
# 如果没有提供注意力掩码,则对隐藏状态进行均值池化
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1)
else:
# 根据注意力掩码生成填充掩码,并对隐藏状态进行相应的处理
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
hidden_states[~padding_mask] = 0.0
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
# 对池化后的输出应用分类器以获得最终的分类预测
logits = self.classifier(pooled_output)
# 如果提供了标签,则计算损失
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
# 根据是否返回字典形式的输出进行结果的返回
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
# 返回序列分类器的输出,包括损失、预测 logits、隐藏状态和注意力权重
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
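The attention-masked mean pooling used right before the classifier can be reproduced on its own. A small sketch with toy shapes:
```
import torch

hidden_states = torch.randn(2, 5, 8)                      # (batch, frames, hidden)
padding_mask = torch.tensor([[1, 1, 1, 0, 0],
                             [1, 1, 1, 1, 1]]).bool()     # valid frames per example

hidden_states[~padding_mask] = 0.0
pooled = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
print(pooled.shape)                                       # torch.Size([2, 8]); padded frames do not dilute the mean
```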