Transformers 源码解析(四十三)
.\models\efficientformer\image_processing_efficientformer.py
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_batched,
is_scaled_image,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class EfficientFormerImageProcessor(BaseImageProcessor):
r"""
Constructs a EfficientFormer image processor.
"""
def __init__(self):
super().__init__()
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
`preprocess` method.
crop_size (`Dict[str, int]` *optional*, defaults to 224):
Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize:
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
# 初始化方法,设置预处理参数
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_center_crop: bool = True,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
crop_size: Dict[str, int] = None,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
# 初始化函数,继承父类并设置参数
def __init__(
self,
do_resize: bool = False,
do_rescale: bool = False,
do_normalize: bool = False,
do_center_crop: bool = False,
size: Optional[Dict[str, int]] = None,
crop_size: Optional[Dict[str, int]] = None,
resample: Optional[PILImageResampling] = None,
rescale_factor: Optional[float] = None,
image_mean: Optional[List[float]] = None,
image_std: Optional[List[float]] = None,
**kwargs,
) -> None:
# 调用父类初始化函数,并传递额外的参数
super().__init__(**kwargs)
# 如果未指定 size 参数,默认设置为 224x224
size = size if size is not None else {"height": 224, "width": 224}
# 使用函数处理 size 参数,确保格式正确
size = get_size_dict(size)
# 如果未指定 crop_size 参数,默认设置为 224x224,并采用默认的正方形裁剪
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
# 使用函数处理 crop_size 参数,采用默认的正方形裁剪
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
# 设置各种图像处理标志
self.do_resize = do_resize
self.do_rescale = do_rescale
self.do_normalize = do_normalize
self.do_center_crop = do_center_crop
# 设置裁剪尺寸
self.crop_size = crop_size
# 设置图像大小
self.size = size
# 设置重采样方式
self.resample = resample
# 设置图像缩放因子
self.rescale_factor = rescale_factor
# 设置图像均值,默认为 IMAGENET_DEFAULT_MEAN
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
# 设置图像标准差,默认为 IMAGENET_DEFAULT_STD
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
# 设置有效的处理键列表
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample:
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
Returns:
`np.ndarray`: The resized image.
"""
# 获取经过处理后的尺寸字典
size = get_size_dict(size)
# 如果 size 字典中有 "shortest_edge" 键,根据最短边调整图片大小
if "shortest_edge" in size:
# 调用函数计算输出图片的大小
size = get_resize_output_image_size(
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
)
# 如果 size 中有 "height" 和 "width" 键,将它们作为尺寸
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
# 如果 size 中既不含 "shortest_edge" 也不含完整的尺寸信息,则抛出异常
raise ValueError(f"Size must contain 'height' and 'width' keys or 'shortest_edge' key. Got {size.keys()}")
# 调用 resize 函数进行图片大小调整,并返回调整后的图片
return resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
.\models\efficientformer\modeling_efficientformer.py
""" PyTorch EfficientFormer 模型。"""
import itertools
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_efficientformer import EfficientFormerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "EfficientFormerConfig"
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"
EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"snap-research/efficientformer-l1-300",
]
class EfficientFormerPatchEmbeddings(nn.Module):
"""
此类在两个阶段之间执行下采样。对于形状为 [batch_size, num_channels, height, width] 的输入张量,
它生成形状为 [batch_size, num_channels, height/stride, width/stride] 的输出张量。
"""
def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True):
super().__init__()
self.num_channels = num_channels
self.projection = nn.Conv2d(
num_channels,
embed_dim,
kernel_size=config.downsample_patch_size,
stride=config.downsample_stride,
padding=config.downsample_pad,
)
self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps) if apply_norm else nn.Identity()
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.projection(pixel_values)
embeddings = self.norm(embeddings)
return embeddings
class EfficientFormerSelfAttention(nn.Module):
def __init__(self, dim: int, key_dim: int, num_heads: int, attention_ratio: int, resolution: int):
super().__init__()
self.num_heads = num_heads
self.key_dim = key_dim
self.attention_ratio = attention_ratio
self.scale = key_dim**-0.5
self.total_key_dim = key_dim * num_heads
self.expanded_key_dim = int(attention_ratio * key_dim)
self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2
self.qkv = nn.Linear(dim, hidden_size)
self.projection = nn.Linear(self.total_expanded_key_dim, dim)
points = list(itertools.product(range(resolution), range(resolution)))
num_points = len(points)
attention_offsets = {}
idxs = []
for point_1 in points:
for point_2 in points:
offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
if offset not in attention_offsets:
attention_offsets[offset] = len(attention_offsets)
idxs.append(attention_offsets[offset])
self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets)))
self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(num_points, num_points))
@torch.no_grad()
def train(self, mode=True):
super().train(mode)
if mode and hasattr(self, "ab"):
del self.ab
else:
self.ab = self.attention_biases[:, self.attention_bias_idxs]
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
batch_size, sequence_length, num_channels = hidden_states.shape
qkv = self.qkv(hidden_states)
query_layer, key_layer, value_layer = qkv.reshape(batch_size, sequence_length, self.num_heads, -1).split(
[self.key_dim, self.key_dim, self.expanded_key_dim], dim=3
)
query_layer = query_layer.permute(0, 2, 1, 3)
key_layer = key_layer.permute(0, 2, 1, 3)
value_layer = value_layer.permute(0, 2, 1, 3)
if not self.training:
self.ab = self.ab.to(self.attention_biases.device)
attention_probs = (torch.matmul(query_layer, key_layer.transpose(-2, -1))) * self.scale + (
self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab
)
attention_probs = attention_probs.softmax(dim=-1)
context_layer = torch.matmul(attention_probs, value_layer).transpose(1, 2)
context_layer = context_layer.reshape(batch_size, sequence_length, self.total_expanded_key_dim)
context_layer = self.projection(context_layer)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class EfficientFormerConvStem(nn.Module):
def __init__(self, config: EfficientFormerConfig, out_channels: int):
super().__init__()
self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1)
self.batchnorm_before = nn.BatchNorm2d(out_channels // 2, eps=config.batch_norm_eps)
self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1)
self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)
self.activation = nn.ReLU()
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
features = self.batchnorm_before(self.convolution1(pixel_values))
features = self.activation(features)
features = self.batchnorm_after(self.convolution2(features))
features = self.activation(features)
return features
class EfficientFormerPooling(nn.Module):
def __init__(self, pool_size: int):
super().__init__()
self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
output = self.pool(hidden_states) - hidden_states
return output
class EfficientFormerDenseMlp(nn.Module):
def __init__(
self,
config: EfficientFormerConfig,
in_features: int,
hidden_features: Optional[int] = None,
out_features: Optional[int] = None,
):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.linear_in = nn.Linear(in_features, hidden_features)
self.activation = ACT2FN[config.hidden_act]
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.linear_out = nn.Linear(hidden_features, out_features)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.linear_in(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.linear_out(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class EfficientFormerConvMlp(nn.Module):
def __init__(
self,
config: EfficientFormerConfig,
in_features: int,
hidden_features: Optional[int] = None,
out_features: Optional[int] = None,
drop: float = 0.0,
):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.convolution1 = nn.Conv2d(in_features, hidden_features, 1)
self.activation = ACT2FN[config.hidden_act]
self.convolution2 = nn.Conv2d(hidden_features, out_features, 1)
self.dropout = nn.Dropout(drop)
self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps)
self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.convolution1(hidden_state)
hidden_state = self.batchnorm_before(hidden_state)
hidden_state = self.activation(hidden_state)
hidden_state = self.dropout(hidden_state)
hidden_state = self.convolution2(hidden_state)
hidden_state = self.batchnorm_after(hidden_state)
hidden_state = self.dropout(hidden_state)
return hidden_state
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class EfficientFormerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class EfficientFormerFlat(nn.Module):
def __init__(self):
super().__init__()
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
hidden_states = hidden_states.flatten(2).transpose(1, 2)
return hidden_states
class EfficientFormerMeta3D(nn.Module):
def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
super().__init__()
self.token_mixer = EfficientFormerSelfAttention(
dim=config.dim,
key_dim=config.key_dim,
num_heads=config.num_attention_heads,
attention_ratio=config.attention_ratio,
resolution=config.resolution,
)
self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim)
self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.use_layer_scale = config.use_layer_scale
if config.use_layer_scale:
self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)
self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
if self.use_layer_scale:
layer_output = hidden_states + self.drop_path(
self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attention_output
)
layer_output = layer_output + self.drop_path(
self.layer_scale_2.unsqueeze(0).unsqueeze(0) * self.mlp(self.layernorm2(layer_output))
)
else:
layer_output = hidden_states + self.drop_path(attention_output)
layer_output = layer_output + self.drop_path(self.mlp(self.layernorm2(layer_output)))
outputs = (layer_output,) + outputs
return outputs
class EfficientFormerMeta4DLayers(nn.Module):
def __init__(self, config: EfficientFormerConfig):
super().__init__()
drop_paths = [
config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
for block_idx in range(config.num_meta4d_blocks)
]
self.blocks = nn.ModuleList(
[EfficientFormerMeta4D(config, config.hidden_sizes[-1], drop_path=drop_path) for drop_path in drop_paths]
)
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
all_attention_outputs = () if output_attentions else None
for layer_module in self.blocks:
if isinstance(hidden_states, tuple):
hidden_states = hidden_states[0]
hidden_states = layer_module(hidden_states)
if output_attentions:
all_attention_outputs = all_attention_outputs + (hidden_states[1],)
if output_attentions:
outputs = (hidden_states[0],) + all_attention_outputs
return outputs
return hidden_states
def __init__(self, config: EfficientFormerConfig, stage_idx: int):
super().__init__()
num_layers = (
config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
)
drop_paths = [
config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
]
self.blocks = nn.ModuleList(
[
EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path)
for drop_path in drop_paths
]
)
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
for layer_module in self.blocks:
hidden_states = layer_module(hidden_states)
return hidden_states
class EfficientFormerIntermediateStage(nn.Module):
def __init__(self, config: EfficientFormerConfig, index: int):
super().__init__()
self.meta4D_layers = EfficientFormerMeta4DLayers(config, index)
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
hidden_states = self.meta4D_layers(hidden_states)
return hidden_states
class EfficientFormerLastStage(nn.Module):
def __init__(self, config: EfficientFormerConfig):
super().__init__()
self.meta4D_layers = EfficientFormerMeta4DLayers(config, -1)
self.flat = EfficientFormerFlat()
self.meta3D_layers = EfficientFormerMeta3DLayers(config)
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
hidden_states = self.meta4D_layers(hidden_states)
hidden_states = self.flat(hidden_states)
hidden_states = self.meta3D_layers(hidden_states, output_attentions)
return hidden_states
class EfficientFormerEncoder(nn.Module):
def __init__(self, config: EfficientFormerConfig):
super().__init__()
self.config = config
num_intermediate_stages = len(config.depths) - 1
downsamples = [
config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1]
for i in range(num_intermediate_stages)
]
intermediate_stages = []
for i in range(num_intermediate_stages):
intermediate_stages.append(EfficientFormerIntermediateStage(config, i))
if downsamples[i]:
intermediate_stages.append(
EfficientFormerPatchEmbeddings(config, config.hidden_sizes[i], config.hidden_sizes[i + 1])
)
self.intermediate_stages = nn.ModuleList(intermediate_stages)
self.last_stage = EfficientFormerLastStage(config)
def forward(
self,
hidden_states: torch.Tensor,
output_hidden_states: bool = False,
output_attentions: bool = False,
return_dict: bool = True,
) -> BaseModelOutput:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
for layer_module in self.intermediate_stages:
hidden_states = layer_module(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_output = self.last_stage(hidden_states, output_attentions=output_attentions)
if output_attentions:
all_self_attentions = all_self_attentions + layer_output[1:]
if output_hidden_states:
all_hidden_states = all_hidden_states + (layer_output[0],)
if not return_dict:
return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=layer_output[0],
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
@add_start_docstrings(
"The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
EFFICIENTFORMER_START_DOCSTRING,
)
class EfficientFormerModel(EfficientFormerPreTrainedModel):
"""
EfficientFormerModel extends EfficientFormerPreTrainedModel and represents a transformer model architecture
without specific task heads.
Args:
config (EfficientFormerConfig): The configuration class for initializing the model.
Attributes:
patch_embed (EfficientFormerConvStem): Patch embedding layer.
encoder (EfficientFormerEncoder): Transformer encoder.
layernorm (nn.LayerNorm): Layer normalization for the final output.
Methods:
forward: Implements the forward pass of the model.
Inherits from:
EfficientFormerPreTrainedModel: Handles weights initialization and pretrained model loading interface.
"""
def __init__(self, config: EfficientFormerConfig):
super().__init__(config)
self.config = config
self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0])
self.encoder = EfficientFormerEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
self.post_init()
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
def forward(self, pixel_values, output_attentions=False, output_hidden_states=False, return_dict=True):
"""
Defines the forward pass of EfficientFormerModel.
Args:
pixel_values (torch.FloatTensor): Input pixel values of shape (batch_size, num_channels, height, width).
output_attentions (bool, optional): Whether to return attention tensors of all layers.
output_hidden_states (bool, optional): Whether to return hidden states of all layers.
return_dict (bool, optional): Whether to return a ModelOutput instead of a tuple.
Returns:
ModelOutput or tuple:
Depending on `return_dict`, either:
- ModelOutput if `return_dict=True` (default),
- A tuple of torch.FloatTensor otherwise.
"""
pass
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.patch_embed(pixel_values)
encoder_outputs = self.encoder(
embedding_output, output_attentions=output_attentions, output_hidden_states=output_hidden_states
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
if not return_dict:
head_outputs = (sequence_output,)
return head_outputs + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"""
EfficientFormer Model transformer with an image classification head on top (a linear layer on top of the final
hidden state of the [CLS] token) e.g. for ImageNet.
""",
EFFICIENTFORMER_START_DOCSTRING,
)
class EfficientFormerForImageClassification(EfficientFormerPreTrainedModel):
def __init__(self, config: EfficientFormerConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.efficientformer = EfficientFormerModel(config)
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.efficientformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output.mean(-2))
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@dataclass
class EfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
"""
[`EfficientFormerForImageClassificationWithTeacher`] 的输出类型。
Args:
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
预测分数,是 `cls_logits` 和 `distillation_logits` 的平均值。
cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类头部的预测分数(即最终隐藏状态的类标记之上的线性层)。
distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
蒸馏头部的预测分数(即最终隐藏状态的蒸馏标记之上的线性层)。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 传递或 `config.output_hidden_states=True` 时返回):
`torch.FloatTensor` 元组(一个用于嵌入的输出 + 每层的输出)的形状为 `(batch_size, sequence_length, hidden_size)`。
模型在每层输出的隐藏状态加上初始嵌入的输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, 当 `output_attentions=True` 传递或 `config.output_attentions=True` 时返回):
`torch.FloatTensor` 元组(每层一个)的形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
在注意力 softmax 之后的注意力权重,用于计算自注意力头中的加权平均值。
"""
@add_start_docstrings(
"""
`EfficientFormer` 模型变换器,其顶部包含图像分类头部(一个在 [CLS] 标记的最终隐藏状态之上的线性层,一个在蒸馏标记的最终隐藏状态之上的线性层),
例如用于 ImageNet。
<Tip warning={true}>
此模型仅支持推断。目前不支持使用蒸馏进行微调(即带有教师模型)。
</Tip>
""",
EFFICIENTFORMER_START_DOCSTRING,
)
class EfficientFormerForImageClassificationWithTeacher(EfficientFormerPreTrainedModel):
def __init__(self, config: EfficientFormerConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.efficientformer = EfficientFormerModel(config)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.distillation_classifier = (
nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=EfficientFormerForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, EfficientFormerForImageClassificationWithTeacherOutput]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.efficientformer(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
cls_logits = self.classifier(sequence_output.mean(-2))
distillation_logits = self.distillation_classifier(sequence_output.mean(-2))
logits = (cls_logits + distillation_logits) / 2
if not return_dict:
output = (logits, cls_logits, distillation_logits) + outputs[1:]
return output
return EfficientFormerForImageClassificationWithTeacherOutput(
logits=logits,
cls_logits=cls_logits,
distillation_logits=distillation_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
.\models\efficientformer\modeling_tf_efficientformer.py
""" TensorFlow EfficientFormer 模型。"""
import itertools
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import ACT2FN
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFImageClassifierOutput,
)
from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_efficientformer import EfficientFormerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "EfficientFormerConfig"
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_281"
TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"snap-research/efficientformer-l1-300",
]
class TFEfficientFormerPatchEmbeddings(keras.layers.Layer):
"""
此类在两个阶段之间执行下采样。
对于形状为 [batch_size, num_channels, height, width] 的输入张量,
它产生形状为 [batch_size, num_channels, height/stride, width/stride] 的输出张量。
"""
def __init__(
self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True, **kwargs
):
super().__init__(**kwargs)
) -> None:
super().__init__(**kwargs)
self.num_channels = num_channels
self.padding = keras.layers.ZeroPadding2D(padding=config.downsample_pad)
self.projection = keras.layers.Conv2D(
filters=embed_dim,
kernel_size=config.downsample_patch_size,
strides=config.downsample_stride,
padding="valid",
name="projection",
)
self.norm = (
keras.layers.BatchNormalization(axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="norm")
if apply_norm
else tf.identity
)
self.embed_dim = embed_dim
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
tf.debugging.assert_shapes(
[(pixel_values, (..., None, None, self.num_channels))],
message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
)
embeddings = self.projection(self.padding(pixel_values))
embeddings = self.norm(embeddings, training=training)
return embeddings
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, None, self.num_channels])
if getattr(self, "norm", None) is not None:
if hasattr(self.norm, "name"):
with tf.name_scope(self.norm.name):
self.norm.build([None, None, None, self.embed_dim])
class TFEfficientFormerSelfAttention(keras.layers.Layer):
def __init__(
self,
dim: int,
key_dim: int,
num_heads: int,
attention_ratio: int,
resolution: int,
config: EfficientFormerConfig,
**kwargs,
):
super().__init__(**kwargs)
self.num_heads = num_heads
self.key_dim = key_dim
self.attention_ratio = attention_ratio
self.scale = key_dim**-0.5
self.total_key_dim = key_dim * num_heads
self.expanded_key_dim = int(attention_ratio * key_dim)
self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2
self.qkv = keras.layers.Dense(
units=hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
)
self.projection = keras.layers.Dense(
units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
)
self.resolution = resolution
self.dim = dim
def build(self, input_shape: tf.TensorShape) -> None:
points = list(itertools.product(range(self.resolution), range(self.resolution)))
num_points = len(points)
attention_offsets = {}
idxs = []
for point_1 in points:
for point_2 in points:
offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
if offset not in attention_offsets:
attention_offsets[offset] = len(attention_offsets)
idxs.append(attention_offsets[offset])
self.attention_biases = self.add_weight(
shape=(self.num_heads, len(attention_offsets)),
initializer=keras.initializers.zeros(),
trainable=True,
name="attention_biases",
)
self.attention_bias_idxs = self.add_weight(
shape=(num_points, num_points),
trainable=False,
dtype=tf.int32,
name="attention_bias_idxs",
)
self.attention_bias_idxs.assign(tf.reshape(tf.cast(idxs, dtype=tf.int32), (num_points, num_points)))
if self.built:
return
self.built = True
if getattr(self, "qkv", None) is not None:
with tf.name_scope(self.qkv.name):
self.qkv.build([None, None, self.dim])
if getattr(self, "projection", None) is not None:
with tf.name_scope(self.projection.name):
self.projection.build([None, None, self.total_expanded_key_dim])
def call(
self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
):
) -> Tuple[tf.Tensor]:
batch_size, sequence_length, *_ = shape_list(hidden_states)
qkv = self.qkv(inputs=hidden_states)
query_layer, key_layer, value_layer = tf.split(
tf.reshape(tensor=qkv, shape=(batch_size, sequence_length, self.num_heads, -1)),
num_or_size_splits=[self.key_dim, self.key_dim, self.expanded_key_dim],
axis=3,
)
query_layer = tf.transpose(query_layer, perm=[0, 2, 1, 3])
key_layer = tf.transpose(key_layer, perm=[0, 2, 1, 3])
value_layer = tf.transpose(value_layer, perm=[0, 2, 1, 3])
attention_probs = tf.matmul(query_layer, tf.transpose(key_layer, perm=[0, 1, 3, 2]))
scale = tf.cast(self.scale, dtype=attention_probs.dtype)
attention_probs = tf.multiply(attention_probs, scale)
attention_biases = tf.gather(params=self.attention_biases, indices=self.attention_bias_idxs, axis=1)
attention_probs = attention_probs + attention_biases
attention_probs = stable_softmax(logits=attention_probs, axis=-1)
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(
tensor=context_layer, shape=(batch_size, sequence_length, self.total_expanded_key_dim)
)
context_layer = self.projection(context_layer)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class TFEfficientFormerConvStem(keras.layers.Layer):
def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs):
super().__init__(**kwargs)
self.padding = keras.layers.ZeroPadding2D(padding=1)
self.convolution1 = keras.layers.Conv2D(
filters=out_channels // 2, kernel_size=3, strides=2, padding="valid", name="convolution1"
)
self.batchnorm_before = keras.layers.BatchNormalization(
axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
)
self.convolution2 = keras.layers.Conv2D(
filters=out_channels,
kernel_size=3,
strides=2,
padding="valid",
name="convolution2",
)
self.batchnorm_after = keras.layers.BatchNormalization(
axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
)
self.activation = keras.layers.Activation(activation=keras.activations.relu, name="activation")
self.out_channels = out_channels
self.config = config
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
features = self.batchnorm_before(self.convolution1(self.padding(pixel_values)), training=training)
features = self.activation(features)
features = self.batchnorm_after(self.convolution2(self.padding(features)), training=training)
features = self.activation(features)
return features
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution1", None) is not None:
with tf.name_scope(self.convolution1.name):
self.convolution1.build([None, None, None, self.config.num_channels])
if getattr(self, "batchnorm_before", None) is not None:
with tf.name_scope(self.batchnorm_before.name):
self.batchnorm_before.build([None, None, None, self.out_channels // 2])
if getattr(self, "convolution2", None) is not None:
with tf.name_scope(self.convolution2.name):
self.convolution2.build([None, None, None, self.out_channels // 2])
if getattr(self, "batchnorm_after", None) is not None:
with tf.name_scope(self.batchnorm_after.name):
self.batchnorm_after.build([None, None, None, self.out_channels])
if getattr(self, "activation", None) is not None:
with tf.name_scope(self.activation.name):
self.activation.build(None)
def __init__(self, pool_size: int, **kwargs):
super().__init__(**kwargs)
self.pool = keras.layers.AveragePooling2D(pool_size=pool_size, strides=1, padding="same")
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
output = self.pool(hidden_states)
output = output - hidden_states
return output
class TFEfficientFormerDenseMlp(keras.layers.Layer):
def __init__(
self,
config: EfficientFormerConfig,
in_features: int,
hidden_features: Optional[int] = None,
out_features: Optional[int] = None,
**kwargs,
):
super().__init__(**kwargs)
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.linear_in = keras.layers.Dense(
units=hidden_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_in"
)
self.activation = ACT2FN[config.hidden_act]
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.linear_out = keras.layers.Dense(
units=out_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_out"
)
self.hidden_features = hidden_features
self.in_features = in_features
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.linear_in(inputs=hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.linear_out(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "linear_in", None) is not None:
with tf.name_scope(self.linear_in.name):
self.linear_in.build([None, None, self.in_features])
if getattr(self, "linear_out", None) is not None:
with tf.name_scope(self.linear_out.name):
self.linear_out.build([None, None, self.hidden_features])
class TFEfficientFormerConvMlp(keras.layers.Layer):
def __init__(
self,
config: EfficientFormerConfig,
in_features: int,
hidden_features: Optional[int] = None,
out_features: Optional[int] = None,
drop: float = 0.0,
**kwargs,
):
super().__init__(**kwargs),
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.drop = drop
):
super().__init__(**kwargs)
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.convolution1 = keras.layers.Conv2D(
filters=hidden_features,
kernel_size=1,
name="convolution1",
padding="valid",
)
self.activation = ACT2FN[config.hidden_act]
self.convolution2 = keras.layers.Conv2D(
filters=out_features,
kernel_size=1,
name="convolution2",
padding="valid",
)
self.dropout = keras.layers.Dropout(rate=drop)
self.batchnorm_before = keras.layers.BatchNormalization(
axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
)
self.batchnorm_after = keras.layers.BatchNormalization(
axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
)
self.hidden_features = hidden_features
self.in_features = in_features
self.out_features = out_features
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution1(hidden_state)
hidden_state = self.batchnorm_before(hidden_state, training=training)
hidden_state = self.activation(hidden_state)
hidden_state = self.dropout(hidden_state, training=training)
hidden_state = self.convolution2(hidden_state)
hidden_state = self.batchnorm_after(hidden_state, training=training)
hidden_state = self.dropout(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution1", None) is not None:
with tf.name_scope(self.convolution1.name):
self.convolution1.build([None, None, None, self.in_features])
if getattr(self, "convolution2", None) is not None:
with tf.name_scope(self.convolution2.name):
self.convolution2.build([None, None, None, self.hidden_features])
if getattr(self, "batchnorm_before", None) is not None:
with tf.name_scope(self.batchnorm_before.name):
self.batchnorm_before.build([None, None, None, self.hidden_features])
if getattr(self, "batchnorm_after", None) is not None:
with tf.name_scope(self.batchnorm_after.name):
self.batchnorm_after.build([None, None, None, self.out_features])
class TFEfficientFormerDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
(1) github.com:rwightman/pytorch-image-models
"""
def __init__(self, drop_path: float, **kwargs):
super().__init__(**kwargs)
self.drop_path = drop_path
def call(self, x: tf.Tensor, training=None):
if training:
keep_prob = 1 - self.drop_path
shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
random_tensor = tf.floor(random_tensor)
return (x / keep_prob) * random_tensor
return x
class TFEfficientFormerFlat(keras.layers.Layer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def call(self, hidden_states: tf.Tensor) -> Tuple[tf.Tensor]:
batch_size, _, _, in_channels = shape_list(hidden_states)
hidden_states = tf.reshape(hidden_states, shape=[batch_size, -1, in_channels])
return hidden_states
class TFEfficientFormerMeta3D(keras.layers.Layer):
def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
super().__init__(**kwargs)
self.token_mixer = TFEfficientFormerSelfAttention(
dim=config.dim,
key_dim=config.key_dim,
num_heads=config.num_attention_heads,
attention_ratio=config.attention_ratio,
resolution=config.resolution,
name="token_mixer",
config=config,
)
self.dim = dim
self.config = config
self.layernorm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm1")
self.layernorm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm2")
mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
self.mlp = TFEfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim, name="mlp")
self.drop_path = (
TFEfficientFormerDropPath(drop_path)
if drop_path > 0.0
else keras.layers.Activation("linear", name="drop_path")
)
self.config = config
def build(self, input_shape=None):
self.layer_scale_1 = None
self.layer_scale_2 = None
if self.config.use_layer_scale:
self.layer_scale_1 = self.add_weight(
shape=(self.dim,),
initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
trainable=True,
name="layer_scale_1",
)
self.layer_scale_2 = self.add_weight(
shape=(self.dim,),
initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
trainable=True,
name="layer_scale_2",
)
if self.built:
return
self.built = True
if getattr(self, "token_mixer", None) is not None:
with tf.name_scope(self.token_mixer.name):
self.token_mixer.build(None)
if getattr(self, "layernorm1", None) is not None:
with tf.name_scope(self.layernorm1.name):
self.layernorm1.build([None, None, self.dim])
if getattr(self, "layernorm2", None) is not None:
with tf.name_scope(self.layernorm2.name):
self.layernorm2.build([None, None, self.dim])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
def call(
self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
) -> Tuple[tf.Tensor]:
self_attention_outputs = self.token_mixer(
hidden_states=self.layernorm1(hidden_states, training=training),
output_attentions=output_attentions,
training=training,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
if self.config.use_layer_scale:
layer_output = hidden_states + self.drop_path(
tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * attention_output,
training=training,
)
layer_output = layer_output + self.drop_path(
tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
* self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
training=training,
)
else:
layer_output = hidden_states + self.drop_path(attention_output, training=training)
layer_output = layer_output + self.drop_path(
self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
training=training,
)
outputs = (layer_output,) + outputs
return outputs
class TFEfficientFormerMeta3DLayers(keras.layers.Layer):
def __init__(self, config: EfficientFormerConfig, **kwargs):
super().__init__(**kwargs)
drop_paths = [
config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
for block_idx in range(config.num_meta3d_blocks)
]
self.blocks = [
TFEfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path, name=f"blocks.{i}")
for i, drop_path in enumerate(drop_paths)
]
def call(
self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
) -> Tuple[tf.Tensor]:
all_attention_outputs = () if output_attentions else None
for i, layer_module in enumerate(self.blocks):
if isinstance(hidden_states, tuple):
hidden_states = hidden_states[0]
hidden_states = layer_module(
hidden_states=hidden_states, output_attentions=output_attentions, training=training
)
if output_attentions:
all_attention_outputs = all_attention_outputs + (hidden_states[1],)
if output_attentions:
outputs = (hidden_states[0],) + all_attention_outputs
return outputs
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "blocks", None) is not None:
for layer in self.blocks:
with tf.name_scope(layer.name):
layer.build(None)
class TFEfficientFormerMeta4D(keras.layers.Layer):
def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
super().__init__(**kwargs)
pool_size = config.pool_size if config.pool_size is not None else 3
self.token_mixer = TFEfficientFormerPooling(pool_size=pool_size, name="token_mixer")
self.dim = dim
mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
self.mlp = TFEfficientFormerConvMlp(
config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob, name="mlp"
)
self.drop_path = (
TFEfficientFormerDropPath(drop_path, name="drop_path")
if drop_path > 0.0
else keras.layers.Activation("linear", name="drop_path")
)
self.config = config
def build(self, input_shape=None):
self.layer_scale_1 = None
self.layer_scale_2 = None
if self.config.use_layer_scale:
self.layer_scale_1 = self.add_weight(
shape=(self.dim),
initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
trainable=True,
name="layer_scale_1",
)
self.layer_scale_2 = self.add_weight(
shape=(self.dim),
initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
trainable=True,
name="layer_scale_2",
)
if self.built:
return
self.built = True
if getattr(self, "token_mixer", None) is not None:
with tf.name_scope(self.token_mixer.name):
self.token_mixer.build(None)
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]:
outputs = self.token_mixer(hidden_states)
if self.config.use_layer_scale:
layer_output = hidden_states + self.drop_path(
tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * outputs,
training=training,
)
layer_output = layer_output + self.drop_path(
tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
* self.mlp(hidden_state=layer_output, training=training),
training=training,
)
else:
layer_output = hidden_states + self.drop_path(outputs, training=training)
layer_output = layer_output + self.drop_path(
self.mlp(hidden_state=layer_output, training=training), training=training
)
return layer_output
class TFEfficientFormerMeta4DLayers(keras.layers.Layer):
def __init__(self, config: EfficientFormerConfig, stage_idx: int, **kwargs):
super().__init__(**kwargs)
num_layers = (
config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
)
drop_paths = [
config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
]
self.blocks = [
TFEfficientFormerMeta4D(
config=config, dim=config.hidden_sizes[stage_idx], drop_path=drop_paths[i], name=f"blocks.{i}"
)
for i in range(len(drop_paths))
]
def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]:
for layer_module in self.blocks:
hidden_states = layer_module(hidden_states=hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "blocks", None) is not None:
for layer in self.blocks:
with tf.name_scope(layer.name):
layer.build(None)
class TFEfficientFormerIntermediateStage(keras.layers.Layer):
def __init__(self, config: EfficientFormerConfig, index: int, **kwargs):
super().__init__(**kwargs)
self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=index, name="meta4D_layers")
def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]:
hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "meta4D_layers", None) is not None:
with tf.name_scope(self.meta4D_layers.name):
self.meta4D_layers.build(None)
class TFEfficientFormerLastStage(keras.layers.Layer):
def __init__(self, config: EfficientFormerConfig, **kwargs):
super().__init__(**kwargs)
self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=-1, name="meta4D_layers")
self.flat = TFEfficientFormerFlat(name="flat")
self.meta3D_layers = TFEfficientFormerMeta3DLayers(config, name="meta3D_layers")
def call(
self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
) -> Tuple[tf.Tensor]:
hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training)
hidden_states = self.flat(hidden_states=hidden_states)
hidden_states = self.meta3D_layers(
hidden_states=hidden_states, output_attentions=output_attentions, training=training
)
return hidden_states
if self.built:
return
self.built = True
if getattr(self, "meta4D_layers", None) is not None:
with tf.name_scope(self.meta4D_layers.name):
self.meta4D_layers.build(None)
if getattr(self, "flat", None) is not None:
with tf.name_scope(self.flat.name):
self.flat.build(None)
if getattr(self, "meta3D_layers", None) is not None:
with tf.name_scope(self.meta3D_layers.name):
self.meta3D_layers.build(None)
class TFEfficientFormerEncoder(keras.layers.Layer):
def __init__(self, config: EfficientFormerConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
num_intermediate_stages = len(config.depths) - 1
downsamples = [
config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1]
for i in range(num_intermediate_stages)
]
intermediate_stages = []
layer_count = -1
for i in range(num_intermediate_stages):
layer_count += 1
intermediate_stages.append(
TFEfficientFormerIntermediateStage(config, i, name=f"intermediate_stages.{layer_count}")
)
if downsamples[i]:
layer_count += 1
intermediate_stages.append(
TFEfficientFormerPatchEmbeddings(
config,
config.hidden_sizes[i],
config.hidden_sizes[i + 1],
name=f"intermediate_stages.{layer_count}",
)
)
self.intermediate_stages = intermediate_stages
self.last_stage = TFEfficientFormerLastStage(config, name="last_stage")
def call(
self,
hidden_states: tf.Tensor,
output_hidden_states: bool,
output_attentions: bool,
return_dict: bool,
training: bool = False,
) -> TFBaseModelOutput:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
for layer_module in self.intermediate_stages:
hidden_states = layer_module(hidden_states, training=training)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_output = self.last_stage(hidden_states, output_attentions=output_attentions, training=training)
if output_attentions:
all_self_attentions = all_self_attentions + layer_output[1:]
if output_hidden_states:
all_hidden_states = all_hidden_states + (layer_output[0],)
if not return_dict:
return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=layer_output[0],
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "last_stage", None) is not None:
with tf.name_scope(self.last_stage.name):
self.last_stage.build(None)
for layer in self.intermediate_stages:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFEfficientFormerMainLayer(keras.layers.Layer):
config_class = EfficientFormerConfig
def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
super().__init__(**kwargs)
self.config = config
self.patch_embed = TFEfficientFormerConvStem(config, config.hidden_sizes[0], name="patch_embed")
self.encoder = TFEfficientFormerEncoder(config, name="encoder")
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
@unpack_inputs
def call(
self,
pixel_values: Optional[tf.Tensor] = None,
output_attentions: Optional[tf.Tensor] = None,
output_hidden_states: Optional[tf.Tensor] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor, ...]]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
embedding_output = self.patch_embed(pixel_values, training=training)
encoder_outputs = self.encoder(
hidden_states=embedding_output,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output, training=training)
if output_hidden_states:
hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1][:-1]]) + (
encoder_outputs[1][-1],
)
if not return_dict:
head_outputs = (sequence_output,)
return head_outputs + encoder_outputs[1:]
return TFBaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "patch_embed", None) is not None:
with tf.name_scope(self.patch_embed.name):
self.patch_embed.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.hidden_sizes[-1]])
@add_start_docstrings(
"The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
EFFICIENTFORMER_START_DOCSTRING,
)
class TFEfficientFormerModel(TFEfficientFormerPreTrainedModel):
def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
super().__init__(config, **kwargs)
self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def call(
self,
pixel_values: Optional[tf.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
"""
Call function to forward pass through EfficientFormer model.
Args:
pixel_values ((`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using `AutoImageProcessor`. See
`EfficientFormerImageProcessor.__call__` for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers.
See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers.
See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a `ModelOutput` instead of a plain tuple.
training (`bool`, *optional*):
Whether the model is in training mode or evaluation mode.
Returns:
Either a `TFBaseModelOutputWithPooling` or a tuple containing a `tf.Tensor`.
"""
return self.efficientformer(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
**kwargs,
)
) -> Union[Tuple, TFBaseModelOutput]:
outputs = self.efficientformer(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "efficientformer", None) is not None:
with tf.name_scope(self.efficientformer.name):
self.efficientformer.build(None)
@add_start_docstrings(
"""
EfficientFormer Model transformer with an image classification head on top of pooled last hidden state, e.g. for
ImageNet.
""",
EFFICIENTFORMER_START_DOCSTRING,
)
class TFEfficientFormerForImageClassification(TFEfficientFormerPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: EfficientFormerConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")
self.classifier = (
keras.layers.Dense(config.num_labels, name="classifier")
if config.num_labels > 0
else keras.layers.Activation("linear", name="classifier")
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def call(
self,
pixel_values: Optional[tf.Tensor] = None,
labels: Optional[tf.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[tf.Tensor, TFImageClassifierOutput]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.efficientformer(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2))
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFImageClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "efficientformer", None) is not None:
with tf.name_scope(self.efficientformer.name):
self.efficientformer.build(None)
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
@dataclass
class TFEfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
"""
Args:
Output type of [`EfficientFormerForImageClassificationWithTeacher`].
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
预测分数,作为 cls_logits 和 distillation_logits 的平均值。
cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
分类头部的预测分数(即最终类令牌的隐藏状态上的线性层)。
distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
蒸馏头部的预测分数(即蒸馏令牌的隐藏状态上的线性层)。
hidden_states (`tuple(tf.Tensor)`, *optional*, 当 `output_hidden_states=True` 时返回或当
`config.output_hidden_states=True` 时返回):
`tf.Tensor` 元组(一个用于嵌入的输出 + 每层输出的一个),形状为 `(batch_size, sequence_length, hidden_size)`。
模型在每一层输出的隐藏状态加上初始嵌入输出。
attentions (`tuple(tf.Tensor)`, *optional*, 当 `output_attentions=True` 时返回或当
`config.output_attentions=True` 时返回):
`tf.Tensor` 元组(每层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
"""
logits: tf.Tensor = None
cls_logits: tf.Tensor = None
distillation_logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@add_start_docstrings(
"""
EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden
state and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.
.. warning::
This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
supported.
""",
EFFICIENTFORMER_START_DOCSTRING,
)
class TFEfficientFormerForImageClassificationWithTeacher(TFEfficientFormerPreTrainedModel):
def __init__(self, config: EfficientFormerConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")
self.classifier = (
keras.layers.Dense(config.num_labels, name="classifier")
if config.num_labels > 0
else keras.layers.Activation("linear", name="classifier")
)
self.distillation_classifier = (
keras.layers.Dense(config.num_labels, name="distillation_classifier")
if config.num_labels > 0
else keras.layers.Activation("linear", name="distillation_classifier")
)
@unpack_inputs
@add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFEfficientFormerForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def call(
self,
pixel_values: Optional[tf.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[tuple, TFEfficientFormerForImageClassificationWithTeacherOutput]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if training:
raise Exception(
"This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet supported."
)
outputs = self.efficientformer(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
cls_logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2))
distillation_logits = self.distillation_classifier(tf.reduce_mean(sequence_output, axis=-2))
logits = (cls_logits + distillation_logits) / 2
if not return_dict:
output = (logits, cls_logits, distillation_logits) + outputs[1:]
return output
return TFEfficientFormerForImageClassificationWithTeacherOutput(
logits=logits,
cls_logits=cls_logits,
distillation_logits=distillation_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "efficientformer", None) is not None:
with tf.name_scope(self.efficientformer.name):
self.efficientformer.build(None)
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_sizes[-1]])
if getattr(self, "distillation_classifier", None) is not None:
if hasattr(self.distillation_classifier, "name"):
with tf.name_scope(self.distillation_classifier.name):
self.distillation_classifier.build([None, None, self.config.hidden_sizes[-1]])
.\models\efficientformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_efficientformer": [
"EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"EfficientFormerConfig",
]
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_efficientformer"] = ["EfficientFormerImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_efficientformer"] = [
"EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"EfficientFormerForImageClassification",
"EfficientFormerForImageClassificationWithTeacher",
"EfficientFormerModel",
"EfficientFormerPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_efficientformer"] = [
"TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFEfficientFormerForImageClassification",
"TFEfficientFormerForImageClassificationWithTeacher",
"TFEfficientFormerModel",
"TFEfficientFormerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_efficientformer import EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, EfficientFormerConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_efficientformer import EfficientFormerImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_efficientformer import (
EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
EfficientFormerForImageClassification,
EfficientFormerForImageClassificationWithTeacher,
EfficientFormerModel,
EfficientFormerPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_efficientformer import (
TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TFEfficientFormerForImageClassification,
TFEfficientFormerForImageClassificationWithTeacher,
TFEfficientFormerModel,
TFEfficientFormerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\efficientnet\configuration_efficientnet.py
""" EfficientNet model configuration"""
from collections import OrderedDict
from typing import List, Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/efficientnet-b7": "https://huggingface.co/google/efficientnet-b7/resolve/main/config.json",
}
class EfficientNetConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`EfficientNetModel`]. It is used to instantiate an
EfficientNet model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the EfficientNet
[google/efficientnet-b7](https://huggingface.co/google/efficientnet-b7) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import EfficientNetConfig, EfficientNetModel
>>> # Initializing a EfficientNet efficientnet-b7 style configuration
>>> configuration = EfficientNetConfig()
>>> # Initializing a model (with random weights) from the efficientnet-b7 style configuration
>>> model = EfficientNetModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "efficientnet"
def __init__(
self,
num_channels: int = 3,
image_size: int = 600,
width_coefficient: float = 2.0,
depth_coefficient: float = 3.1,
depth_divisor: int = 8,
kernel_sizes: List[int] = [3, 3, 5, 3, 5, 5, 3],
in_channels: List[int] = [32, 16, 24, 40, 80, 112, 192],
out_channels: List[int] = [16, 24, 40, 80, 112, 192, 320],
depthwise_padding: List[int] = [],
strides: List[int] = [1, 2, 2, 2, 1, 2, 1],
num_block_repeats: List[int] = [1, 2, 2, 3, 3, 4, 1],
expand_ratios: List[int] = [1, 6, 6, 6, 6, 6, 6],
squeeze_expansion_ratio: float = 0.25,
hidden_act: str = "swish",
hidden_dim: int = 2560,
pooling_type: str = "mean",
initializer_range: float = 0.02,
batch_norm_eps: float = 0.001,
batch_norm_momentum: float = 0.99,
dropout_rate: float = 0.5,
drop_connect_rate: float = 0.2,
**kwargs,
):
super().__init__(**kwargs)
self.num_channels = num_channels
self.image_size = image_size
self.width_coefficient = width_coefficient
self.depth_coefficient = depth_coefficient
self.depth_divisor = depth_divisor
self.kernel_sizes = kernel_sizes
self.in_channels = in_channels
self.out_channels = out_channels
self.depthwise_padding = depthwise_padding
self.strides = strides
self.num_block_repeats = num_block_repeats
self.expand_ratios = expand_ratios
self.squeeze_expansion_ratio = squeeze_expansion_ratio
self.hidden_act = hidden_act
self.hidden_dim = hidden_dim
self.pooling_type = pooling_type
self.initializer_range = initializer_range
self.batch_norm_eps = batch_norm_eps
self.batch_norm_momentum = batch_norm_momentum
self.dropout_rate = dropout_rate
self.drop_connect_rate = drop_connect_rate
self.num_hidden_layers = sum(num_block_repeats) * 4
class EfficientNetOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-5
.\models\efficientnet\convert_efficientnet_to_pytorch.py
"""Convert EfficientNet checkpoints from the original repository.
URL: https://github.com/keras-team/keras/blob/v2.11.0/keras/applications/efficientnet.py"""
import argparse
import json
import os
import numpy as np
import PIL
import requests
import tensorflow.keras.applications.efficientnet as efficientnet
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from tensorflow.keras.preprocessing import image
from transformers import (
EfficientNetConfig,
EfficientNetForImageClassification,
EfficientNetImageProcessor,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
model_classes = {
"b0": efficientnet.EfficientNetB0,
"b1": efficientnet.EfficientNetB1,
"b2": efficientnet.EfficientNetB2,
"b3": efficientnet.EfficientNetB3,
"b4": efficientnet.EfficientNetB4,
"b5": efficientnet.EfficientNetB5,
"b6": efficientnet.EfficientNetB6,
"b7": efficientnet.EfficientNetB7,
}
CONFIG_MAP = {
"b0": {
"hidden_dim": 1280,
"width_coef": 1.0,
"depth_coef": 1.0,
"image_size": 224,
"dropout_rate": 0.2,
"dw_padding": [],
},
"b1": {
"hidden_dim": 1280,
"width_coef": 1.0,
"depth_coef": 1.1,
"image_size": 240,
"dropout_rate": 0.2,
"dw_padding": [16],
},
"b2": {
"hidden_dim": 1408,
"width_coef": 1.1,
"depth_coef": 1.2,
"image_size": 260,
"dropout_rate": 0.3,
"dw_padding": [5, 8, 16],
},
"b3": {
"hidden_dim": 1536,
"width_coef": 1.2,
"depth_coef": 1.4,
"image_size": 300,
"dropout_rate": 0.3,
"dw_padding": [5, 18],
},
"b4": {
"hidden_dim": 1792,
"width_coef": 1.4,
"depth_coef": 1.8,
"image_size": 380,
"dropout_rate": 0.4,
"dw_padding": [6],
},
"b5": {
"hidden_dim": 2048,
"width_coef": 1.6,
"depth_coef": 2.2,
"image_size": 456,
"dropout_rate": 0.4,
"dw_padding": [13, 27],
},
"b6": {
"hidden_dim": 2304,
"width_coef": 1.8,
"depth_coef": 2.6,
"image_size": 528,
"dropout_rate": 0.5,
"dw_padding": [31],
},
"b7": {
"hidden_dim": 2560,
"width_coef": 2.0,
"depth_coef": 3.1,
"image_size": 600,
"dropout_rate": 0.5,
"dw_padding": [18],
},
}
def get_efficientnet_config(model_name):
config = EfficientNetConfig()
config.hidden_dim = CONFIG_MAP[model_name]["hidden_dim"]
config.width_coefficient = CONFIG_MAP[model_name]["width_coef"]
config.depth_coefficient = CONFIG_MAP[model_name]["depth_coef"]
config.image_size = CONFIG_MAP[model_name]["image_size"]
config.dropout_rate = CONFIG_MAP[model_name]["dropout_rate"]
config.depthwise_padding = CONFIG_MAP[model_name]["dw_padding"]
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
def convert_image_processor(model_name):
size = CONFIG_MAP[model_name]["image_size"]
preprocessor = EfficientNetImageProcessor(
size={"height": size, "width": size},
image_mean=[0.485, 0.456, 0.406],
image_std=[0.47853944, 0.4732864, 0.47434163],
do_center_crop=False,
)
return preprocessor
def rename_keys(original_param_names):
block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")]
block_names = sorted(set(block_names))
num_blocks = len(block_names)
block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))}
rename_keys = []
rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight"))
rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight"))
rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias"))
rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean"))
rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var"))
for b in block_names:
hf_b = block_name_mapping[b]
rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight"))
rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight"))
rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias"))
rename_keys.append((f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean"))
rename_keys.append((f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var"))
rename_keys.append((f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight"))
rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight"))
rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias"))
rename_keys.append((f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean"))
rename_keys.append((f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var"))
rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight"))
rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias"))
rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight"))
rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias"))
rename_keys.append((f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight"))
rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight"))
rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias"))
rename_keys.append((f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean"))
rename_keys.append((f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var"))
rename_keys.append(("top_conv/kernel:0", "encoder.top_conv.weight"))
rename_keys.append(("top_bn/gamma:0", "encoder.top_bn.weight"))
rename_keys.append(("top_bn/beta:0", "encoder.top_bn.bias"))
rename_keys.append(("top_bn/moving_mean:0", "encoder.top_bn.running_mean"))
rename_keys.append(("top_bn/moving_variance:0", "encoder.top_bn.running_var"))
key_mapping = {}
for item in rename_keys:
if item[0] in original_param_names:
key_mapping[item[0]] = "efficientnet." + item[1]
key_mapping["predictions/kernel:0"] = "classifier.weight"
key_mapping["predictions/bias:0"] = "classifier.bias"
return key_mapping
def replace_params(hf_params, tf_params, key_mapping):
for key, value in tf_params.items():
if "normalization" in key:
continue
hf_key = key_mapping[key]
if "_conv" in key and "kernel" in key:
new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1)
elif "depthwise_kernel" in key:
new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1)
elif "kernel" in key:
new_hf_value = torch.from_numpy(np.transpose(value))
else:
new_hf_value = torch.from_numpy(value)
assert hf_params[hf_key].shape == new_hf_value.shape
hf_params[hf_key].copy_(new_hf_value)
@torch.no_grad()
def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_model, push_to_hub):
"""
Copy/paste/tweak model's weights to our EfficientNet structure.
"""
original_model = model_classes[model_name](
include_top=True,
weights="imagenet",
input_tensor=None,
input_shape=None,
pooling=None,
classes=1000,
classifier_activation="softmax",
)
tf_params = original_model.trainable_variables
tf_non_train_params = original_model.non_trainable_variables
tf_params = {param.name: param.numpy() for param in tf_params}
for param in tf_non_train_params:
tf_params[param.name] = param.numpy()
tf_param_names = list(tf_params.keys())
config = get_efficientnet_config(model_name)
hf_model = EfficientNetForImageClassification(config).eval()
hf_params = hf_model.state_dict()
print("Converting parameters...")
key_mapping = rename_keys(tf_param_names)
replace_params(hf_params, tf_params, key_mapping)
preprocessor = convert_image_processor(model_name)
inputs = preprocessor(images=prepare_img(), return_tensors="pt")
hf_model.eval()
with torch.no_grad():
outputs = hf_model(**inputs)
hf_logits = outputs.logits.detach().numpy()
original_model.trainable = False
image_size = CONFIG_MAP[model_name]["image_size"]
img = prepare_img().resize((image_size, image_size), resample=PIL.Image.NEAREST)
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
original_logits = original_model.predict(x)
assert np.allclose(original_logits, hf_logits, atol=1e-3), "The predicted logits are not the same."
print("Model outputs match!")
if save_model:
if not os.path.isdir(pytorch_dump_folder_path):
os.mkdir(pytorch_dump_folder_path)
hf_model.save_pretrained(pytorch_dump_folder_path)
preprocessor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing converted {model_name} to the hub...")
model_name = f"efficientnet-{model_name}"
preprocessor.push_to_hub(model_name)
hf_model.push_to_hub(model_name)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="b0",
type=str,
help="Version name of the EfficientNet model you want to convert, select from [b0, b1, b2, b3, b4, b5, b6, b7].",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default="hf_model",
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument("--save_model", action="store_true", help="Save model to local")
parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
args = parser.parse_args()
convert_efficientnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub)
.\models\efficientnet\image_processing_efficientnet.py
"""
Image processor class for EfficientNet.
"""
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import rescale, resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
class EfficientNetImageProcessor(BaseImageProcessor):
r"""
Constructs a EfficientNet image processor.
This class inherits from BaseImageProcessor and is specialized for EfficientNet models.
It provides methods for preprocessing images before feeding them into an EfficientNet model.
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in `preprocess`.
size (`Dict[str, int]` *optional*, defaults to `{"height": 346, "width": 346}`):
Size of the image after `resize`. Can be overridden by `size` in `preprocess`.
resample (`PILImageResampling` filter, *optional*, defaults to 0):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_center_crop (`bool`, *optional*, defaults to `False`):
Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 289, "width": 289}`):
Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
rescale_offset (`bool`, *optional*, defaults to `False`):
Whether to rescale the image between [-scale_range, scale_range] instead of [0, scale_range]. Can be
overridden by the `rescale_factor` parameter in the `preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
include_top (`bool`, *optional*, defaults to `True`):
Whether to rescale the image again. Should be set to True if the inputs are used for image classification.
"""
# 定义模型输入名称列表,只包含一个元素:"pixel_values"
model_input_names = ["pixel_values"]
# 初始化函数,设置图像处理器的各项参数
def __init__(
self,
do_resize: bool = True, # 是否进行大小调整,默认为True
size: Dict[str, int] = None, # 图像大小的字典,键为"height"和"width"
resample: PILImageResampling = PIL.Image.NEAREST, # 图像调整大小时的重采样方法,默认为最近邻插值
do_center_crop: bool = False, # 是否进行中心裁剪,默认为False
crop_size: Dict[str, int] = None, # 裁剪大小的字典,键为"height"和"width"
rescale_factor: Union[int, float] = 1 / 255, # 图像缩放因子,默认为1/255
rescale_offset: bool = False, # 是否进行缩放偏移,默认为False
do_rescale: bool = True, # 是否进行缩放,默认为True
do_normalize: bool = True, # 是否进行归一化,默认为True
image_mean: Optional[Union[float, List[float]]] = None, # 图像均值,可以是浮点数或浮点数列表
image_std: Optional[Union[float, List[float]]] = None, # 图像标准差,可以是浮点数或浮点数列表
include_top: bool = True, # 是否包含顶部处理,默认为True
**kwargs, # 其他关键字参数
) -> None:
# 调用父类的初始化方法
super().__init__(**kwargs)
# 如果未提供图像大小,则使用默认大小346x346
size = size if size is not None else {"height": 346, "width": 346}
# 根据给定的大小参数获取有效的尺寸字典
size = get_size_dict(size)
# 如果未提供裁剪大小,则使用默认大小289x289
crop_size = crop_size if crop_size is not None else {"height": 289, "width": 289}
# 根据给定的裁剪大小参数获取有效的裁剪尺寸字典
crop_size = get_size_dict(crop_size, param_name="crop_size")
# 初始化对象的各个属性
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.rescale_offset = rescale_offset
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN # 如果未提供图像均值,则使用预设的ImageNet标准均值
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD # 如果未提供图像标准差,则使用预设的ImageNet标准标准差
self.include_top = include_top
# 初始化有效的处理器键列表
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"rescale_offset",
"do_normalize",
"image_mean",
"image_std",
"include_top",
"return_tensors",
"data_format",
"input_data_format",
]
# 从transformers.models.vit.image_processing_vit.ViTImageProcessor.resize复制,将PILImageResampling.BILINEAR更改为PILImageResampling.NEAREST
def resize(
self,
image: np.ndarray, # 输入的图像数据,numpy数组格式
size: Dict[str, int], # 调整后的图像大小字典,键为"height"和"width"
resample: PILImageResampling = PILImageResampling.NEAREST, # 图像调整大小时的重采样方法,默认为最近邻插值
data_format: Optional[Union[str, ChannelDimension]] = None, # 图像数据格式,可以是字符串或通道维度
input_data_format: Optional[Union[str, ChannelDimension]] = None, # 输入图像的数据格式,可以是字符串或通道维度
**kwargs, # 其他关键字参数
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: Optional[PILImageResampling] = PILImageResampling.NEAREST,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.NEAREST`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.NEAREST`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
# 获取调整后的尺寸,确保 `size` 字典包含 `height` 和 `width` 键
size = get_size_dict(size)
if "height" not in size or "width" not in size:
# 如果 `size` 字典不包含 `height` 或 `width` 键,则抛出 ValueError 异常
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
# 计算输出的图像尺寸
output_size = (size["height"], size["width"])
# 调用 resize 函数对图像进行调整大小,并返回调整后的图像
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
):
"""
Rescale an image by a scale factor.
If `offset` is `True`, the image has its values rescaled by `scale` and then offset by 1. If `scale` is
1/127.5, the image is rescaled between [-1, 1].
image = image * scale - 1
If `offset` is `False`, and `scale` is 1/255, the image is rescaled between [0, 1].
image = image * scale
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
offset (`bool`, *optional*):
Whether to scale the image in both negative and positive directions.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# 调用外部函数 `rescale` 对图像进行缩放
rescaled_image = rescale(
image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs
)
# 如果需要进行偏移处理
if offset:
# 将图像数值做偏移处理
rescaled_image = rescaled_image - 1
# 返回经过缩放和可能的偏移处理后的图像
return rescaled_image
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample=None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
rescale_offset: bool = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
include_top: bool = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\efficientnet\modeling_efficientnet.py
""" PyTorch EfficientNet model."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithNoAttention,
BaseModelOutputWithPoolingAndNoAttention,
ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_efficientnet import EfficientNetConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "EfficientNetConfig"
_CHECKPOINT_FOR_DOC = "google/efficientnet-b7"
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "google/efficientnet-b7"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/efficientnet-b7",
]
EFFICIENTNET_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`EfficientNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
EFFICIENTNET_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.EfficientNetTokenizer`.
See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
for more details.
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, optional):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not** masked,
- 0 for tokens that are **masked**.
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, optional):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, optional):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, optional):
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
returned tensors for more detail.
output_hidden_states (:obj:`bool`, optional):
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
for more detail.
return_dict (:obj:`bool`, optional):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
When set to ``True``, the output will be a :class:`~transformers.file_utils.ModelOutput` object.
Returns:
:class:`~transformers.file_utils.ModelOutput` or tuple:
Example of output for a model with 12 hidden layers and a vocabulary size of 30522.
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(torch.FloatTensor)`, optional, returned when ``output_hidden_states=True`` is passed or when ``return_dict=True`` is passed):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, optional, returned when ``output_attentions=True`` is passed or when ``return_dict=True`` is passed):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
from transformers import EfficientNetTokenizer, EfficientNetModel
import torch
tokenizer = EfficientNetTokenizer.from_pretrained('efficientnet')
model = EfficientNetModel.from_pretrained('efficientnet')
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
def round_filters(config: EfficientNetConfig, num_channels: int):
divisor = config.depth_divisor
num_channels *= config.width_coefficient
new_dim = max(divisor, int(num_channels + divisor / 2) // divisor * divisor)
if new_dim < 0.9 * num_channels:
new_dim += divisor
return int(new_dim)
def correct_pad(kernel_size: Union[int, Tuple], adjust: bool = True):
if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
correct = (kernel_size[0] // 2, kernel_size[1] // 2)
if adjust:
return (correct[1] - 1, correct[1], correct[0] - 1, correct[0])
else:
return (correct[1], correct[1], correct[0], correct[0])
class EfficientNetEmbeddings(nn.Module):
r"""
EfficientNet 的嵌入模块,对应原始工作中的 stem 模块。
"""
def __init__(self, config: EfficientNetConfig):
super().__init__()
self.out_dim = round_filters(config, 32)
self.padding = nn.ZeroPad2d(padding=(0, 1, 0, 1))
self.convolution = nn.Conv2d(
config.num_channels, self.out_dim, kernel_size=3, stride=2, padding="valid", bias=False
)
self.batchnorm = nn.BatchNorm2d(self.out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum)
self.activation = ACT2FN[config.hidden_act]
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
features = self.padding(pixel_values)
features = self.convolution(features)
features = self.batchnorm(features)
features = self.activation(features)
return features
class EfficientNetDepthwiseConv2d(nn.Conv2d):
def __init__(
self,
in_channels,
depth_multiplier=1,
kernel_size=3,
stride=1,
padding=0,
dilation=1,
bias=True,
padding_mode="zeros",
):
out_channels = in_channels * depth_multiplier
super().__init__(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=in_channels,
bias=bias,
padding_mode=padding_mode,
)
class EfficientNetExpansionLayer(nn.Module):
r"""
这个类对应原始实现中每个块的扩展阶段。
"""
def __init__(self, config: EfficientNetConfig, in_dim: int, out_dim: int, stride: int):
super().__init__()
self.expand_conv = nn.Conv2d(
in_channels=in_dim,
out_channels=out_dim,
kernel_size=1,
padding="same",
bias=False,
)
self.expand_bn = nn.BatchNorm2d(num_features=out_dim, eps=config.batch_norm_eps)
self.expand_act = ACT2FN[config.hidden_act]
def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
hidden_states = self.expand_conv(hidden_states)
hidden_states = self.expand_bn(hidden_states)
hidden_states = self.expand_act(hidden_states)
return hidden_states
class EfficientNetDepthwiseLayer(nn.Module):
r"""
This corresponds to the depthwise convolution phase of each block in the original implementation.
"""
def __init__(
self,
config: EfficientNetConfig,
in_dim: int,
stride: int,
kernel_size: int,
adjust_padding: bool,
):
super().__init__()
self.stride = stride
conv_pad = "valid" if self.stride == 2 else "same"
padding = correct_pad(kernel_size, adjust=adjust_padding)
self.depthwise_conv_pad = nn.ZeroPad2d(padding=padding)
self.depthwise_conv = EfficientNetDepthwiseConv2d(
in_dim, kernel_size=kernel_size, stride=stride, padding=conv_pad, bias=False
)
self.depthwise_norm = nn.BatchNorm2d(
num_features=in_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
)
self.depthwise_act = ACT2FN[config.hidden_act]
def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
if self.stride == 2:
hidden_states = self.depthwise_conv_pad(hidden_states)
hidden_states = self.depthwise_conv(hidden_states)
hidden_states = self.depthwise_norm(hidden_states)
hidden_states = self.depthwise_act(hidden_states)
return hidden_states
class EfficientNetSqueezeExciteLayer(nn.Module):
r"""
This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
"""
def __init__(self, config: EfficientNetConfig, in_dim: int, expand_dim: int, expand: bool = False):
super().__init__()
self.dim = expand_dim if expand else in_dim
self.dim_se = max(1, int(in_dim * config.squeeze_expansion_ratio))
self.squeeze = nn.AdaptiveAvgPool2d(output_size=1)
self.reduce = nn.Conv2d(
in_channels=self.dim,
out_channels=self.dim_se,
kernel_size=1,
padding="same",
)
self.expand = nn.Conv2d(
in_channels=self.dim_se,
out_channels=self.dim,
kernel_size=1,
padding="same",
)
self.act_reduce = ACT2FN[config.hidden_act]
self.act_expand = nn.Sigmoid()
def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
inputs = hidden_states
hidden_states = self.squeeze(hidden_states)
hidden_states = self.reduce(hidden_states)
hidden_states = self.act_reduce(hidden_states)
hidden_states = self.expand(hidden_states)
hidden_states = self.act_expand(hidden_states)
hidden_states = torch.mul(inputs, hidden_states)
return hidden_states
class EfficientNetFinalBlockLayer(nn.Module):
r"""
This corresponds to the final phase of each block in the original implementation.
"""
def __init__(
self, config: EfficientNetConfig, in_dim: int, out_dim: int, stride: int, drop_rate: float, id_skip: bool
):
super().__init__()
self.apply_dropout = stride == 1 and not id_skip
self.project_conv = nn.Conv2d(
in_channels=in_dim,
out_channels=out_dim,
kernel_size=1,
padding="same",
bias=False,
)
self.project_bn = nn.BatchNorm2d(
num_features=out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
)
self.dropout = nn.Dropout(p=drop_rate)
def forward(self, embeddings: torch.FloatTensor, hidden_states: torch.FloatTensor) -> torch.Tensor:
hidden_states = self.project_conv(hidden_states)
hidden_states = self.project_bn(hidden_states)
if self.apply_dropout:
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + embeddings
return hidden_states
class EfficientNetBlock(nn.Module):
r"""
This corresponds to the expansion and depthwise convolution phase of each block in the original implementation.
Args:
config ([`EfficientNetConfig`]):
Model configuration class.
in_dim (`int`):
Number of input channels.
out_dim (`int`):
Number of output channels.
stride (`int`):
Stride size to be used in convolution layers.
expand_ratio (`int`):
Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
kernel_size (`int`):
Kernel size for the depthwise convolution layer.
drop_rate (`float`):
Dropout rate to be used in the final phase of each block.
id_skip (`bool`):
Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
of each block. Set to `True` for the first block of each stage.
adjust_padding (`bool`):
Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
operation, set to `True` for inputs with odd input sizes.
"""
def __init__(
self,
config: EfficientNetConfig,
in_dim: int,
out_dim: int,
stride: int,
expand_ratio: int,
kernel_size: int,
drop_rate: float,
id_skip: bool,
adjust_padding: bool,
):
super().__init__()
self.expand_ratio = expand_ratio
self.expand = True if self.expand_ratio != 1 else False
expand_in_dim = in_dim * expand_ratio
if self.expand:
self.expansion = EfficientNetExpansionLayer(
config=config, in_dim=in_dim, out_dim=expand_in_dim, stride=stride
)
self.depthwise_conv = EfficientNetDepthwiseLayer(
config=config,
in_dim=expand_in_dim if self.expand else in_dim,
stride=stride,
kernel_size=kernel_size,
adjust_padding=adjust_padding,
)
self.squeeze_excite = EfficientNetSqueezeExciteLayer(
config=config, in_dim=in_dim, expand_dim=expand_in_dim, expand=self.expand
)
self.projection = EfficientNetFinalBlockLayer(
config=config,
in_dim=expand_in_dim if self.expand else in_dim,
out_dim=out_dim,
stride=stride,
drop_rate=drop_rate,
id_skip=id_skip,
)
def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
embeddings = hidden_states
if self.expand_ratio != 1:
hidden_states = self.expansion(hidden_states)
hidden_states = self.depthwise_conv(hidden_states)
hidden_states = self.squeeze_excite(hidden_states)
hidden_states = self.projection(embeddings, hidden_states)
return hidden_states
r"""
Forward propogates the embeddings through each EfficientNet block.
Args:
config ([`EfficientNetConfig`]):
Model configuration class.
"""
def __init__(self, config: EfficientNetConfig):
super().__init__()
self.config = config
self.depth_coefficient = config.depth_coefficient
def round_repeats(repeats):
return int(math.ceil(self.depth_coefficient * repeats))
num_base_blocks = len(config.in_channels)
num_blocks = sum(round_repeats(n) for n in config.num_block_repeats)
curr_block_num = 0
blocks = []
for i in range(num_base_blocks):
in_dim = round_filters(config, config.in_channels[i])
out_dim = round_filters(config, config.out_channels[i])
stride = config.strides[i]
kernel_size = config.kernel_sizes[i]
expand_ratio = config.expand_ratios[i]
for j in range(round_repeats(config.num_block_repeats[i])):
id_skip = True if j == 0 else False
stride = 1 if j > 0 else stride
in_dim = out_dim if j > 0 else in_dim
adjust_padding = False if curr_block_num in config.depthwise_padding else True
drop_rate = config.drop_connect_rate * curr_block_num / num_blocks
block = EfficientNetBlock(
config=config,
in_dim=in_dim,
out_dim=out_dim,
stride=stride,
kernel_size=kernel_size,
expand_ratio=expand_ratio,
drop_rate=drop_rate,
id_skip=id_skip,
adjust_padding=adjust_padding,
)
blocks.append(block)
curr_block_num += 1
self.blocks = nn.ModuleList(blocks)
self.top_conv = nn.Conv2d(
in_channels=out_dim,
out_channels=round_filters(config, 1280),
kernel_size=1,
padding="same",
bias=False,
)
self.top_bn = nn.BatchNorm2d(
num_features=config.hidden_dim,
eps=config.batch_norm_eps,
momentum=config.batch_norm_momentum
)
self.top_activation = ACT2FN[config.hidden_act]
def forward(
self,
hidden_states: torch.FloatTensor,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> BaseModelOutputWithNoAttention:
all_hidden_states = (hidden_states,) if output_hidden_states else None
for block in self.blocks:
hidden_states = block(hidden_states)
if output_hidden_states:
all_hidden_states += (hidden_states,)
hidden_states = self.top_conv(hidden_states)
hidden_states = self.top_bn(hidden_states)
hidden_states = self.top_activation(hidden_states)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
return BaseModelOutputWithNoAttention(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
)
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = EfficientNetConfig
base_model_prefix = "efficientnet"
main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@add_start_docstrings(
"The bare EfficientNet model outputting raw features without any specific head on top.",
EFFICIENTNET_START_DOCSTRING,
)
class EfficientNetModel(EfficientNetPreTrainedModel):
def __init__(self, config: EfficientNetConfig):
super().__init__(config)
self.config = config
self.embeddings = EfficientNetEmbeddings(config)
self.encoder = EfficientNetEncoder(config)
if config.pooling_type == "mean":
self.pooler = nn.AvgPool2d(config.hidden_dim, ceil_mode=True)
elif config.pooling_type == "max":
self.pooler = nn.MaxPool2d(config.hidden_dim, ceil_mode=True)
else:
raise ValueError(f"config.pooling must be one of ['mean', 'max'] got {config.pooling}")
self.post_init()
@add_start_docstrings_to_model_forward(EFFICIENTNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: torch.FloatTensor = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
embedding_output,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = self.pooler(last_hidden_state)
pooled_output = pooled_output.reshape(pooled_output.shape[:2])
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
"""
EfficientNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g.
for ImageNet.
"""
class EfficientNetForImageClassification(EfficientNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.efficientnet = EfficientNetModel(config)
self.dropout = nn.Dropout(p=config.dropout_rate)
self.classifier = nn.Linear(config.hidden_dim, self.num_labels) if self.num_labels > 0 else nn.Identity()
self.post_init()
@add_start_docstrings_to_model_forward(EFFICIENTNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: torch.FloatTensor = None,
labels: Optional[torch.LongTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.efficientnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
)
.\models\efficientnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_efficientnet": [
"EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
"EfficientNetConfig",
"EfficientNetOnnxConfig",
]
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_efficientnet"] = ["EfficientNetImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_efficientnet"] = [
"EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"EfficientNetForImageClassification",
"EfficientNetModel",
"EfficientNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_efficientnet import (
EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
EfficientNetConfig,
EfficientNetOnnxConfig,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_efficientnet import EfficientNetImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_efficientnet import (
EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST,
EfficientNetForImageClassification,
EfficientNetModel,
EfficientNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\electra\configuration_electra.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/config.json",
"google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/config.json",
"google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/config.json",
"google/electra-small-discriminator": (
"https://huggingface.co/google/electra-small-discriminator/resolve/main/config.json"
),
"google/electra-base-discriminator": (
"https://huggingface.co/google/electra-base-discriminator/resolve/main/config.json"
),
"google/electra-large-discriminator": (
"https://huggingface.co/google/electra-large-discriminator/resolve/main/config.json"
),
}
class ElectraConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ElectraModel`] or a [`TFElectraModel`]. It is
used to instantiate a ELECTRA model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the ELECTRA
[google/electra-small-discriminator](https://huggingface.co/google/electra-small-discriminator) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import ElectraConfig, ElectraModel
>>> # Initializing a ELECTRA electra-base-uncased style configuration
>>> configuration = ElectraConfig()
>>> # Initializing a model (with random weights) from the electra-base-uncased style configuration
>>> model = ElectraModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "electra"
def __init__(
self,
vocab_size=30522,
embedding_size=128,
hidden_size=256,
num_hidden_layers=12,
num_attention_heads=4,
intermediate_size=1024,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
summary_type="first",
summary_use_proj=True,
summary_activation="gelu",
summary_last_dropout=0.1,
pad_token_id=0,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_last_dropout = summary_last_dropout
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
class ElectraOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
("token_type_ids", dynamic_axis),
]
)
.\models\electra\convert_electra_original_tf_checkpoint_to_pytorch.py
"""Convert ELECTRA checkpoint."""
import argparse
import torch
from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra
from transformers.utils import logging
logging.set_verbosity_info()
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):
config = ElectraConfig.from_json_file(config_file)
print(f"Building PyTorch model from configuration: {config}")
if discriminator_or_generator == "discriminator":
model = ElectraForPreTraining(config)
elif discriminator_or_generator == "generator":
model = ElectraForMaskedLM(config)
else:
raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'")
load_tf_weights_in_electra(
model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator
)
print(f"Save PyTorch model to {pytorch_dump_path}")
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--discriminator_or_generator",
default=None,
type=str,
required=True,
help=(
"Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or "
"'generator'."
),
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(
args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator
)
.\models\electra\modeling_electra.py
"""PyTorch ELECTRA model."""
import math
import os
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, get_activation
from ...modeling_outputs import (
BaseModelOutputWithCrossAttentions,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_electra import ElectraConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "google/electra-small-discriminator"
_CONFIG_FOR_DOC = "ElectraConfig"
ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator",
"google/electra-base-generator",
"google/electra-large-generator",
"google/electra-small-discriminator",
"google/electra-base-discriminator",
"google/electra-large-discriminator",
]
def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"):
"""Load tf checkpoints in a pytorch model."""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
original_name: str = name
try:
if isinstance(model, ElectraForMaskedLM):
name = name.replace("electra/embeddings/", "generator/embeddings/")
if discriminator_or_generator == "generator":
name = name.replace("electra/", "discriminator/")
name = name.replace("generator/", "electra/")
name = name.replace("dense_1", "dense_prediction")
name = name.replace("generator_predictions/output_bias", "generator_lm_head/bias")
name = name.split("/")
if any(n in ["global_step", "temperature"] for n in name):
logger.info(f"Skipping {original_name}")
continue
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
pointer = getattr(pointer, scope_names[0])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name.endswith("_embeddings"):
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except ValueError as e:
e.args += (pointer.shape, array.shape)
raise
print(f"Initialize PyTorch weight {name}", original_name)
pointer.data = torch.from_numpy(array)
except AttributeError as e:
print(f"Skipping {original_name}", name, e)
continue
return model
class ElectraEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values_length: int = 0,
def forward(
self,
input_ids=None,
token_type_ids=None,
inputs_embeds=None,
position_ids=None,
past_key_values_length=0,
) -> torch.Tensor:
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class ElectraSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
):
pass
class ElectraSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class ElectraAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = ElectraSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = ElectraSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class ElectraIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class ElectraOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class ElectraLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = ElectraAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = ElectraAttention(config, position_embedding_type="absolute")
self.intermediate = ElectraIntermediate(config)
self.output = ElectraOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class ElectraEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([ElectraLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class ElectraDiscriminatorPredictions(nn.Module):
"""Prediction module for the discriminator, made up of two dense layers."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = get_activation(config.hidden_act)
self.dense_prediction = nn.Linear(config.hidden_size, 1)
self.config = config
def forward(self, discriminator_hidden_states):
hidden_states = self.dense(discriminator_hidden_states)
hidden_states = self.activation(hidden_states)
logits = self.dense_prediction(hidden_states).squeeze(-1)
return logits
class ElectraGeneratorPredictions(nn.Module):
"""Prediction module for the generator, made up of two dense layers."""
def __init__(self, config):
super().__init__()
self.activation = get_activation("gelu")
self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
self.dense = nn.Linear(config.hidden_size, config.embedding_size)
def forward(self, generator_hidden_states):
hidden_states = self.dense(generator_hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class ElectraPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = ElectraConfig
load_tf_weights = load_tf_weights_in_electra
base_model_prefix = "electra"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@dataclass
class ElectraForPreTrainingOutput(ModelOutput):
"""
Output type of [`ElectraForPreTraining`].
"""
Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
ELECTRA 目标函数的总损失。
如果提供了 `labels`,则返回此损失。
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
头部预测分数(SoftMax 前每个标记的分数)。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
元组包含 `torch.FloatTensor` 类型的张量(当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回)。
形状为 `(batch_size, sequence_length, hidden_size)`。
模型每一层的隐藏状态加上初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
元组包含 `torch.FloatTensor` 类型的张量(每层一个)。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
自注意力机制中注意力权重经过 softmax 后的结果,用于计算自注意力头的加权平均值。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
# ELECTRA 模型的文档字符串,描述了模型继承自 PreTrainedModel,并提供了一些通用方法的描述和链接
ELECTRA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# ELECTRA 模型的输入文档字符串,当前为空,通常应包含有关输入参数的描述
ELECTRA_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to "
"the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the "
"hidden size and embedding size are different. "
""
"Both the generator and discriminator checkpoints may be loaded into this model.",
ELECTRA_START_DOCSTRING,
)
# ElectraModel 类的定义,继承自 ElectraPreTrainedModel
class ElectraModel(ElectraPreTrainedModel):
# ElectraModel 类的初始化方法
def __init__(self, config):
# 调用父类的初始化方法
super().__init__(config)
# 初始化词嵌入层
self.embeddings = ElectraEmbeddings(config)
# 如果 embedding_size 与 hidden_size 不同,添加一个线性映射层
if config.embedding_size != config.hidden_size:
self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size)
# 初始化编码器
self.encoder = ElectraEncoder(config)
self.config = config
# 初始化权重并应用最终处理
self.post_init()
# 获取输入的词嵌入
def get_input_embeddings(self):
return self.embeddings.word_embeddings
# 设置输入的词嵌入
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
# 剪枝模型中的注意力头
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
# 向模型的前向方法添加文档字符串,描述了输入参数的格式
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 向模型的前向方法添加代码示例的文档字符串,包括了加载检查点、输出类型和配置类
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
# 定义 Transformer 模型的前向传播方法,接收多个输入参数
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的 token IDs,可以为空
attention_mask: Optional[torch.Tensor] = None, # 注意力 mask,可以为空
token_type_ids: Optional[torch.Tensor] = None, # token 类型 IDs,可以为空
position_ids: Optional[torch.Tensor] = None, # 位置 IDs,可以为空
head_mask: Optional[torch.Tensor] = None, # 头部 mask,可以为空
inputs_embeds: Optional[torch.Tensor] = None, # 输入的嵌入表示,可以为空
encoder_hidden_states: Optional[torch.Tensor] = None, # 编码器隐藏状态,可以为空
encoder_attention_mask: Optional[torch.Tensor] = None, # 编码器注意力 mask,可以为空
past_key_values: Optional[List[torch.FloatTensor]] = None, # 历史的键值对,可以为空
use_cache: Optional[bool] = None, # 是否使用缓存,可以为空
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可以为空
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可以为空
return_dict: Optional[bool] = None, # 是否返回字典格式的结果,可以为空
# ElectraClassificationHead 类定义,用于处理句子级别的分类任务
class ElectraClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
# 一个全连接层,输入和输出维度都是 config.hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# 分类器的 dropout 率,如果没有指定,则使用 config.hidden_dropout_prob
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
# 激活函数为 GELU
self.activation = get_activation("gelu")
# Dropout 层,使用指定的 dropout 率
self.dropout = nn.Dropout(classifier_dropout)
# 输出层全连接层,输入维度为 config.hidden_size,输出维度为 config.num_labels
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
# 取 <s> 标记对应的特征 (等效于 [CLS] 标记)
x = features[:, 0, :]
# 应用 dropout
x = self.dropout(x)
# 经过全连接层
x = self.dense(x)
# 应用激活函数 GELU
x = self.activation(x)
# 再次应用 dropout
x = self.dropout(x)
# 经过输出全连接层
x = self.out_proj(x)
# 返回分类结果
return x
# ElectraForSequenceClassification 类,继承自 ElectraPreTrainedModel 类,用于序列分类任务
@add_start_docstrings(
"""
ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
ELECTRA_START_DOCSTRING,
)
class ElectraForSequenceClassification(ElectraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 标签数量
self.num_labels = config.num_labels
# 配置
self.config = config
# Electra 模型
self.electra = ElectraModel(config)
# 序列分类器头部
self.classifier = ElectraClassificationHead(config)
# 初始化权重并进行最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="bhadresh-savani/electra-base-emotion",
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="'joy'",
expected_loss=0.06,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 根据需要确定是否使用返回字典,如果未指定则根据配置决定
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 ELECTRA 模型进行前向传播,获取鉴别器的隐藏状态
discriminator_hidden_states = self.electra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取鉴别器输出的序列特征向量
sequence_output = discriminator_hidden_states[0]
# 使用分类器对序列特征向量进行分类预测
logits = self.classifier(sequence_output)
# 初始化损失为 None
loss = None
# 如果存在标签,则计算损失
if labels is not None:
# 根据配置确定问题类型,如果未指定则根据标签类型确定
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据问题类型选择相应的损失函数
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# 如果不需要返回字典,则构造输出元组
if not return_dict:
output = (logits,) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
# 返回带有损失、预测 logits、隐藏状态和注意力权重的 SequenceClassifierOutput 对象
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions,
)
@add_start_docstrings(
"""
Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.
It is recommended to load the discriminator checkpoint into that model.
""",
ELECTRA_START_DOCSTRING,
)
class ElectraForPreTraining(ElectraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.electra = ElectraModel(config) # 初始化 Electra 模型
self.discriminator_predictions = ElectraDiscriminatorPredictions(config) # 初始化判别器预测组件
# 初始化权重并进行最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
Electra model with a language modeling head on top.
Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
the two to have been trained for the masked language modeling task.
""",
ELECTRA_START_DOCSTRING,
)
class ElectraForMaskedLM(ElectraPreTrainedModel):
_tied_weights_keys = ["generator_lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.electra = ElectraModel(config) # 初始化 Electra 模型
self.generator_predictions = ElectraGeneratorPredictions(config) # 初始化生成器预测组件
self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) # 初始化生成器的语言建模头部
# 初始化权重并进行最终处理
self.post_init()
def get_output_embeddings(self):
return self.generator_lm_head
def set_output_embeddings(self, word_embeddings):
self.generator_lm_head = word_embeddings
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="google/electra-small-generator",
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="[MASK]",
expected_output="'paris'",
expected_loss=1.22,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的token IDs,可选参数
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码,可选参数
token_type_ids: Optional[torch.Tensor] = None, # token 类型 IDs,可选参数
position_ids: Optional[torch.Tensor] = None, # 位置 IDs,可选参数
head_mask: Optional[torch.Tensor] = None, # 头部掩码,可选参数
inputs_embeds: Optional[torch.Tensor] = None, # 嵌入的输入,可选参数
labels: Optional[torch.Tensor] = None, # 用于计算MLM损失的标签,可选参数
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选参数
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选参数
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,可选参数
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 通过Electra模型生成隐状态
generator_hidden_states = self.electra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
generator_sequence_output = generator_hidden_states[0] # 获取生成器的序列输出
# 使用生成器预测MLM任务的分数
prediction_scores = self.generator_predictions(generator_sequence_output)
prediction_scores = self.generator_lm_head(prediction_scores) # 应用MLM的softmax层
loss = None
# 如果提供了标签,则计算MLM损失
if labels is not None:
loss_fct = nn.CrossEntropyLoss() # 交叉熵损失函数,-100索引表示填充标记
loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
# 如果不要求返回字典形式的输出,则返回元组形式的输出
if not return_dict:
output = (prediction_scores,) + generator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
# 返回MaskedLMOutput对象,包含损失、预测logits、隐藏状态和注意力权重
return MaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=generator_hidden_states.hidden_states,
attentions=generator_hidden_states.attentions,
)
# 定义一个基于 Electra 模型的标记分类器模型
@add_start_docstrings(
"""
Electra model with a token classification head on top.
Both the discriminator and generator may be loaded into this model.
""",
ELECTRA_START_DOCSTRING,
)
class ElectraForTokenClassification(ElectraPreTrainedModel):
def __init__(self, config):
# 调用父类的初始化方法
super().__init__(config)
# 设置分类器的标签数量
self.num_labels = config.num_labels
# 加载 Electra 模型
self.electra = ElectraModel(config)
# 获取分类器的 dropout 配置,如果未指定则使用隐藏层 dropout 的配置
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
# 定义一个 dropout 层
self.dropout = nn.Dropout(classifier_dropout)
# 定义一个线性层,用于分类
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# 初始化权重并进行最终的处理
self.post_init()
# 增加输入文档字符串到模型的前向传播方法
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 增加示例代码文档字符串到模型的前向传播方法
@add_code_sample_docstrings(
checkpoint="bhadresh-savani/electra-base-discriminator-finetuned-conll03-english",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output="['B-LOC', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC']",
expected_loss=0.11,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 前向传播方法,接收多种输入参数,返回模型的输出
# 可选参数包括输入的张量、注意力掩码、token 类型 ID、位置 ID、头部掩码、嵌入的输入张量、标签、是否输出注意力、是否输出隐藏状态、是否返回字典
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# 根据 return_dict 参数确定是否返回字典形式的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 Electra 模型进行推断
discriminator_hidden_states = self.electra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取 discriminator_hidden_states 的输出序列
discriminator_sequence_output = discriminator_hidden_states[0]
# 对输出序列应用 dropout
discriminator_sequence_output = self.dropout(discriminator_sequence_output)
# 使用分类器生成 logits
logits = self.classifier(discriminator_sequence_output)
# 初始化损失值为 None
loss = None
# 如果提供了标签,则计算交叉熵损失
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# 如果 return_dict 为 False,则按非字典形式返回输出
if not return_dict:
output = (logits,) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
# 如果 return_dict 为 True,则按 TokenClassifierOutput 对象形式返回输出
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions,
)
@add_start_docstrings(
"""
ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ELECTRA_START_DOCSTRING,
)
# 定义用于问答任务的 ELECTRA 模型,包含一个用于提取式问答任务(如 SQuAD)的跨度分类头部(在隐藏状态输出之上的线性层,
# 用于计算 `span start logits` 和 `span end logits`)。
class ElectraForQuestionAnswering(ElectraPreTrainedModel):
# 指定配置类
config_class = ElectraConfig
# 基础模型前缀
base_model_prefix = "electra"
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# ELECTRA 模型
self.electra = ElectraModel(config)
# 问答输出层,用于预测答案的开始和结束位置
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="bhadresh-savani/electra-base-squad2",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
qa_target_start_index=11,
qa_target_end_index=12,
expected_output="'a nice puppet'",
expected_loss=2.64,
)
# 前向传播方法,接收一系列输入并返回预测的答案开始和结束位置的 logit
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
ELECTRA_START_DOCSTRING,
)
# 定义用于多选分类任务的 ELECTRA 模型,包含一个用于多选分类任务(如 RocStories/SWAG)的分类头部(线性层放置在池化输出之上,
# 并应用 softmax 操作)。
class ElectraForMultipleChoice(ElectraPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# ELECTRA 模型
self.electra = ElectraModel(config)
# 序列汇总层
self.sequence_summary = SequenceSummary(config)
# 分类器层,用于多选任务的分类
self.classifier = nn.Linear(config.hidden_size, 1)
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的token ids,类型为可选的PyTorch张量
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码,类型为可选的PyTorch张量
token_type_ids: Optional[torch.Tensor] = None, # token类型 ids,类型为可选的PyTorch张量
position_ids: Optional[torch.Tensor] = None, # 位置 ids,类型为可选的PyTorch张量
head_mask: Optional[torch.Tensor] = None, # 头部掩码,类型为可选的PyTorch张量
inputs_embeds: Optional[torch.Tensor] = None, # 嵌入的输入,类型为可选的PyTorch张量
labels: Optional[torch.Tensor] = None, # 标签,用于多项选择分类损失计算的PyTorch张量
output_attentions: Optional[bool] = None, # 是否输出注意力权重,类型为可选的布尔值
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,类型为可选的布尔值
return_dict: Optional[bool] = None, # 是否返回字典格式的输出,类型为可选的布尔值
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict # 确定是否返回字典格式的输出
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] # 获取选择的数量
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None # 将输入token ids重新视图化
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None # 将注意力掩码重新视图化
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None # 将token类型 ids重新视图化
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None # 将位置 ids重新视图化
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
) # 将嵌入的输入重新视图化
discriminator_hidden_states = self.electra(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
) # 使用Electra模型进行前向传播,获取鉴别器的隐藏状态
sequence_output = discriminator_hidden_states[0] # 获取鉴别器的序列输出
pooled_output = self.sequence_summary(sequence_output) # 序列总结,获取池化输出
logits = self.classifier(pooled_output) # 分类器,计算logits
reshaped_logits = logits.view(-1, num_choices) # 重新形状化logits
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss() # 定义交叉熵损失函数
loss = loss_fct(reshaped_logits, labels) # 计算损失
if not return_dict:
output = (reshaped_logits,) + discriminator_hidden_states[1:] # 构造非字典格式的输出
return ((loss,) + output) if loss is not None else output # 返回损失和输出,如果损失不为None的话
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions,
) # 返回字典格式的多项选择模型输出
# 继承自 ElectraPreTrainedModel 类的 ELECTRA 语言模型,添加了用于条件语言建模 fine-tuning 的头部
@add_start_docstrings(
"""ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.""", ELECTRA_START_DOCSTRING
)
class ElectraForCausalLM(ElectraPreTrainedModel):
# 用于指定权重共享的键名列表
_tied_weights_keys = ["generator_lm_head.weight"]
# 初始化方法,接受配置参数 config
def __init__(self, config):
# 调用父类的初始化方法
super().__init__(config)
# 如果配置中未指定为解码器,发出警告
if not config.is_decoder:
logger.warning("If you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`")
# 创建 ELECTRA 模型
self.electra = ElectraModel(config)
# 创建用于生成预测的预测器对象
self.generator_predictions = ElectraGeneratorPredictions(config)
# 创建用于语言建模的线性层,设置输入维度和词汇表大小
self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)
# 初始化权重
self.init_weights()
# 返回语言建模头部的输出嵌入
def get_output_embeddings(self):
return self.generator_lm_head
# 设置语言建模头部的输出嵌入
def set_output_embeddings(self, new_embeddings):
self.generator_lm_head = new_embeddings
# 前向传播方法,接受多个输入参数,并返回预测的条件语言建模输出
@add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.Tensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# 此处为 forward 方法的详细文档注释
"""
Replace these docstrings with ones in transformers.models.**
"""
# 根据输入准备生成过程中需要的输入参数
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
# 如果未提供注意力掩码,则创建全 1 的注意力掩码
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# 如果传入了过去的键值,则根据它们调整输入的 input_ids
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# 某些生成方法可能只传入最后一个输入 ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# 默认行为:仅保留最后一个 ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
# 返回准备好的输入字典
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
# 定义方法 `_reorder_cache`,用于重新排序模型的缓存 `past_key_values`,以便与给定的 `beam_idx` 对齐
def _reorder_cache(self, past_key_values, beam_idx):
# 初始化一个空元组 `reordered_past`,用于存储重新排序后的缓存
reordered_past = ()
# 遍历 `past_key_values` 中的每个层的缓存
for layer_past in past_key_values:
# 对当前层的每个缓存状态 `past_state` 执行索引选择操作,
# 根据 `beam_idx` 对应的索引进行选择,并将结果转移到与 `past_state` 相同的设备上
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
# 返回重新排序后的缓存 `reordered_past`
return reordered_past