Transformers Source Code Walkthrough (Part 35)
.\models\deformable_detr\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_deformable_detr"] = ["DeformableDetrFeatureExtractor"]
_import_structure["image_processing_deformable_detr"] = ["DeformableDetrImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_deformable_detr"] = [
"DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST",
"DeformableDetrForObjectDetection",
"DeformableDetrModel",
"DeformableDetrPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_deformable_detr import DeformableDetrFeatureExtractor
from .image_processing_deformable_detr import DeformableDetrImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_deformable_detr import (
DEFORMABLE_DETR_PRETRAINED_MODEL_ARCHIVE_LIST,
DeformableDetrForObjectDetection,
DeformableDetrModel,
DeformableDetrPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
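The `_LazyModule` swap above keeps `import transformers` cheap: the vision and torch submodules listed in `_import_structure` are only loaded when one of their exported names is first accessed. A minimal sketch of the effect (assuming `transformers` and torch are installed):

```python
# The configuration import is lightweight; modeling_deformable_detr is not loaded yet.
from transformers import DeformableDetrConfig

config = DeformableDetrConfig()
print(config.model_type)  # "deformable_detr"

# Accessing the model class is what triggers the lazy import of the torch modeling code.
from transformers import DeformableDetrModel

model = DeformableDetrModel(config)  # randomly initialized weights
```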
.\models\deit\configuration_deit.py
""" DeiT model configuration"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/deit-base-distilled-patch16-224": (
"https://huggingface.co/facebook/deit-base-patch16-224/resolve/main/config.json"
),
}
class DeiTConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`DeiTModel`]. It is used to instantiate a DeiT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DeiT
[facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "deit"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-12,
image_size=224,
patch_size=16,
num_channels=3,
qkv_bias=True,
encoder_stride=16,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.qkv_bias = qkv_bias
self.encoder_stride = encoder_stride
class DeiTOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
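A minimal usage sketch for the configuration above (assuming torch is installed): the default arguments reproduce a deit-base-distilled-patch16-224 style architecture, and a randomly initialized model can be built straight from it.

```python
from transformers import DeiTConfig, DeiTModel

configuration = DeiTConfig()      # deit-base style defaults: 12 layers, hidden_size 768, 224x224 images
model = DeiTModel(configuration)  # randomly initialized model
print(model.config.hidden_size)   # 768
```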
.\models\deit\convert_deit_timm_to_pytorch.py
"""Convert DeiT distilled checkpoints from the timm library."""
import argparse
import json
from pathlib import Path
import requests
import timm
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config, base_model=False):
rename_keys = []
for i in range(config.num_hidden_layers):
rename_keys.append((f"blocks.{i}.norm1.weight", f"deit.encoder.layer.{i}.layernorm_before.weight"))
rename_keys.append((f"blocks.{i}.norm1.bias", f"deit.encoder.layer.{i}.layernorm_before.bias"))
rename_keys.append((f"blocks.{i}.attn.proj.weight", f"deit.encoder.layer.{i}.attention.output.dense.weight"))
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"deit.encoder.layer.{i}.attention.output.dense.bias"))
rename_keys.append((f"blocks.{i}.norm2.weight", f"deit.encoder.layer.{i}.layernorm_after.weight"))
rename_keys.append((f"blocks.{i}.norm2.bias", f"deit.encoder.layer.{i}.layernorm_after.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"deit.encoder.layer.{i}.intermediate.dense.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"deit.encoder.layer.{i}.intermediate.dense.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"deit.encoder.layer.{i}.output.dense.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"deit.encoder.layer.{i}.output.dense.bias"))
rename_keys.extend(
[
("cls_token", "deit.embeddings.cls_token"),
("dist_token", "deit.embeddings.distillation_token"),
("patch_embed.proj.weight", "deit.embeddings.patch_embeddings.projection.weight"),
("patch_embed.proj.bias", "deit.embeddings.patch_embeddings.projection.bias"),
("pos_embed", "deit.embeddings.position_embeddings"),
]
)
if base_model:
rename_keys.extend(
[
("norm.weight", "layernorm.weight"),
("norm.bias", "layernorm.bias"),
("pre_logits.fc.weight", "pooler.dense.weight"),
("pre_logits.fc.bias", "pooler.dense.bias"),
]
)
rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("deit") else pair for pair in rename_keys]
else:
rename_keys.extend(
[
("norm.weight", "deit.layernorm.weight"),
("norm.bias", "deit.layernorm.bias"),
("head.weight", "cls_classifier.weight"),
("head.bias", "cls_classifier.bias"),
("head_dist.weight", "distillation_classifier.weight"),
("head_dist.bias", "distillation_classifier.bias"),
]
)
return rename_keys
def read_in_q_k_v(state_dict, config, base_model=False):
for i in range(config.num_hidden_layers):
if base_model:
prefix = ""
else:
prefix = "deit."
in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
: config.hidden_size, :
]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
config.hidden_size : config.hidden_size * 2, :
]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
config.hidden_size : config.hidden_size * 2
]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
-config.hidden_size :, :
]
state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path):
"""
将模型权重从其它结构复制、粘贴并调整到我们的 DeiT 结构中。
"""
config = DeiTConfig()
base_model = False
config.num_labels = 1000
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
config.patch_size = int(deit_name[-6:-4])
config.image_size = int(deit_name[-3:])
if deit_name[9:].startswith("tiny"):
config.hidden_size = 192
config.intermediate_size = 768
config.num_hidden_layers = 12
config.num_attention_heads = 3
elif deit_name[9:].startswith("small"):
config.hidden_size = 384
config.intermediate_size = 1536
config.num_hidden_layers = 12
config.num_attention_heads = 6
if deit_name[9:].startswith("base"):
pass
elif deit_name[4:].startswith("large"):
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_hidden_layers = 24
config.num_attention_heads = 16
timm_model = timm.create_model(deit_name, pretrained=True)
timm_model.eval()
state_dict = timm_model.state_dict()
rename_keys = create_rename_keys(config, base_model)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, config, base_model)
model = DeiTForImageClassificationWithTeacher(config).eval()
model.load_state_dict(state_dict)
size = int(
(256 / 224) * config.image_size
)
image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size)
encoding = image_processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values)
timm_logits = timm_model(pixel_values)
assert timm_logits.shape == outputs.logits.shape
assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {deit_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--deit_name",
default="vit_deit_base_distilled_patch16_224",
type=str,
help="Name of the DeiT timm model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
args = parser.parse_args()
convert_deit_checkpoint(args.deit_name, args.pytorch_dump_folder_path)
.\models\deit\feature_extraction_deit.py
"""DeiT 的特征提取器类。"""
import warnings
from ...utils import logging
from .image_processing_deit import DeiTImageProcessor
logger = logging.get_logger(__name__)
class DeiTFeatureExtractor(DeiTImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class DeiTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
" use DeiTImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\deit\image_processing_deit.py
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
class DeiTImageProcessor(BaseImageProcessor):
r"""
Constructs a DeiT image processor.
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in `preprocess`.
size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`):
Size of the image after `resize`. Can be overridden by `size` in `preprocess`.
resample (`PILImageResampling` filter, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
    # The model expects a single input named "pixel_values"
    model_input_names = ["pixel_values"]
    # Constructor: sets the default value of every preprocessing step
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PIL.Image.BICUBIC,
        do_center_crop: bool = True,
        crop_size: Dict[str, int] = None,
        rescale_factor: Union[int, float] = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ):
        # Forward any extra keyword arguments to the parent constructor
        super().__init__(**kwargs)
        # Default to {"height": 256, "width": 256} when no size is given
        size = size if size is not None else {"height": 256, "width": 256}
        # Normalize the size dictionary into the canonical format
        size = get_size_dict(size)
        # Default to {"height": 224, "width": 224} when no crop_size is given
        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
        # Normalize the crop_size dictionary, reporting errors under the name "crop_size"
        crop_size = get_size_dict(crop_size, param_name="crop_size")
        # Whether to resize the input images
        self.do_resize = do_resize
        # Target size used when resizing
        self.size = size
        # Resampling filter used when resizing
        self.resample = resample
        # Whether to center crop the images
        self.do_center_crop = do_center_crop
        # Target size used when center cropping
        self.crop_size = crop_size
        # Whether to rescale pixel values
        self.do_rescale = do_rescale
        # Factor used when rescaling pixel values
        self.rescale_factor = rescale_factor
        # Whether to normalize the images
        self.do_normalize = do_normalize
        # Fall back to the ImageNet mean when no mean is given
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        # Fall back to the ImageNet standard deviation when no std is given
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        # Keyword arguments accepted by `preprocess`, used for validation
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]
    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize, with the resampling filter set to PILImageResampling.BICUBIC
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
        size = get_size_dict(size)  # normalize the size dictionary into the canonical format
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
        output_size = (size["height"], size["width"])  # target (height, width) of the output image
        return resize(
            image,
            size=output_size,  # resize to the requested size
            resample=resample,  # resampling filter to use
            data_format=data_format,  # channel dimension format of the output image
            input_data_format=input_data_format,  # channel dimension format of the input image
            **kwargs,
        )
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample=None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
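The `preprocess` body is cut off above, but end to end the processor resizes to 256, center crops to 224, rescales by 1/255 and applies ImageNet normalization. A minimal sketch of calling it (assuming Pillow and torch are installed; the image here is a dummy array):

```python
import numpy as np
from PIL import Image
from transformers import DeiTImageProcessor

processor = DeiTImageProcessor()  # defaults: resize to 256, center crop to 224
image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))  # dummy RGB image
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```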
.\models\deit\modeling_deit.py
""" PyTorch DeiT模型。"""
import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Set, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
ImageClassifierOutput,
MaskedImageModelingOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_deit import DeiTConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DeiTConfig"
_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
_EXPECTED_OUTPUT_SHAPE = [1, 198, 768]
_IMAGE_CLASS_CHECKPOINT = "facebook/deit-base-distilled-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/deit-base-distilled-patch16-224",
]
class DeiTEmbeddings(nn.Module):
"""
构建CLS令牌、蒸馏令牌、位置和补丁嵌入。可选地,还包括掩码令牌。
"""
def __init__(self, config: DeiTConfig, use_mask_token: bool = False) -> None:
super().__init__()
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.distillation_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
self.patch_embeddings = DeiTPatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 2, config.hidden_size))
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor:
embeddings = self.patch_embeddings(pixel_values)
batch_size, seq_length, _ = embeddings.size()
if bool_masked_pos is not None:
mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
distillation_tokens = self.distillation_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, distillation_tokens, embeddings), dim=1)
embeddings = embeddings + self.position_embeddings
embeddings = self.dropout(embeddings)
return embeddings
class DeiTPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
x = self.projection(pixel_values).flatten(2).transpose(1, 2)
return x
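The sequence length follows directly from the patch grid: with the default 224x224 input and 16x16 patches there are 14 * 14 = 196 patches, and the CLS and distillation tokens bring the length to 198, matching the `_EXPECTED_OUTPUT_SHAPE` of `[1, 198, 768]` quoted earlier. A quick sanity check:

```python
image_size, patch_size = 224, 16
num_patches = (image_size // patch_size) ** 2  # 196 patches
seq_length = num_patches + 2                   # + CLS token + distillation token = 198
print(num_patches, seq_length)                 # 196 198
```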
class DeiTSelfAttention(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)
def forward(
self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
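For reference, a standalone shape walk-through of the attention math above with DeiT-base sizes (batch 1, 198 tokens, 12 heads of size 64); this sketch is not part of the file itself:

```python
import torch

batch, seq_len, heads, head_size = 1, 198, 12, 64
q = k = v = torch.randn(batch, heads, seq_len, head_size)  # shape after transpose_for_scores

scores = torch.matmul(q, k.transpose(-1, -2)) / head_size**0.5  # (1, 12, 198, 198)
probs = scores.softmax(dim=-1)
context = torch.matmul(probs, v)                                # (1, 12, 198, 64)
context = context.permute(0, 2, 1, 3).reshape(batch, seq_len, heads * head_size)
print(context.shape)                                            # torch.Size([1, 198, 768])
```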
class DeiTSelfOutput(nn.Module):
"""
The residual connection is defined in DeiTLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class DeiTAttention(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.attention = DeiTSelfAttention(config)
self.output = DeiTSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads: Set[int]) -> None:
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, head_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class DeiTIntermediate(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class DeiTOutput(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
class DeiTLayer(nn.Module):
"""This corresponds to the Block class in the timm implementation."""
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = DeiTAttention(config)
self.intermediate = DeiTIntermediate(config)
self.output = DeiTOutput(config)
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states),
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
hidden_states = attention_output + hidden_states
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
layer_output = self.output(layer_output, hidden_states)
outputs = (layer_output,) + outputs
return outputs
class DeiTEncoder(nn.Module):
def __init__(self, config: DeiTConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([DeiTLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class DeiTPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DeiTConfig
base_model_prefix = "deit"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
_no_split_modules = ["DeiTLayer"]
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data = nn.init.trunc_normal_(
module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
).to(module.weight.dtype)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
DEIT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DEIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`DeiTImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare DeiT Model transformer outputting raw hidden-states without any specific head on top.",
DEIT_START_DOCSTRING,
)
class DeiTModel(DeiTPreTrainedModel):
def __init__(self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False) -> None:
super().__init__(config)
self.config = config
self.embeddings = DeiTEmbeddings(config, use_mask_token=use_mask_token)
self.encoder = DeiTEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.pooler = DeiTPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self) -> DeiTPatchEmbeddings:
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
if pixel_values.dtype != expected_dtype:
pixel_values = pixel_values.to(expected_dtype)
embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
encoder_outputs = self.encoder(
embedding_output,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
return head_outputs + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class DeiTPooler(nn.Module):
def __init__(self, config: DeiTConfig):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
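Putting `DeiTModel` together, a minimal forward pass with random weights and a random image (a sketch only; real usage would load a checkpoint with `from_pretrained`):

```python
import torch
from transformers import DeiTConfig, DeiTModel

model = DeiTModel(DeiTConfig(), add_pooling_layer=True).eval()
pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 198, 768])
print(outputs.pooler_output.shape)      # torch.Size([1, 768])
```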
@add_start_docstrings(
"""DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).
<Tip>
Note that we provide a script to pre-train this model on custom data in our [examples
directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
</Tip>
""",
DEIT_START_DOCSTRING,
)
class DeiTForMaskedImageModeling(DeiTPreTrainedModel):
def __init__(self, config: DeiTConfig) -> None:
super().__init__(config)
self.deit = DeiTModel(config, add_pooling_layer=False, use_mask_token=True)
self.decoder = nn.Sequential(
nn.Conv2d(
in_channels=config.hidden_size,
out_channels=config.encoder_stride**2 * config.num_channels,
kernel_size=1,
),
nn.PixelShuffle(config.encoder_stride),
)
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=MaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
DEIT_START_DOCSTRING,
)
class DeiTForImageClassification(DeiTPreTrainedModel):
def __init__(self, config: DeiTConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.deit = DeiTModel(config, add_pooling_layer=False)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@dataclass
class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
"""
`DeiTForImageClassificationWithTeacher`的输出类型。
Args:
logits (`torch.FloatTensor`,形状为 `(batch_size, config.num_labels)`):
预测分数,是`cls_logits`和`distillation_logits`的平均值。
cls_logits (`torch.FloatTensor`,形状为 `(batch_size, config.num_labels)`):
分类头部的预测分数(即最终隐藏状态的类令牌上的线性层)。
distillation_logits (`torch.FloatTensor`,形状为 `(batch_size, config.num_labels)`):
蒸馏头部的预测分数(即最终隐藏状态的蒸馏令牌上的线性层)。
hidden_states (`tuple(torch.FloatTensor)`,*可选*,当传递 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
一个元组,包含 `torch.FloatTensor`(一个用于嵌入的输出 + 每个层的输出),形状为 `(batch_size, sequence_length, hidden_size)`。
模型在每一层输出的隐藏状态,以及初始嵌入的输出。
attentions (`tuple(torch.FloatTensor)`,*可选*,当传递 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
一个元组,包含 `torch.FloatTensor`(每个层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
经过注意力 softmax 后的注意力权重,在自注意力头中用于计算加权平均值。
"""
@add_start_docstrings(
"""
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token), e.g. for ImageNet.
    .. warning::
            This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
            supported.
""",
DEIT_START_DOCSTRING,
)
class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
"""
    DeiT model with a teacher (distillation) head, used for image classification.
"""
def __init__(self, config: DeiTConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.deit = DeiTModel(config, add_pooling_layer=False)
self.cls_classifier = (
nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.distillation_classifier = (
nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=DeiTForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, DeiTForImageClassificationWithTeacherOutput]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deit(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
cls_logits = self.cls_classifier(sequence_output[:, 0, :])
distillation_logits = self.distillation_classifier(sequence_output[:, 1, :])
logits = (cls_logits + distillation_logits) / 2
if not return_dict:
output = (logits, cls_logits, distillation_logits) + outputs[1:]
return output
return DeiTForImageClassificationWithTeacherOutput(
logits=logits,
cls_logits=cls_logits,
distillation_logits=distillation_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
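The head averaging in the forward pass above is easy to verify with a randomly initialized model (a sketch; the documented example uses the facebook/deit-base-distilled-patch16-224 checkpoint instead, and `num_labels=10` here is arbitrary):

```python
import torch
from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher

model = DeiTForImageClassificationWithTeacher(DeiTConfig(num_labels=10)).eval()
pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    out = model(pixel_values)
# logits are exactly the mean of the classification head and the distillation head
assert torch.allclose(out.logits, (out.cls_logits + out.distillation_logits) / 2)
```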
.\models\deit\modeling_tf_deit.py
""" TensorFlow DeiT model. """
from __future__ import annotations
import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFImageClassifierOutput,
TFMaskedImageModelingOutput,
)
from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_deit import DeiTConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DeiTConfig"
_CHECKPOINT_FOR_DOC = "facebook/deit-base-distilled-patch16-224"
_EXPECTED_OUTPUT_SHAPE = [1, 198, 768]
_IMAGE_CLASS_CHECKPOINT = "facebook/deit-base-distilled-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
TF_DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/deit-base-distilled-patch16-224",
]
@dataclass
class TFDeiTForImageClassificationWithTeacherOutput(ModelOutput):
"""
[`DeiTForImageClassificationWithTeacher`] 的输出类型。
"""
Args:
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Prediction scores as the average of the cls_logits and distillation logits.
预测分数,作为 cls_logits 和 distillation_logits 的平均值
cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
class token).
分类头部的预测分数,即在类令牌的最终隐藏状态之上的线性层的输出
distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
distillation token).
蒸馏头部的预测分数,即在蒸馏令牌的最终隐藏状态之上的线性层的输出
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
the initial embedding outputs.
隐藏状态的元组,包含每个层的输出(嵌入层输出和每层输出),仅在设置 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
注意力权重的元组,包含每个层的注意力权重,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,
仅在设置 `output_attentions=True` 或 `config.output_attentions=True` 时返回
"""
    # Output fields, all defaulting to None
logits: tf.Tensor = None
cls_logits: tf.Tensor = None
distillation_logits: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
"""
构建 CLS token、蒸馏 token、位置和补丁嵌入。可选择是否包含 mask token。
"""
def __init__(self, config: DeiTConfig, use_mask_token: bool = False, **kwargs) -> None:
super().__init__(**kwargs)
self.config = config
self.use_mask_token = use_mask_token
self.patch_embeddings = TFDeiTPatchEmbeddings(config=config, name="patch_embeddings")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
def build(self, input_shape=None):
        # CLS token weight of shape (1, 1, hidden_size), zero-initialized
        self.cls_token = self.add_weight(
            shape=(1, 1, self.config.hidden_size),
            initializer=keras.initializers.zeros(),
            trainable=True,
            name="cls_token",
        )
        # Distillation token weight of shape (1, 1, hidden_size), zero-initialized
        self.distillation_token = self.add_weight(
            shape=(1, 1, self.config.hidden_size),
            initializer=keras.initializers.zeros(),
            trainable=True,
            name="distillation_token",
        )
        # Optional mask token weight of shape (1, 1, hidden_size), zero-initialized
        self.mask_token = None
        if self.use_mask_token:
            self.mask_token = self.add_weight(
                shape=(1, 1, self.config.hidden_size),
                initializer=keras.initializers.zeros(),
                trainable=True,
                name="mask_token",
            )
        # Number of patches, needed to size the position embeddings
        num_patches = self.patch_embeddings.num_patches
        # Position embedding weight of shape (1, num_patches + 2, hidden_size), zero-initialized
        self.position_embeddings = self.add_weight(
            shape=(1, num_patches + 2, self.config.hidden_size),
            initializer=keras.initializers.zeros(),
            trainable=True,
            name="position_embeddings",
        )
        # Nothing else to do if the layer has already been built
        if self.built:
            return
        self.built = True
        # Build the patch embedding sub-layer if it exists
        if getattr(self, "patch_embeddings", None) is not None:
            with tf.name_scope(self.patch_embeddings.name):
                self.patch_embeddings.build(None)
        # Build the dropout sub-layer if it exists
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)
def call(
self, pixel_values: tf.Tensor, bool_masked_pos: tf.Tensor | None = None, training: bool = False
    ) -> tf.Tensor:
        # Compute the patch embeddings from the pixel values
        embeddings = self.patch_embeddings(pixel_values)
        # Batch size, sequence length and feature dimension of the embeddings
        batch_size, seq_length, _ = shape_list(embeddings)
        if bool_masked_pos is not None:
            # Replace the masked visual tokens with the mask token
            mask_tokens = tf.tile(self.mask_token, [batch_size, seq_length, 1])
            # Build a boolean mask and broadcast it over the feature dimension
            mask = tf.expand_dims(bool_masked_pos, axis=-1)
            mask = tf.cast(mask, dtype=mask_tokens.dtype)
            # Keep the unmasked embeddings, substitute mask_tokens at the masked positions
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
        # Repeat the CLS token for every example in the batch
        cls_tokens = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
        # Repeat the distillation token for every example in the batch
        distillation_tokens = tf.repeat(self.distillation_token, repeats=batch_size, axis=0)
        # Prepend the CLS and distillation tokens to the patch embeddings
        embeddings = tf.concat((cls_tokens, distillation_tokens, embeddings), axis=1)
        # Add the position embeddings
        embeddings = embeddings + self.position_embeddings
        # Apply dropout (only active during training)
        embeddings = self.dropout(embeddings, training=training)
        # Return the final embeddings
        return embeddings
# Custom layer TFDeiTPatchEmbeddings: turns `pixel_values` into the initial hidden states (patch embeddings) consumed by the Transformer
class TFDeiTPatchEmbeddings(keras.layers.Layer):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config: DeiTConfig, **kwargs) -> None:
super().__init__(**kwargs)
        # Read the image size and patch size from the configuration
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size
        # Turn scalar image/patch sizes into (height, width) tuples
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        # Number of patches in the grid
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        # Store the attributes on the layer
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        # Convolution that projects the pixel values into the hidden size expected by the Transformer
self.projection = keras.layers.Conv2D(
hidden_size, kernel_size=patch_size, strides=patch_size, name="projection"
)
    # Forward pass: turn pixel values into patch embeddings
    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
        # Shape information of the input tensor
        batch_size, height, width, num_channels = shape_list(pixel_values)
        # In eager mode, check that the channel dimension matches the configuration
        if tf.executing_eagerly() and num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # In eager mode, check that the spatial size matches the configured image size
        if tf.executing_eagerly() and (height != self.image_size[0] or width != self.image_size[1]):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        # Project the pixel values with the convolution
        x = self.projection(pixel_values)
        # Shape of the projected tensor
        batch_size, height, width, num_channels = shape_list(x)
        # Flatten the spatial grid to (batch_size, seq_length, hidden_size), with seq_length = height * width
        x = tf.reshape(x, (batch_size, height * width, num_channels))
        # Return the patch embeddings
        return x
    # Build the projection layer when the model is first built
    def build(self, input_shape=None):
        # Nothing to do if the layer has already been built
        if self.built:
            return
        # Mark the layer as built
        self.built = True
        # Build the projection layer under its own name scope
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, None, self.num_channels])
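Note that this TF layer works on channels_last tensors of shape (batch, height, width, channels), so the Conv2D projection yields a (batch, 14, 14, hidden_size) grid that is then flattened to (batch, 196, hidden_size). A standalone sketch of that reshape (assuming TensorFlow is installed):

```python
import tensorflow as tf

pixel_values = tf.zeros((1, 224, 224, 3))  # channels_last dummy image
projection = tf.keras.layers.Conv2D(768, kernel_size=16, strides=16)
patches = projection(pixel_values)                # (1, 14, 14, 768)
patches = tf.reshape(patches, (1, 14 * 14, 768))  # (1, 196, 768)
print(patches.shape)
```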
# Self-attention layer copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention and adapted for DeiT
class TFDeiTSelfAttention(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
        # The hidden size must be divisible by the number of attention heads
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
                f"of attention heads ({config.num_attention_heads})"
            )
        # Number of attention heads and size of each head
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
        # Dense layers producing the query, key and value projections
        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
        # Dropout applied to the attention probabilities
        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
        # Transpose to [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
        # Batch size of the hidden states
        batch_size = shape_list(hidden_states)[0]
        # Project the hidden states into queries
        mixed_query_layer = self.query(inputs=hidden_states)
        # Project the hidden states into keys
        mixed_key_layer = self.key(inputs=hidden_states)
        # Project the hidden states into values
        mixed_value_layer = self.value(inputs=hidden_states)
        # Reshape and transpose the projections into per-head tensors
        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
        # Raw attention scores as the dot product of queries and keys
        # Result shape: (batch_size, num_heads, seq_len_q, seq_len_k)
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        # Scale the attention scores by sqrt(head size)
        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
        attention_scores = tf.divide(attention_scores, dk)
        # Normalize the scores into probabilities
        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
        # Apply dropout to the attention probabilities
        attention_probs = self.dropout(inputs=attention_probs, training=training)
        # Apply the head mask if one was given
        if head_mask is not None:
            attention_probs = tf.multiply(attention_probs, head_mask)
        # Weighted sum of the value vectors
        attention_output = tf.matmul(attention_probs, value_layer)
        # Reorder the dimensions back to (batch_size, seq_len_q, num_heads, head_size)
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
        # Merge the heads into (batch_size, seq_len_q, all_head_size)
        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
        # Optionally return the attention probabilities alongside the output
        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
return outputs
def build(self, input_shape=None):
        # Return immediately if the layer has already been built
        if self.built:
            return
        # Mark the layer as built
        self.built = True
        # Build the query projection if it exists
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.hidden_size])
        # Build the key projection if it exists
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.config.hidden_size])
        # Build the value projection if it exists
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->DeiT
class TFDeiTSelfOutput(keras.layers.Layer):
"""
The residual connection is defined in TFDeiTLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
        # Dense layer that transforms the hidden states
        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        # Dropout layer to regularize the transformed hidden states
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config
    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Transform the hidden states with the dense layer
        hidden_states = self.dense(inputs=hidden_states)
        # Apply dropout (only active during training)
        hidden_states = self.dropout(inputs=hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
        # Return immediately if the layer has already been built
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                # Build the dense layer with the hidden size as the last input dimension
                self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->DeiT
class TFDeiTAttention(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
        # Self-attention sub-layer
        self.self_attention = TFDeiTSelfAttention(config, name="attention")
        # Output sub-layer that processes the self-attention output
        self.dense_output = TFDeiTSelfOutput(config, name="output")
def prune_heads(self, heads):
        # Head pruning is not implemented for the TF model
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
        # Run self-attention over the input tensor
        self_outputs = self.self_attention(
            hidden_states=input_tensor, head_mask=head_mask, output_attentions=output_attentions, training=training
        )
        # Feed the self-attention output through the output sub-layer
        attention_output = self.dense_output(
            hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
        )
        # Append the attention probabilities when they were requested
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
return outputs
def build(self, input_shape=None):
        # Return immediately if the layer has already been built
        if self.built:
            return
        self.built = True
        if getattr(self, "self_attention", None) is not None:
            with tf.name_scope(self.self_attention.name):
                # Build the self-attention sub-layer; no concrete input shape is needed
                self.self_attention.build(None)
        if getattr(self, "dense_output", None) is not None:
            with tf.name_scope(self.dense_output.name):
                # Build the output sub-layer; no concrete input shape is needed
                self.dense_output.build(None)
# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->DeiT
class TFDeiTIntermediate(keras.layers.Layer):
# 初始化方法,用于初始化一个新的对象实例
def __init__(self, config: DeiTConfig, **kwargs):
# 调用父类的初始化方法
super().__init__(**kwargs)
# 创建一个全连接层,设置单元数为config中指定的中间大小,
# 内核初始化方式使用config中的初始化范围,命名为"dense"
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 根据config中的隐藏激活函数,获取对应的激活函数对象或者名称
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
# 将config保存在对象中
self.config = config
# 调用方法,用于执行前向传播
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# 使用全连接层处理输入的隐藏状态数据
hidden_states = self.dense(inputs=hidden_states)
# 使用中间激活函数处理全连接层的输出隐藏状态数据
hidden_states = self.intermediate_act_fn(hidden_states)
# 返回处理后的隐藏状态数据
return hidden_states
# 构建方法,用于构建模型层次结构
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
# 标记为已经构建
self.built = True
# 如果存在全连接层dense,则使用其名字作为作用域,构建该层
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
# Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->DeiT: the Transformer output block, using the DeiT configuration.
class TFDeiTOutput(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,输出维度为config.hidden_size,权重初始化使用config中定义的初始化范围。
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 创建一个Dropout层,使用config中定义的dropout概率。
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
# 通过全连接层处理输入hidden_states,输出经过线性变换后的结果。
hidden_states = self.dense(inputs=hidden_states)
# 对处理后的结果进行Dropout操作,以防止过拟合。
hidden_states = self.dropout(inputs=hidden_states, training=training)
# 将Dropout后的结果与输入tensor相加,实现残差连接。
hidden_states = hidden_states + input_tensor
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# Build the dense layer under the name scope of self.dense; its input feature dimension is self.config.intermediate_size
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
# TFDeiTLayer is a single Transformer block; it corresponds to the Block class in the timm implementation.
class TFDeiTLayer(keras.layers.Layer):
"""This corresponds to the Block class in the timm implementation."""
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
# 创建DeiTAttention层,使用给定的DeiT配置,并命名为"attention"。
self.attention = TFDeiTAttention(config, name="attention")
# 创建DeiTIntermediate层,使用给定的DeiT配置,并命名为"intermediate"。
self.intermediate = TFDeiTIntermediate(config, name="intermediate")
# 创建TFDeiTOutput层,使用给定的DeiT配置,并命名为"output"。
self.deit_output = TFDeiTOutput(config, name="output")
# 创建LayerNormalization层,在训练过程中使用给定的epsilon参数进行归一化,命名为"layernorm_before"和"layernorm_after"。
self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
self.config = config
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
# 对输入hidden_states应用LayerNormalization,然后传递给self.attention,获取attention_outputs。
attention_outputs = self.attention(
input_tensor=self.layernorm_before(inputs=hidden_states, training=training),
head_mask=head_mask,
output_attentions=output_attentions,
training=training,
)
attention_output = attention_outputs[0]
# 第一个残差连接,将attention_output加到hidden_states上。
hidden_states = attention_output + hidden_states
# 再次应用LayerNormalization,得到layer_output。
layer_output = self.layernorm_after(inputs=hidden_states, training=training)
# 将layer_output传递给self.intermediate层进行处理,得到intermediate_output。
intermediate_output = self.intermediate(hidden_states=layer_output, training=training)
# 第二个残差连接,将intermediate_output和hidden_states传递给self.deit_output处理,得到layer_output。
layer_output = self.deit_output(
hidden_states=intermediate_output, input_tensor=hidden_states, training=training
)
# 将输出打包成元组outputs,如果需要输出attention信息,则将其添加到outputs中。
outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
return outputs
# 定义一个方法用于建立模型,可以接受输入形状作为参数
def build(self, input_shape=None):
# 如果模型已经建立过,则直接返回,不进行重复建立
if self.built:
return
# 设置标记,表明模型已经建立
self.built = True
# 如果存在 self.attention 属性,则构建 attention 层
if getattr(self, "attention", None) is not None:
# 使用 attention 层的名称作为命名空间
with tf.name_scope(self.attention.name):
# 调用 attention 层的 build 方法,传入 None 作为输入形状
self.attention.build(None)
# 如果存在 self.intermediate 属性,则构建 intermediate 层
if getattr(self, "intermediate", None) is not None:
# 使用 intermediate 层的名称作为命名空间
with tf.name_scope(self.intermediate.name):
# 调用 intermediate 层的 build 方法,传入 None 作为输入形状
self.intermediate.build(None)
# 如果存在 self.deit_output 属性,则构建 deit_output 层
if getattr(self, "deit_output", None) is not None:
# 使用 deit_output 层的名称作为命名空间
with tf.name_scope(self.deit_output.name):
# 调用 deit_output 层的 build 方法,传入 None 作为输入形状
self.deit_output.build(None)
# 如果存在 self.layernorm_before 属性,则构建 layernorm_before 层
if getattr(self, "layernorm_before", None) is not None:
# 使用 layernorm_before 层的名称作为命名空间
with tf.name_scope(self.layernorm_before.name):
# 调用 layernorm_before 层的 build 方法,传入一个形状为 [None, None, self.config.hidden_size] 的列表作为输入形状
self.layernorm_before.build([None, None, self.config.hidden_size])
# 如果存在 self.layernorm_after 属性,则构建 layernorm_after 层
if getattr(self, "layernorm_after", None) is not None:
# 使用 layernorm_after 层的名称作为命名空间
with tf.name_scope(self.layernorm_after.name):
# 调用 layernorm_after 层的 build 方法,传入一个形状为 [None, None, self.config.hidden_size] 的列表作为输入形状
self.layernorm_after.build([None, None, self.config.hidden_size])
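# A minimal sketch (not the implementation above) of the pre-norm ordering used by TFDeiTLayer:
# each sub-block reads a LayerNorm'ed copy of the stream, while the residual additions use the
# un-normalized stream.
def _pre_norm_block_sketch(x, attention, mlp, norm_before, norm_after):
    x = x + attention(norm_before(x))  # first residual: attention over the normalized input
    x = x + mlp(norm_after(x))         # second residual: MLP over the normalized input
    return x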
# Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->DeiT
class TFDeiTEncoder(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
# 初始化层列表,每层命名为"layer_._{i}"
self.layer = [TFDeiTLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
# 如果输出隐藏状态,初始化空元组以存储所有隐藏状态
all_hidden_states = () if output_hidden_states else None
# 如果输出注意力权重,初始化空元组以存储所有注意力权重
all_attentions = () if output_attentions else None
# 遍历每个编码层
for i, layer_module in enumerate(self.layer):
# 如果输出隐藏状态,将当前隐藏状态加入到列表中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 调用编码层的计算
layer_outputs = layer_module(
hidden_states=hidden_states,
head_mask=head_mask[i],
output_attentions=output_attentions,
training=training,
)
# 更新隐藏状态为当前层的输出
hidden_states = layer_outputs[0]
# 如果输出注意力权重,将当前层的注意力权重加入到列表中
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# 添加最后一层的隐藏状态到所有隐藏状态列表中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不返回字典形式的结果,根据是否为空过滤掉None值并返回结果元组
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
# 返回 TFBaseModelOutput 对象,包含最后的隐藏状态、所有隐藏状态和所有注意力权重
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
# 如果已经构建,直接返回
if self.built:
return
# 标记为已构建
self.built = True
# 如果存在层列表,为每一层设置命名作用域并构建层
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFDeiTMainLayer(keras.layers.Layer):
config_class = DeiTConfig
def __init__(
self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
) -> None:
super().__init__(**kwargs)
# Store the configuration
self.config = config
# Embedding layer and Transformer encoder
self.embeddings = TFDeiTEmbeddings(config, use_mask_token=use_mask_token, name="embeddings")
self.encoder = TFDeiTEncoder(config, name="encoder")
# Final layer normalization
self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# Optional pooler on top of the encoder output
self.pooler = TFDeiTPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self) -> TFDeiTPatchEmbeddings:
# 返回嵌入层的补丁嵌入
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# Head pruning is not implemented for this model
raise NotImplementedError
def get_head_mask(self, head_mask):
# Raise if an explicit head mask is passed, since head masking is not implemented
if head_mask is not None:
raise NotImplementedError
else:
# Otherwise build a list with one None entry per hidden layer
head_mask = [None] * self.config.num_hidden_layers
# Return the (possibly newly created) head mask
return head_mask
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor | None = None,
bool_masked_pos: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor, ...]]:
# Use the config default when output_attentions is not given
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Use the config default when output_hidden_states is not given
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Use the config default when return_dict is not given
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# pixel_values must be provided
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# Transpose pixel_values from NCHW to NHWC, the format expected by Keras layers
pixel_values = tf.transpose(pixel_values, (0, 2, 3, 1))
# Prepare the head mask (only the all-None case is supported)
head_mask = self.get_head_mask(head_mask)
# Compute the patch (and optional mask token) embeddings
embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos, training=training)
# Run the Transformer encoder
encoder_outputs = self.encoder(
embedding_output,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Sequence output of the encoder
sequence_output = encoder_outputs[0]
# Apply the final layer normalization
sequence_output = self.layernorm(sequence_output, training=training)
# Pool the sequence output if a pooler is present
pooled_output = self.pooler(sequence_output, training=training) if self.pooler is not None else None
# Without return_dict, return a tuple of the head outputs plus any extra encoder outputs
if not return_dict:
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
return head_outputs + encoder_outputs[1:]
# Otherwise return a TFBaseModelOutputWithPooling
return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
# Return immediately if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# Build the embeddings if present
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# Build the encoder if present
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# Build the layer normalization if present, with input shape [None, None, self.config.hidden_size]
if getattr(self, "layernorm", None) is not None:
with tf.name_scope(self.layernorm.name):
self.layernorm.build([None, None, self.config.hidden_size])
# Build the pooler if present
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
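# Small self-contained check of the NCHW -> NHWC transpose performed in TFDeiTMainLayer.call above
# (the dummy tensor is purely illustrative):
import tensorflow as tf

dummy_pixel_values = tf.zeros((2, 3, 224, 224))              # (batch, channels, height, width)
dummy_nhwc = tf.transpose(dummy_pixel_values, (0, 2, 3, 1))  # (batch, height, width, channels)
print(dummy_nhwc.shape)                                      # (2, 224, 224, 3)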
# Copied from transformers.models.vit.modeling_tf_vit.TFViTPreTrainedModel with ViT->DeiT all-casing
class TFDeiTPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Configuration class for this model
config_class = DeiTConfig
# Prefix of the base model's weights
base_model_prefix = "deit"
# Name of the main model input
main_input_name = "pixel_values"
# Docstring describing the bare model (DEIT_START_DOCSTRING)
DEIT_START_DOCSTRING = r"""
This model is a TensorFlow
[keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular
TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior.
Parameters:
config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring describing the model inputs (DEIT_INPUTS_DOCSTRING)
DEIT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`DeiTImageProcessor.__call__`] for details.
head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare DeiT Model transformer outputting raw hidden-states without any specific head on top.",
DEIT_START_DOCSTRING,
)
# The bare DeiT model, wrapping TFDeiTMainLayer
class TFDeiTModel(TFDeiTPreTrainedModel):
def __init__(
self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
) -> None:
# Initialize the parent class
super().__init__(config, **kwargs)
# The TFDeiTMainLayer instance holds the actual model
self.deit = TFDeiTMainLayer(
config, add_pooling_layer=add_pooling_layer, use_mask_token=use_mask_token, name="deit"
)
# Decorators: unpack inputs and attach the forward and code-sample docstrings
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
# Forward method of the model
def call(
self,
pixel_values: tf.Tensor | None = None,  # input pixel values, may be None
bool_masked_pos: tf.Tensor | None = None,  # boolean mask positions for masked image modeling, may be None
head_mask: tf.Tensor | None = None,  # head mask, may be None
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a plain tuple
training: bool = False,  # whether the model is in training mode
) -> Union[Tuple, TFBaseModelOutputWithPooling]:
# Delegate everything to the main layer and return its output
outputs = self.deit(
pixel_values=pixel_values,
bool_masked_pos=bool_masked_pos,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
# Build the model's sub-layers
def build(self, input_shape=None):
# Return immediately if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# Build the main layer under its own name scope
if getattr(self, "deit", None) is not None:
with tf.name_scope(self.deit.name):
self.deit.build(None)
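# Minimal usage sketch for TFDeiTModel, mirroring the code-sample docstring attached above
# (the checkpoint name and the image URL are the usual documentation examples):
from transformers import AutoImageProcessor, TFDeiTModel
from PIL import Image
import requests

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
model = TFDeiTModel.from_pretrained("facebook/deit-base-distilled-patch16-224")
outputs = model(**processor(images=image, return_tensors="tf"))
print(outputs.last_hidden_state.shape)  # (1, 198, 768): 196 patches + [CLS] + distillation token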
# Copied from transformers.models.vit.modeling_tf_vit.TFViTPooler with ViT->DeiT
class TFDeiTPooler(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,用于池化模型,输出维度为 config.hidden_size
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# 通过简单地选择第一个 token 的隐藏状态来“池化”模型
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果已经构建,直接返回;否则,按照指定的输入形状构建 dense 层
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFDeitPixelShuffle(keras.layers.Layer):
"""TF 实现的 torch.nn.PixelShuffle 的层"""
def __init__(self, upscale_factor: int, **kwargs) -> None:
super().__init__(**kwargs)
if not isinstance(upscale_factor, int) or upscale_factor < 2:
raise ValueError(f"upscale_factor 必须是大于等于 2 的整数,当前值为 {upscale_factor}")
self.upscale_factor = upscale_factor
def call(self, x: tf.Tensor) -> tf.Tensor:
hidden_states = x
batch_size, _, _, num_input_channels = shape_list(hidden_states)
block_size_squared = self.upscale_factor**2
output_depth = int(num_input_channels / block_size_squared)
# 计算输出通道数时,PyTorch 的 PixelShuffle 和 TF 的 depth_to_space 在输出上存在差异,
# 因为通道的选择顺序会导致组合顺序不同,详情参考:
# https://stackoverflow.com/questions/68272502/tf-depth-to-space-not-same-as-torchs-pixelshuffle-when-output-channels-1
permutation = tf.constant(
[[i + j * block_size_squared for i in range(block_size_squared) for j in range(output_depth)]]
)
# 使用 permutation 重新组合隐藏状态张量的通道
hidden_states = tf.gather(params=hidden_states, indices=tf.tile(permutation, [batch_size, 1]), batch_dims=-1)
# 使用 TF 的 depth_to_space 函数进行像素洗牌操作
hidden_states = tf.nn.depth_to_space(hidden_states, block_size=self.upscale_factor, data_format="NHWC")
return hidden_states
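# Shape sanity check for the depth_to_space call above: an NHWC tensor of shape
# (N, H, W, C * r**2) becomes (N, H * r, W * r, C); the gather/permutation beforehand only
# reorders channels so the result matches torch.nn.PixelShuffle's channel ordering.
# Illustrative numbers, assuming upscale_factor r = 4:
x_example = tf.random.normal((1, 14, 14, 768))                                 # 768 = 48 * 4**2
y_example = tf.nn.depth_to_space(x_example, block_size=4, data_format="NHWC")
print(y_example.shape)                                                         # (1, 56, 56, 48)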
class TFDeitDecoder(keras.layers.Layer):
def __init__(self, config: DeiTConfig, **kwargs) -> None:
super().__init__(**kwargs)
# 1x1 convolution producing encoder_stride**2 * num_channels output channels
self.conv2d = keras.layers.Conv2D(
filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, name="0"
)
# Pixel shuffle layer used by the decoder to upsample back to the image resolution
self.pixel_shuffle = TFDeitPixelShuffle(config.encoder_stride, name="1")
self.config = config
# 定义一个方法用于调用模型,接受一个张量作为输入,并可选择是否进行训练
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
# 将输入张量赋给隐藏状态变量
hidden_states = inputs
# 对隐藏状态进行二维卷积操作
hidden_states = self.conv2d(hidden_states)
# 对卷积后的结果进行像素重排操作
hidden_states = self.pixel_shuffle(hidden_states)
# 返回处理后的隐藏状态
return hidden_states
# 构建模型的方法
def build(self, input_shape=None):
# 如果模型已经构建过,则直接返回
if self.built:
return
# 设置模型已构建标志为True
self.built = True
# 如果模型具有conv2d属性,则构建conv2d层
if getattr(self, "conv2d", None) is not None:
# 在TensorFlow中使用name_scope管理命名空间
with tf.name_scope(self.conv2d.name):
# 构建conv2d层,指定输入的形状为[None, None, None, self.config.hidden_size]
self.conv2d.build([None, None, None, self.config.hidden_size])
# 如果模型具有pixel_shuffle属性,则构建pixel_shuffle层
if getattr(self, "pixel_shuffle", None) is not None:
# 在TensorFlow中使用name_scope管理命名空间
with tf.name_scope(self.pixel_shuffle.name):
# 构建pixel_shuffle层,不指定具体的输入形状
self.pixel_shuffle.build(None)
@add_start_docstrings(
"""
DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).
""",
DEIT_START_DOCSTRING,
)
class TFDeiTForMaskedImageModeling(TFDeiTPreTrainedModel):
def __init__(self, config: DeiTConfig) -> None:
super().__init__(config)
# DeiT main layer without pooling and with the mask token enabled, named "deit"
self.deit = TFDeiTMainLayer(config, add_pooling_layer=False, use_mask_token=True, name="deit")
# Decoder that reconstructs pixels from the encoder output, named "decoder"
self.decoder = TFDeitDecoder(config, name="decoder")
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor | None = None,
bool_masked_pos: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
"""
DeiT模型的前向传播方法,接受像素值、掩码位置、头部掩码等参数。
"""
# 省略了具体的前向传播逻辑,由于不在要求内,不能提供更多细节。
def build(self, input_shape=None):
"""
Build the model, making sure both the DeiT main layer and the decoder are built.
"""
if self.built:
return
self.built = True
if getattr(self, "deit", None) is not None:
with tf.name_scope(self.deit.name):
self.deit.build(None)
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
@add_start_docstrings(
"""
DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token), e.g. for ImageNet.
""",
DEIT_START_DOCSTRING,
)
class TFDeiTForImageClassification(TFDeiTPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: DeiTConfig):
super().__init__(config)
# Number of classification labels
self.num_labels = config.num_labels
# DeiT main layer without a pooling layer, named "deit"
self.deit = TFDeiTMainLayer(config, add_pooling_layer=False, name="deit")
# Classifier head
self.classifier = (
# a dense layer named "classifier" when there are labels to predict
keras.layers.Dense(config.num_labels, name="classifier")
if config.num_labels > 0
# otherwise a no-op linear activation, also named "classifier"
else keras.layers.Activation("linear", name="classifier")
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
"""
DeiT模型的前向传播方法,接受像素值、头部掩码、标签等参数。
"""
# 省略了具体的前向传播逻辑,由于不在要求内,不能提供更多细节。
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 设置返回字典的标志,如果未提供则使用模型配置中的默认设置
outputs = self.deit(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 使用图像编码器(self.deit)处理像素值,可选地应用头部遮罩、输出注意力和隐藏状态,根据return_dict参数返回结果
sequence_output = outputs[0]
# 获取模型输出中的序列输出(通常是最后一层的输出)
logits = self.classifier(sequence_output[:, 0, :])
# 使用分类器(self.classifier)计算逻辑回归,通常使用序列输出的第一个位置的信息
# 不使用蒸馏令牌
loss = None if labels is None else self.hf_compute_loss(labels, logits)
# 如果提供了标签,则使用标签和逻辑回归计算损失,否则损失为None
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# 如果不要求返回字典形式的结果,则返回一个元组,包含逻辑回归和其它可能的输出
return TFImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# 返回一个TFImageClassifierOutput对象,包含损失、逻辑回归、隐藏状态和注意力信息
def build(self, input_shape=None):
# Return immediately if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# Build the "deit" sub-layer if present
if getattr(self, "deit", None) is not None:
with tf.name_scope(self.deit.name):
self.deit.build(None)
# Build the "classifier" head if present, with input shape [None, None, self.config.hidden_size]
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
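# Minimal usage sketch for TFDeiTForImageClassification (checkpoint and image are the usual
# documentation examples; note that the head only uses the [CLS] token, as the call above shows):
import tensorflow as tf
from transformers import AutoImageProcessor, TFDeiTForImageClassification
from PIL import Image
import requests

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
model = TFDeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")
logits = model(**processor(images=image, return_tensors="tf")).logits
predicted_class_idx = int(tf.math.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_class_idx])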
# Decorator adding the class docstring, which describes the model and its inference-only warning
@add_start_docstrings(
"""
DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.
.. warning::
This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
supported.
""",
DEIT_START_DOCSTRING,
)
class TFDeiTForImageClassificationWithTeacher(TFDeiTPreTrainedModel):
def __init__(self, config: DeiTConfig) -> None:
# Initialize the parent class with the configuration
super().__init__(config)
# Number of classification labels
self.num_labels = config.num_labels
# DeiT main layer without a pooling layer
self.deit = TFDeiTMainLayer(config, add_pooling_layer=False, name="deit")
# Classifier heads:
# a dense layer when there are labels to predict, otherwise a no-op linear activation
self.cls_classifier = (
keras.layers.Dense(config.num_labels, name="cls_classifier")
if config.num_labels > 0
else keras.layers.Activation("linear", name="cls_classifier")
)
# Same pattern for the distillation classifier head
self.distillation_classifier = (
keras.layers.Dense(config.num_labels, name="distillation_classifier")
if config.num_labels > 0
else keras.layers.Activation("linear", name="distillation_classifier")
)
# Keep a reference to the configuration
self.config = config
# Decorators document the inputs and attach a code sample for the forward method
@unpack_inputs
@add_start_docstrings_to_model_forward(DEIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFDeiTForImageClassificationWithTeacherOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
# Forward pass: pixel values, optional head mask, flags for attentions/hidden states,
# return_dict and the training mode
def call(
self,
pixel_values: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[tuple, TFDeiTForImageClassificationWithTeacherOutput]:
# Fall back to the config default when return_dict is not given
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the DeiT encoder
outputs = self.deit(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Sequence output of the encoder
sequence_output = outputs[0]
# Classification head on the [CLS] token (position 0)
cls_logits = self.cls_classifier(sequence_output[:, 0, :])
# Distillation head on the distillation token (position 1)
distillation_logits = self.distillation_classifier(sequence_output[:, 1, :])
# At inference time the final prediction is the average of both heads
logits = (cls_logits + distillation_logits) / 2
# Without return_dict, return a plain tuple with all logits and extra encoder outputs
if not return_dict:
output = (logits, cls_logits, distillation_logits) + outputs[1:]
return output
# Otherwise return a TFDeiTForImageClassificationWithTeacherOutput with the averaged logits,
# both heads' logits, hidden states and attentions
return TFDeiTForImageClassificationWithTeacherOutput(
logits=logits,
cls_logits=cls_logits,
distillation_logits=distillation_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# Build the model's sub-components
def build(self, input_shape=None):
# Return immediately if the model has already been built
if self.built:
return
# Mark the model as built
self.built = True
# Build the DeiT encoder if present
if getattr(self, "deit", None) is not None:
with tf.name_scope(self.deit.name):
self.deit.build(None)
# Build the classification head if present
if getattr(self, "cls_classifier", None) is not None:
with tf.name_scope(self.cls_classifier.name):
self.cls_classifier.build([None, None, self.config.hidden_size])
# Build the distillation head if present
if getattr(self, "distillation_classifier", None) is not None:
with tf.name_scope(self.distillation_classifier.name):
self.distillation_classifier.build([None, None, self.config.hidden_size])
.\models\deit\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)
_import_structure = {"configuration_deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig", "DeiTOnnxConfig"]}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_deit"] = ["DeiTFeatureExtractor"]
_import_structure["image_processing_deit"] = ["DeiTImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_deit"] = [
"DEIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"DeiTForImageClassification",
"DeiTForImageClassificationWithTeacher",
"DeiTForMaskedImageModeling",
"DeiTModel",
"DeiTPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_deit"] = [
"TF_DEIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFDeiTForImageClassification",
"TFDeiTForImageClassificationWithTeacher",
"TFDeiTForMaskedImageModeling",
"TFDeiTModel",
"TFDeiTPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig, DeiTOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_deit import DeiTFeatureExtractor
from .image_processing_deit import DeiTImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_deit import (
DEIT_PRETRAINED_MODEL_ARCHIVE_LIST,
DeiTForImageClassification,
DeiTForImageClassificationWithTeacher,
DeiTForMaskedImageModeling,
DeiTModel,
DeiTPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_deit import (
TF_DEIT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFDeiTForImageClassification,
TFDeiTForImageClassificationWithTeacher,
TFDeiTForMaskedImageModeling,
TFDeiTModel,
TFDeiTPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\bort\convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
import argparse
import os
import gluonnlp as nlp
import mxnet as mx
import numpy as np
import torch
from gluonnlp.base import get_home_dir
from gluonnlp.model.bert import BERTEncoder
from gluonnlp.model.utils import _load_vocab
from gluonnlp.vocab import Vocab
from packaging import version
from torch import nn
from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer
from transformers.models.bert.modeling_bert import (
BertIntermediate,
BertLayer,
BertOutput,
BertSelfAttention,
BertSelfOutput,
)
from transformers.utils import logging
if version.parse(nlp.__version__) != version.parse("0.8.3"):
raise Exception("requires gluonnlp == 0.8.3")
if version.parse(mx.__version__) != version.parse("1.5.0"):
raise Exception("requires mxnet == 1.5.0")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "The Nymphenburg Palace is a beautiful palace in Munich!"
def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str):
"""
Convert the original Bort checkpoint (based on MXNET and Gluonnlp) to our BERT structure.
"""
bort_4_8_768_1024_hparams = {
"attention_cell": "multi_head",
"num_layers": 4,
"units": 1024,
"hidden_size": 768,
"max_length": 512,
"num_heads": 8,
"scaled": True,
"dropout": 0.1,
"use_residual": True,
"embed_size": 1024,
"embed_dropout": 0.1,
"word_embed": None,
"layer_norm_eps": 1e-5,
"token_type_vocab_size": 2,
}
predefined_args = bort_4_8_768_1024_hparams
encoder = BERTEncoder(
attention_cell=predefined_args["attention_cell"],
num_layers=predefined_args["num_layers"],
units=predefined_args["units"],
hidden_size=predefined_args["hidden_size"],
max_length=predefined_args["max_length"],
num_heads=predefined_args["num_heads"],
scaled=predefined_args["scaled"],
dropout=predefined_args["dropout"],
output_attention=False,
output_all_encodings=False,
use_residual=predefined_args["use_residual"],
activation=predefined_args.get("activation", "gelu"),
layer_norm_eps=predefined_args.get("layer_norm_eps", None),
)
vocab_name = "openwebtext_ccnews_stories_books_cased"
gluon_cache_dir = os.path.join(get_home_dir(), "models")
bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab)
original_bort = nlp.model.BERTModel(
encoder,
len(bort_vocab),
units=predefined_args["units"],
embed_size=predefined_args["embed_size"],
embed_dropout=predefined_args["embed_dropout"],
word_embed=predefined_args["word_embed"],
use_pooler=False,
use_token_type_embed=False,
token_type_vocab_size=predefined_args["token_type_vocab_size"],
use_classifier=False,
use_decoder=False,
)
original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True)
params = original_bort._collect_params_with_prefix()
hf_bort_config_json = {
"architectures": ["BertForMaskedLM"],
"attention_probs_dropout_prob": predefined_args["dropout"],
"hidden_act": "gelu",
"hidden_dropout_prob": predefined_args["dropout"],
"hidden_size": predefined_args["embed_size"],
"initializer_range": 0.02,
"intermediate_size": predefined_args["hidden_size"],
"layer_norm_eps": predefined_args["layer_norm_eps"],
"max_position_embeddings": predefined_args["max_length"],
"model_type": "bort",
"num_attention_heads": predefined_args["num_heads"],
"num_hidden_layers": predefined_args["num_layers"],
"pad_token_id": 1,
"type_vocab_size": 1,
"vocab_size": len(bort_vocab),
}
hf_bort_config = BertConfig.from_dict(hf_bort_config_json)
hf_bort_model = BertForMaskedLM(hf_bort_config)
hf_bort_model.eval()
def to_torch(mx_array) -> nn.Parameter:
return nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy()))
def check_and_map_params(hf_param, gluon_param):
shape_hf = hf_param.shape
gluon_param = to_torch(params[gluon_param])
shape_gluon = gluon_param.shape
assert (
shape_hf == shape_gluon
), f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
return gluon_param
hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params(
hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight"
)
hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params(
hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight"
)
hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params(
hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta"
)
hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params(
hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma"
)
hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
hf_bort_model.bert.embeddings.token_type_embeddings.weight.data
)
hf_bort_model.half()
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"]
gluon_input_ids = mx.nd.array([input_ids])
output_gluon = original_bort(inputs=gluon_input_ids, token_types=[])
hf_bort_model.save_pretrained(pytorch_dump_folder_path)
hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path)
hf_bort_model.eval()
input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt")
output_hf = hf_bort_model(**input_ids)[0]
gluon_layer = output_gluon[0].asnumpy()
hf_layer = output_hf[0].detach().numpy()
max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item()
success = np.allclose(gluon_layer, hf_layer, atol=1e-3)
if success:
print("✔️ Both model do output the same tensors")
else:
print("❌ Both model do **NOT** output the same tensors")
print("Absolute difference is:", max_absolute_diff)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--bort_checkpoint_path", default=None, type=str, required=True, help="Path the official Bort params file."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_bort_checkpoint_to_pytorch(args.bort_checkpoint_path, args.pytorch_dump_folder_path)
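# Hypothetical invocation of the conversion above (both paths are placeholders):
# convert_bort_checkpoint_to_pytorch("bort_4_8_768_1024.params", "./bort-pytorch-dump")
# Equivalently, the script is run from the command line with the two required arguments declared
# by the argparse block above, --bort_checkpoint_path and --pytorch_dump_folder_path.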
.\models\deprecated\bort\__init__.py
import os
import shutil
def copy_all_files(src_dir, dst_dir):
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
for item in os.listdir(src_dir):
src = os.path.join(src_dir, item)
dst = os.path.join(dst_dir, item)
if os.path.isfile(src):
shutil.copy(src, dst)
elif os.path.isdir(src):
copy_all_files(src, dst)
.\models\deprecated\mctct\configuration_mctct.py
from ....configuration_utils import PretrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"speechbrain/m-ctc-t-large": "https://huggingface.co/speechbrain/m-ctc-t-large/resolve/main/config.json",
}
class MCTCTConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MCTCTModel`]. It is used to instantiate an
M-CTC-T model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the M-CTC-T
[speechbrain/m-ctc-t-large](https://huggingface.co/speechbrain/m-ctc-t-large) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import MCTCTConfig, MCTCTModel
>>> # Initializing a M-CTC-T mctct-large style configuration
>>> configuration = MCTCTConfig()
>>> # Initializing a model (with random weights) from the mctct-large style configuration
>>> model = MCTCTModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "mctct"
def __init__(
self,
vocab_size=8065,
hidden_size=1536,
num_hidden_layers=36,
intermediate_size=6144,
num_attention_heads=4,
attention_head_dim=384,
max_position_embeddings=920,
layer_norm_eps=1e-5,
layerdrop=0.3,
hidden_act="relu",
initializer_range=0.02,
hidden_dropout_prob=0.3,
attention_probs_dropout_prob=0.3,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
conv_glu_dim=1,
conv_dropout=0.3,
num_conv_layers=1,
conv_kernel=(7,),
conv_stride=(3,),
input_feat_per_channel=80,
input_channels=1,
conv_channels=None,
ctc_loss_reduction="sum",
ctc_zero_infinity=False,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.layerdrop = layerdrop
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.conv_glu_dim = conv_glu_dim
self.conv_dropout = conv_dropout
self.num_conv_layers = num_conv_layers
self.input_feat_per_channel = input_feat_per_channel
self.input_channels = input_channels
self.conv_channels = conv_channels
self.ctc_loss_reduction = ctc_loss_reduction
self.ctc_zero_infinity = ctc_zero_infinity
self.conv_kernel = list(conv_kernel)
self.conv_stride = list(conv_stride)
if len(self.conv_kernel) != self.num_conv_layers:
raise ValueError(
"Configuration for convolutional module is incorrect. "
"It is required that `len(config.conv_kernel)` == `config.num_conv_layers` "
f"but is `len(config.conv_kernel) = {len(self.conv_kernel)}`, "
f"`config.num_conv_layers = {self.num_conv_layers}`."
)
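# Quick illustration of the consistency check above (assuming a transformers version that still
# ships the deprecated MCTCT module):
from transformers import MCTCTConfig

MCTCTConfig(num_conv_layers=1, conv_kernel=(7,), conv_stride=(3,))      # passes the check
try:
    MCTCTConfig(num_conv_layers=2, conv_kernel=(7,), conv_stride=(3,))  # len(conv_kernel) != num_conv_layers
except ValueError as err:
    print(err)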
.\models\deprecated\mctct\feature_extraction_mctct.py
"""
Feature extractor class for M-CTC-T
"""
from typing import List, Optional, Union
import numpy as np
from ....audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
from ....feature_extraction_sequence_utils import SequenceFeatureExtractor
from ....feature_extraction_utils import BatchFeature
from ....file_utils import PaddingStrategy, TensorType
from ....utils import logging
logger = logging.get_logger(__name__)
class MCTCTFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a M-CTC-T feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods. This
code has been adapted from Flashlight's C++ code. For more information about the implementation, one can refer to
this [notebook](https://colab.research.google.com/drive/1GLtINkkhzms-IsdcGy_-tVCkv0qNF-Gt#scrollTo=pMCRGMmUC_an)
that takes the user step-by-step in the implementation.
"""
"""
Args:
feature_size (`int`, defaults to 80):
The feature dimension of the extracted features. This is the number of mel_frequency
coefficients to compute per frame.
sampling_rate (`int`, defaults to 16000):
The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
padding_value (`float`, defaults to 0.0):
The value used to fill the padding frames.
hop_length (`int`, defaults to 10):
Number of audio samples between consecutive frames.
win_length (`int`, defaults to 25):
Length of the window function applied to each frame, in milliseconds.
win_function (`str`, defaults to `"hamming_window"`):
Name for the window function used for windowing, must be accessible via `torch.{win_function}`.
frame_signal_scale (`float`, defaults to 32768.0):
Scaling factor applied to the audio frames before applying Discrete Fourier Transform (DFT).
preemphasis_coeff (`float`, defaults to 0.97):
Coefficient applied in pre-emphasis filtering of audio signals before DFT.
mel_floor (`float`, defaults to 1.0):
Minimum value enforced for mel frequency bank values.
normalize_means (`bool`, *optional*, defaults to `True`):
Whether to zero-mean normalize the extracted features.
normalize_vars (`bool`, *optional*, defaults to `True`):
Whether to unit-variance normalize the extracted features.
"""
model_input_names = ["input_features", "attention_mask"]
def __init__(
self,
feature_size=80,
sampling_rate=16000,
padding_value=0.0,
hop_length=10,
win_length=25,
win_function="hamming_window",
frame_signal_scale=32768.0,
preemphasis_coeff=0.97,
mel_floor=1.0,
normalize_means=True,
normalize_vars=True,
return_attention_mask=False,
**kwargs,
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.feature_size = feature_size
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.hop_length = hop_length
self.win_length = win_length
self.frame_signal_scale = frame_signal_scale
self.preemphasis_coeff = preemphasis_coeff
self.mel_floor = mel_floor
self.normalize_means = normalize_means
self.normalize_vars = normalize_vars
self.win_function = win_function
self.return_attention_mask = return_attention_mask
self.sample_size = win_length * sampling_rate // 1000
self.sample_stride = hop_length * sampling_rate // 1000
self.n_fft = optimal_fft_length(self.sample_size)
self.n_freqs = (self.n_fft // 2) + 1
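# With the defaults above (sampling_rate=16000, win_length=25 ms, hop_length=10 ms):
#   sample_size   = 25 * 16000 // 1000 = 400 samples per analysis window
#   sample_stride = 10 * 16000 // 1000 = 160 samples between consecutive frames
#   n_fft         = optimal_fft_length(400) = 512 (next power of two >= 400)
#   n_freqs       = 512 // 2 + 1 = 257 frequency bins per frame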
def _extract_mfsc_features(self, one_waveform: np.array) -> np.ndarray:
if self.win_function == "hamming_window":
window = window_function(window_length=self.sample_size, name=self.win_function, periodic=False)
else:
window = window_function(window_length=self.sample_size, name=self.win_function)
fbanks = mel_filter_bank(
num_frequency_bins=self.n_freqs,
num_mel_filters=self.feature_size,
min_frequency=0.0,
max_frequency=self.sampling_rate / 2.0,
sampling_rate=self.sampling_rate,
)
msfc_features = spectrogram(
one_waveform * self.frame_signal_scale,
window=window,
frame_length=self.sample_size,
hop_length=self.sample_stride,
fft_length=self.n_fft,
center=False,
preemphasis=self.preemphasis_coeff,
mel_filters=fbanks,
mel_floor=self.mel_floor,
log_mel="log",
)
return msfc_features.T
def _normalize_one(self, x, input_length, padding_value):
if self.normalize_means:
mean = x[:input_length].mean(axis=0)
x = np.subtract(x, mean)
if self.normalize_vars:
std = x[:input_length].std(axis=0)
x = np.divide(x, std)
if input_length < x.shape[0]:
x[input_length:] = padding_value
x = x.astype(np.float32)
return x
def normalize(
self, input_features: List[np.ndarray], attention_mask: Optional[np.ndarray] = None
) -> List[np.ndarray]:
lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
return [self._normalize_one(x, n, self.padding_value) for x, n in zip(input_features, lengths)]
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
**kwargs,
):
.\models\deprecated\mctct\modeling_mctct.py
""" PyTorch M-CTC-T 模型。"""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ....activations import ACT2FN
from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ....integrations.deepspeed import is_deepspeed_zero3_enabled
from ....modeling_attn_mask_utils import _prepare_4d_attention_mask
from ....modeling_outputs import BaseModelOutput, CausalLMOutput
from ....modeling_utils import (
PreTrainedModel,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from ....utils import logging
from .configuration_mctct import MCTCTConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 1
_CONFIG_FOR_DOC = "MCTCTConfig"
_CHECKPOINT_FOR_DOC = "speechbrain/m-ctc-t-large"
_EXPECTED_OUTPUT_SHAPE = [1, 195, 1536]
_CTC_EXPECTED_OUTPUT = '"Mr. Quilter is the apostle of the middle classes, and we\'re glad to welcome his gospel."'
_CTC_EXPECTED_LOSS = 1885.65
MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"speechbrain/m-ctc-t-large",
]
class MCTCTConv1dSubsampler(nn.Module):
"""
Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
via gated linear units (https://arxiv.org/abs/1911.08460)
"""
def __init__(self, config):
super().__init__()
self.config = config
self.glu_dim = config.conv_glu_dim
self.dropout = nn.Dropout(config.conv_dropout)
self.num_layers = config.num_conv_layers
self.in_channels = config.input_feat_per_channel * config.input_channels
if self.num_layers > 1:
if config.conv_channels is None:
raise ValueError(
"Need to specify `conv_channels` configuration in `MCTCTConfig` to use multiple convolution"
" layers."
)
self.mid_channels = config.conv_channels
else:
self.mid_channels = None
self.out_channels = config.hidden_size * 2
self.kernel_size = config.conv_kernel
self.stride = config.conv_stride
self.conv_layers = nn.ModuleList(
nn.Conv1d(
self.in_channels if i == 0 else self.mid_channels[i-1],
self.mid_channels[i] if i < self.num_layers - 1 else self.out_channels,
kernel_size=k,
stride=self.stride[i],
padding="valid",
)
for i, k in enumerate(self.kernel_size)
)
def forward(self, input_features):
padding = sum([size // 2 for size in self.kernel_size])
input_features = torch.nn.functional.pad(input_features, (0, 0, padding, padding), "constant", 0)
hidden_states = input_features.transpose(1, 2).contiguous()
for conv in self.conv_layers:
hidden_states = conv(hidden_states)
hidden_states = nn.functional.glu(hidden_states, dim=self.glu_dim)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states.transpose(1, 2).contiguous()
return hidden_states
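# The GLU halves the channel dimension: the last Conv1d produces hidden_size * 2 channels and
# nn.functional.glu gates one half with the sigmoid of the other. Tiny illustration with the
# default hidden_size of 1536 (values are arbitrary):
_glu_example = torch.randn(2, 3072, 50)                # (batch, channels = hidden_size * 2, frames)
print(nn.functional.glu(_glu_example, dim=1).shape)    # torch.Size([2, 1536, 50])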
class MCTCTEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = MCTCTLayerNorm()
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
persistent=False,
)
def forward(
self, input_features=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
):
input_shape = input_features.size() if input_features is not None else inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_features)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class MCTCTSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = config.attention_head_dim
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def reshape_fortran(self, x, shape):
if len(x.shape) > 0:
x = x.permute(*reversed(range(len(x.shape))))
return x.reshape(*reversed(shape)).permute(*reversed(range(len(shape))))
def relative_position_embedding_rotate(self, scores):
scores = scores.permute(0, 2, 3, 1)
batch, hidden_state, seq_len, heads = scores.shape
scores = torch.cat((scores, torch.zeros((batch, seq_len, seq_len, heads), device=scores.device)), dim=1)
scores = self.reshape_fortran(scores, [batch, (hidden_state + seq_len) * seq_len, 1, heads])
scores = scores[:, : (seq_len + hidden_state - 1) * seq_len]
scores = self.reshape_fortran(scores, [batch, hidden_state + seq_len - 1, seq_len, heads])
halfpoint = hidden_state // 2
scores = scores[:, halfpoint : halfpoint + seq_len].transpose(1, 2)
return scores.permute(0, 3, 1, 2)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
mixed_query_layer = self.query(hidden_states)
mixed_query_layer = mixed_query_layer / math.sqrt(self.attention_head_size)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
positional_embedding = self.distance_embedding.weight
relative_position_scores = torch.einsum("lh, bche -> bcle", positional_embedding, query_layer.transpose(2, 3))
relative_position_scores = self.relative_position_embedding_rotate(relative_position_scores)
attention_scores = attention_scores + relative_position_scores
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).flatten(start_dim=-2)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class MCTCTLayerNorm(nn.Module):
def __init__(self):
super().__init__()
self.singleton_weight = nn.Parameter(torch.ones(1))
self.singleton_bias = nn.Parameter(torch.zeros(1))
def forward(self, hidden_states):
return (hidden_states * self.singleton_weight) + self.singleton_bias
class MCTCTSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MCTCTAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = MCTCTSelfAttention(config)
self.output = MCTCTSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class MCTCTIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class MCTCTOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MCTCTLayer(nn.Module):
def __init__(self, config: MCTCTConfig):
super().__init__()
self.seq_len_dim = 1
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.intermediate = MCTCTIntermediate(config)
self.attention = MCTCTAttention(config)
self.is_decoder = config.is_decoder
self.output = MCTCTOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
):
self_attention_outputs = self.attention(
hidden_states, attention_mask, head_mask, output_attentions=output_attentions
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
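`MCTCTLayer` routes the feed-forward part through `apply_chunking_to_forward`: when `config.chunk_size_feed_forward` is non-zero, the sequence dimension (`seq_len_dim = 1`) is processed in slices and the results are concatenated, which lowers peak memory and is numerically equivalent to one full-width call. A simplified standalone sketch of the idea (not the library helper itself):

import torch

def chunked_forward(forward_fn, chunk_size, seq_dim, tensor):
    # Run forward_fn on slices along seq_dim and stitch the outputs back together.
    if chunk_size == 0:
        return forward_fn(tensor)
    chunks = tensor.split(chunk_size, dim=seq_dim)
    return torch.cat([forward_fn(chunk) for chunk in chunks], dim=seq_dim)

ff = torch.nn.Linear(8, 8)   # stand-in for the intermediate + output blocks
x = torch.randn(2, 12, 8)
print(torch.allclose(ff(x), chunked_forward(ff, chunk_size=4, seq_dim=1, tensor=x), atol=1e-6))  # True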
class MCTCTPreTrainedModel(PreTrainedModel):
"""
An abstract class that handles weights initialization and provides a simple interface for downloading and loading pretrained models.
"""
config_class = MCTCTConfig
base_model_prefix = "mctct"
main_input_name = "input_features"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, MCTCTLayerNorm):
module.singleton_weight.data.fill_(1.0)
module.singleton_bias.data.zero_()
if isinstance(module, (nn.Linear, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
"""
Computes the output length of the convolutional layers
"""
dilation = 1
for _, kernel_sz, stride in zip(
range(self.config.num_conv_layers), self.config.conv_kernel, self.config.conv_stride
):
padding = kernel_sz // 2
input_lengths = input_lengths + 2 * padding - dilation * (kernel_sz - 1) - 1
input_lengths = torch.div(input_lengths, stride, rounding_mode="trunc") + 1
return input_lengths
def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
bsz = attention_mask.size()[0]
attention_mask = torch.zeros(
(bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long()
return attention_mask
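The two helpers above translate a frame-level attention mask into the subsampled feature space. `_get_feat_extract_output_lengths` applies the usual convolution output-length formula, floor((L + 2*padding - dilation*(kernel - 1) - 1) / stride) + 1, once per conv layer, and `_get_feature_vector_attention_mask` rebuilds a binary mask from those lengths with a one-hot / flip / cumsum / flip trick. A numeric sketch with made-up kernel and stride values:

import torch

def conv_output_length(length, kernel_sz=7, stride=3, dilation=1):
    # Same formula as _get_feat_extract_output_lengths, for a single layer.
    padding = kernel_sz // 2
    length = length + 2 * padding - dilation * (kernel_sz - 1) - 1
    return torch.div(length, stride, rounding_mode="trunc") + 1

lengths = torch.tensor([30, 18])           # valid input frames per example
sub_lengths = conv_output_length(lengths)  # tensor([10, 6])

feature_vector_length = 10
mask = torch.zeros((2, feature_vector_length), dtype=torch.long)
mask[(torch.arange(2), sub_lengths - 1)] = 1   # mark the last valid feature position
mask = mask.flip([-1]).cumsum(-1).flip([-1])   # spread the 1 backwards to position 0
print(mask)
# tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
#         [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])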
MCTCT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`MCTCTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MCTCT_INPUTS_DOCSTRING = r"""
Args:
input_features (`torch.FloatTensor` of shape `({0}, feature_size)`):
Float values of the audio features extracted by [`MCTCTFeatureExtractor`]. See
[`MCTCTFeatureExtractor.__call__`] for details.
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
class MCTCTEncoder(MCTCTPreTrainedModel):
def __init__(self, config: MCTCTConfig):
super().__init__(config)
self.hidden_dropout_prob = config.hidden_dropout_prob
self.layer_norm = MCTCTLayerNorm()
self.conv = MCTCTConv1dSubsampler(config)
self.layers = nn.ModuleList([MCTCTLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
input_features: torch.Tensor,
attention_mask: torch.Tensor,
head_mask: torch.Tensor,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
"The bare M-CTC-T Model transformer outputting raw hidden-states without any specific head on top.",
MCTCT_START_DOCSTRING,
class MCTCTModel(MCTCTPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = MCTCTEncoder(config)
self.post_init()
@add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="audio",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_features: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_features is None:
raise ValueError("You have to specify input_features.")
encoder_outputs = self.encoder(
input_features,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"""MCTCT Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
MCTCT_START_DOCSTRING,
)
class MCTCTForCTC(MCTCTPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.mctct = MCTCTModel(config)
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `MCTCTForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
output_hidden_size = config.hidden_size
self.ctc_head = nn.Linear(output_hidden_size, config.vocab_size)
self.post_init()
@add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_CTC_EXPECTED_OUTPUT,
expected_loss=_CTC_EXPECTED_LOSS,
)
def forward(
self,
input_features: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
config.vocab_size - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mctct(
input_features,
attention_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
logits = self.ctc_head(hidden_states)
loss = None
if labels is not None:
if labels.max() >= self.config.vocab_size:
raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
attention_mask = (
attention_mask
if attention_mask is not None
else torch.ones(input_features.shape[:-1], dtype=torch.long)
)
input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
labels_mask = labels >= 0
target_lengths = labels_mask.sum(-1)
flattened_targets = labels.masked_select(labels_mask)
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
with torch.backends.cudnn.flags(enabled=False):
loss = nn.functional.ctc_loss(
log_probs,
flattened_targets,
input_lengths,
target_lengths,
blank=self.config.pad_token_id,
reduction=self.config.ctc_loss_reduction,
zero_infinity=self.config.ctc_zero_infinity,
)
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
.\models\deprecated\mctct\processing_mctct.py
"""
Speech processor class for M-CTC-T
"""
import warnings
from contextlib import contextmanager
from ....processing_utils import ProcessorMixin
class MCTCTProcessor(ProcessorMixin):
r"""
Constructs a MCTCT processor which wraps a MCTCT feature extractor and a MCTCT tokenizer into a single processor.
[`MCTCTProcessor`] offers all the functionalities of [`MCTCTFeatureExtractor`] and [`AutoTokenizer`]. See the
[`~MCTCTProcessor.__call__`] and [`~MCTCTProcessor.decode`] for more information.
Args:
feature_extractor (`MCTCTFeatureExtractor`):
An instance of [`MCTCTFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`AutoTokenizer`):
An instance of [`AutoTokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "MCTCTFeatureExtractor"
tokenizer_class = "AutoTokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
"""
当对象在正常模式下使用时,此方法将所有参数转发给 MCTCTFeatureExtractor 的 [`~MCTCTFeatureExtractor.__call__`] 并返回其输出。
如果在 [`~MCTCTProcessor.as_target_processor`] 上下文中使用,则将所有参数转发给 AutoTokenizer 的 [`~AutoTokenizer.__call__`]。
更多信息请参考上述两个方法的文档字符串。
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
if "raw_speech" in kwargs:
warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
audio = kwargs.pop("raw_speech")
else:
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if audio is None and text is None:
raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None:
inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to AutoTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def pad(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
[`~MCTCTFeatureExtractor.pad`] and returns its output. If used in the context
[`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
[`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
"""
if self._in_target_context_manager:
return self.current_processor.pad(*args, **kwargs)
input_features = kwargs.pop("input_features", None)
labels = kwargs.pop("labels", None)
if len(args) > 0:
input_features = args[0]
args = args[1:]
if input_features is not None:
input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
if labels is not None:
labels = self.tokenizer.pad(labels, **kwargs)
if labels is None:
return input_features
elif input_features is None:
return labels
else:
input_features["labels"] = labels["input_ids"]
return input_features
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to AutoTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT.
"""
warnings.warn(
"`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
"labels by using the argument `text` of the regular `__call__` method (either in the same call as "
"your audio inputs, or in a separate call."
)
self._in_target_context_manager = True
self.current_processor = self.tokenizer
yield
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
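For preparing training data, the current recommendation (see the deprecation warning above) is a single processor call that passes `audio` and `text` together rather than the `as_target_processor` context. A small sketch with an illustrative checkpoint name:

from transformers import MCTCTProcessor

processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large")  # example checkpoint

waveform = [0.0] * 16000  # placeholder audio, one second at 16 kHz
batch = processor(audio=waveform, text="hello world", sampling_rate=16000, return_tensors="pt")

# The feature extractor contributes `input_features` (and possibly `attention_mask`),
# while the tokenizer's input_ids are attached as `labels` for the CTC loss.
print(sorted(batch.keys()))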
.\models\deprecated\mctct\__init__.py
from typing import TYPE_CHECKING
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_mctct": ["MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MCTCTConfig"],
"feature_extraction_mctct": ["MCTCTFeatureExtractor"],
"processing_mctct": ["MCTCTProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mctct"] = [
"MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST",
"MCTCTForCTC",
"MCTCTModel",
"MCTCTPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mctct import MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP, MCTCTConfig
from .feature_extraction_mctct import MCTCTFeatureExtractor
from .processing_mctct import MCTCTProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mctct import MCTCT_PRETRAINED_MODEL_ARCHIVE_LIST, MCTCTForCTC, MCTCTModel, MCTCTPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\mmbt\configuration_mmbt.py
""" MMBT configuration"""
from ....utils import logging
logger = logging.get_logger(__name__)
class MMBTConfig(object):
"""
This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to instantiate a MMBT
model according to the specified arguments, defining the model architecture.
Args:
config ([`PretrainedConfig`]):
Config of the underlying Transformer models. Its values are copied over to use a single config.
num_labels (`int`, *optional*):
Size of final Linear layer for classification.
modal_hidden_size (`int`, *optional*, defaults to 2048):
Embedding dimension of the non-text modality encoder.
"""
def __init__(self, config, num_labels=None, modal_hidden_size=2048):
self.__dict__ = config.__dict__
self.modal_hidden_size = modal_hidden_size
if num_labels:
self.num_labels = num_labels
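Because `MMBTConfig` simply aliases the wrapped config's `__dict__`, every attribute of the underlying text config remains visible on it, with the multimodal fields layered on top. A minimal sketch using the `MMBTConfig` defined above together with a `BertConfig` (values are illustrative):

from transformers import BertConfig

text_config = BertConfig(hidden_size=768)
mmbt_config = MMBTConfig(text_config, num_labels=2, modal_hidden_size=2048)

print(mmbt_config.hidden_size)        # 768, inherited from the wrapped BertConfig
print(mmbt_config.modal_hidden_size)  # 2048
print(mmbt_config.num_labels)         # 2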
.\models\deprecated\mmbt\modeling_mmbt.py
"""PyTorch MMBT model."""
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from ....modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
from ....modeling_utils import ModuleUtilsMixin
from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "MMBTConfig"
class ModalEmbeddings(nn.Module):
"""Generic Modal Embeddings which takes in an encoder, and a transformer embedding."""
def __init__(self, config, encoder, embeddings):
super().__init__()
self.config = config
self.encoder = encoder
self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
self.position_embeddings = embeddings.position_embeddings
self.token_type_embeddings = embeddings.token_type_embeddings
self.word_embeddings = embeddings.word_embeddings
self.LayerNorm = embeddings.LayerNorm
self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None):
token_embeddings = self.proj_embeddings(self.encoder(input_modal))
seq_length = token_embeddings.size(1)
if start_token is not None:
start_token_embeds = self.word_embeddings(start_token)
seq_length += 1
token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1)
if end_token is not None:
end_token_embeds = self.word_embeddings(end_token)
seq_length += 1
token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1)
if position_ids is None:
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device)
position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)
if token_type_ids is None:
token_type_ids = torch.zeros(
(input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = token_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
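`ModalEmbeddings.forward` turns the second modality into a short token sequence: the encoder's region features are projected to `hidden_size`, optional start/end word embeddings are attached on either side, and position plus token-type embeddings are added before LayerNorm and dropout. A standalone shape walkthrough with a dummy encoder output and made-up token ids:

import torch
from torch import nn

batch, num_regions, modal_hidden_size, hidden_size = 2, 3, 2048, 768

image_features = torch.randn(batch, num_regions, modal_hidden_size)  # pretend encoder output
proj = nn.Linear(modal_hidden_size, hidden_size)
word_embeddings = nn.Embedding(30522, hidden_size)

token_embeddings = proj(image_features)                                        # (2, 3, 768)
start = word_embeddings(torch.full((batch,), 101, dtype=torch.long)).unsqueeze(1)  # hypothetical start id
end = word_embeddings(torch.full((batch,), 102, dtype=torch.long)).unsqueeze(1)    # hypothetical end id
token_embeddings = torch.cat([start, token_embeddings, end], dim=1)

print(token_embeddings.shape)  # torch.Size([2, 5, 768]); position/type embeddings are added next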
"""
MMBT model was proposed in [Supervised Multimodal Bitransformers for Classifying Images and
Text](https://github.com/facebookresearch/mmbt) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
It's a supervised multimodal bitransformer model that fuses information from text and image encoders, and
obtains state-of-the-art performance on various multimodal classification benchmark tasks.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MMBTConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration.
transformer (`nn.Module`): A text transformer that is used by MMBT.
It should have embeddings, encoder, and pooler attributes.
encoder (`nn.Module`): Encoder for the second modality.
It should take in a batch of modal inputs and return embeddings of shape `(batch_size, num_modal_tokens, modal_hidden_size)`.
"""
"""
The bare MMBT Model outputting raw hidden-states without any specific head on top.
Inherits documentation from MMBT_START_DOCSTRING.
This class represents the core MMBT model without any additional task-specific head, providing raw hidden states.
Attributes:
config: Model configuration instance containing all model parameters.
transformer: Text transformer module used by MMBT for processing textual inputs.
modal_encoder: Module for handling the encoding of the second modality (e.g., images).
Methods:
forward: Defines the forward pass of the model, detailing how inputs propagate through the network.
get_input_embeddings: Retrieves the word embedding layer of the model.
set_input_embeddings: Sets the word embedding layer of the model.
"""
class MMBTForClassification(nn.Module):
r"""
**labels**: (*optional*) `torch.LongTensor` of shape `(batch_size,)`:
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns: *Tuple* comprising various elements depending on the configuration (config) and inputs: **loss**:
(*optional*, returned when `labels` is provided) `torch.FloatTensor` of shape `(1,)`: Classification (or
regression if config.num_labels==1) loss. **logits**:
`torch.FloatTensor` of shape `(batch_size, config.num_labels)` Classification (or regression if
config.num_labels==1) scores (before SoftMax).
**hidden_states**: (*optional*, returned when `output_hidden_states=True`) list of `torch.FloatTensor` (one for
the output of each layer + the output of the embeddings) of shape `(batch_size, sequence_length, hidden_size)`:
Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**:
(*optional*, returned when `output_attentions=True`) list of `torch.FloatTensor` (one for each layer) of shape
`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used
to compute the weighted average in the self-attention heads.
Examples:
```
# For example purposes. Not runnable.
transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
encoder = ImageEncoder(args)
model = MMBTForClassification(config, transformer, encoder)
outputs = model(input_modal, input_ids, labels=labels)
loss, logits = outputs[:2]
```"""
def __init__(self, config, transformer, encoder):
super().__init__()
self.num_labels = config.num_labels
self.mmbt = MMBTModel(config, transformer, encoder)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
def forward(
self,
input_modal,
input_ids=None,
modal_start_tokens=None,
modal_end_tokens=None,
attention_mask=None,
token_type_ids=None,
modal_token_type_ids=None,
position_ids=None,
modal_position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
return_dict=None,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mmbt(
input_modal=input_modal,
input_ids=input_ids,
modal_start_tokens=modal_start_tokens,
modal_end_tokens=modal_end_tokens,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
modal_token_type_ids=modal_token_type_ids,
position_ids=position_ids,
modal_position_ids=modal_position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.num_labels == 1:
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1), labels.view(-1))
else:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)