Transformers Source Code Walkthrough (Part 40)
.\models\dinat\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_dinat": ["DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DinatConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_dinat"] = [
"DINAT_PRETRAINED_MODEL_ARCHIVE_LIST",
"DinatForImageClassification",
"DinatModel",
"DinatPreTrainedModel",
"DinatBackbone",
]
if TYPE_CHECKING:
from .configuration_dinat import DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP, DinatConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_dinat import (
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST,
DinatBackbone,
DinatForImageClassification,
DinatModel,
DinatPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\dinov2\configuration_dinov2.py
""" DINOv2 模型配置 """
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
DINOV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/dinov2-base": "https://huggingface.co/facebook/dinov2-base/resolve/main/config.json",
}
class Dinov2Config(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Dinov2Model`]. It is used to instantiate a Dinov2 model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the Dinov2 [google/dinov2-base-patch16-224] architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import Dinov2Config, Dinov2Model
>>> # Initializing a Dinov2 dinov2-base-patch16-224 style configuration
>>> configuration = Dinov2Config()
>>> # Initializing a model (with random weights) from the dinov2-base-patch16-224 style configuration
>>> model = Dinov2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "dinov2"
def __init__(
self,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
mlp_ratio=4,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-6,
image_size=224,
patch_size=16,
num_channels=3,
qkv_bias=True,
layerscale_value=1.0,
drop_path_rate=0.0,
use_swiglu_ffn=False,
out_features=None,
out_indices=None,
apply_layernorm=True,
reshape_hidden_states=True,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.mlp_ratio = mlp_ratio
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.qkv_bias = qkv_bias
self.layerscale_value = layerscale_value
self.drop_path_rate = drop_path_rate
self.use_swiglu_ffn = use_swiglu_ffn
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
self.apply_layernorm = apply_layernorm
self.reshape_hidden_states = reshape_hidden_states
class Dinov2OnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
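The `stage_names` list built in `Dinov2Config.__init__` ("stem" plus one entry per hidden layer) is what `get_aligned_output_features_output_indices` aligns `out_features` / `out_indices` against. A minimal sketch of that alignment (the chosen stage names are only an illustration, not a recommended setting):

```
from transformers import Dinov2Config

# Default config: 12 hidden layers -> stage_names == ["stem", "stage1", ..., "stage12"]
config = Dinov2Config(out_features=["stage4", "stage8", "stage12"])

print(config.stage_names[:4])  # ['stem', 'stage1', 'stage2', 'stage3']
print(config.out_features)     # ['stage4', 'stage8', 'stage12']
print(config.out_indices)      # indices of those names in stage_names, e.g. [4, 8, 12]
```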
.\models\dinov2\convert_dinov2_to_hf.py
import argparse
import json
from pathlib import Path
import requests
import torch
import torch.nn as nn
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision import transforms
from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model
from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_dinov2_config(model_name, image_classifier=False):
config = Dinov2Config(image_size=518, patch_size=14)
if "vits" in model_name:
config.hidden_size = 384
config.num_attention_heads = 6
elif "vitb" in model_name:
pass
elif "vitl" in model_name:
config.hidden_size = 1024
config.num_hidden_layers = 24
config.num_attention_heads = 16
elif "vitg" in model_name:
config.use_swiglu_ffn = True
config.hidden_size = 1536
config.num_hidden_layers = 40
config.num_attention_heads = 24
else:
raise ValueError("Model not supported")
if image_classifier:
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
config.num_labels = 1000
config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
config.id2label = {int(k): v for k, v in config.id2label.items()}
return config
def create_rename_keys(config):
rename_keys = []
rename_keys.append(("cls_token", "embeddings.cls_token"))
rename_keys.append(("mask_token", "embeddings.mask_token"))
rename_keys.append(("pos_embed", "embeddings.position_embeddings"))
rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias"))
for i in range(config.num_hidden_layers):
rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight"))
rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias"))
rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight"))
rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias"))
if config.use_swiglu_ffn:
rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight"))
rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias"))
rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight"))
rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias"))
else:
rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias"))
rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight"))
rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias"))
rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1"))
rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1"))
rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight"))
rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias"))
rename_keys.append(("norm.weight", "layernorm.weight"))
rename_keys.append(("norm.bias", "layernorm.bias"))
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_q_k_v(state_dict, config):
for i in range(config.num_hidden_layers):
in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :]
state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
config.hidden_size : config.hidden_size * 2, :
]
state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
config.hidden_size : config.hidden_size * 2
]
state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :]
state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
return image
@torch.no_grad()
def convert_dinov2_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our DINOv2 structure.
"""
image_classifier = "1layer" in model_name
config = get_dinov2_config(model_name, image_classifier=image_classifier)
original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", ""))
original_model.eval()
state_dict = original_model.state_dict()
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_q_k_v(state_dict, config)
for key, val in state_dict.copy().items():
val = state_dict.pop(key)
if "w12" in key:
key = key.replace("w12", "weights_in")
if "w3" in key:
key = key.replace("w3", "weights_out")
state_dict[key] = val
if image_classifier:
model = Dinov2ForImageClassification(config).eval()
model.dinov2.load_state_dict(state_dict)
model_name_to_classifier_dict_url = {
"dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth",
"dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth",
"dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth",
"dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth",
}
url = model_name_to_classifier_dict_url[model_name]
classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
model.classifier.weight = nn.Parameter(classifier_state_dict["weight"])
model.classifier.bias = nn.Parameter(classifier_state_dict["bias"])
else:
model = Dinov2Model(config).eval()
model.load_state_dict(state_dict)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
transformations = transforms.Compose(
[
transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=IMAGENET_DEFAULT_MEAN,
std=IMAGENET_DEFAULT_STD,
),
]
)
original_pixel_values = transformations(image).unsqueeze(0)
processor = BitImageProcessor(
size={"shortest_edge": 256},
resample=PILImageResampling.BICUBIC,
image_mean=IMAGENET_DEFAULT_MEAN,
image_std=IMAGENET_DEFAULT_STD,
)
pixel_values = processor(image, return_tensors="pt").pixel_values
assert torch.allclose(original_pixel_values, pixel_values)
with torch.no_grad():
outputs = model(pixel_values, output_hidden_states=True)
original_outputs = original_model(pixel_values)
if image_classifier:
print("Predicted class:")
class_idx = outputs.logits.argmax(-1).item()
print(model.config.id2label[class_idx])
else:
assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape
assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
model_name_to_hf_name = {
"dinov2_vits14": "dinov2-small",
"dinov2_vitb14": "dinov2-base",
"dinov2_vitl14": "dinov2-large",
"dinov2_vitg14": "dinov2-giant",
"dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer",
"dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer",
"dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer",
"dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer",
}
name = model_name_to_hf_name[model_name]
model.push_to_hub(f"facebook/{name}")
processor.push_to_hub(f"facebook/{name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="dinov2_vitb14",
type=str,
choices=[
"dinov2_vits14",
"dinov2_vitb14",
"dinov2_vitl14",
"dinov2_vitg14",
"dinov2_vits14_1layer",
"dinov2_vitb14_1layer",
"dinov2_vitl14_1layer",
"dinov2_vitg14_1layer",
],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_dinov2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
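With the argument parser above, a typical conversion run looks like `python convert_dinov2_to_hf.py --model_name dinov2_vitb14 --pytorch_dump_folder_path ./dinov2-base-converted`; adding `--push_to_hub` additionally uploads the converted model and image processor. The output directory name here is just an example.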
.\models\dinov2\modeling_dinov2.py
""" PyTorch DINOv2 模型."""
import collections.abc
import math
from typing import Dict, List, Optional, Set, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BackboneOutput,
BaseModelOutput,
BaseModelOutputWithPooling,
ImageClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_dinov2 import Dinov2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Dinov2Config"
_CHECKPOINT_FOR_DOC = "facebook/dinov2-base"
_EXPECTED_OUTPUT_SHAPE = [1, 257, 768]
_IMAGE_CLASS_CHECKPOINT = "facebook/dinov2-small-imagenet1k-1-layer"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/dinov2-base",
]
class Dinov2Embeddings(nn.Module):
"""
Constructs the CLS token, mask token, position and patch embeddings.
"""
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
self.patch_embeddings = Dinov2PatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.config = config
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method interpolates the pre-trained position encodings for higher resolution images.
Source:
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
"""
num_patches = embeddings.shape[1] - 1
num_positions = self.position_embeddings.shape[1] - 1
if num_patches == num_positions and height == width:
return self.position_embeddings
class_pos_embed = self.position_embeddings[:, 0]
patch_pos_embed = self.position_embeddings[:, 1:]
dim = embeddings.shape[-1]
height = height // self.config.patch_size
width = width // self.config.patch_size
height, width = height + 0.1, width + 0.1
patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
target_dtype = patch_pos_embed.dtype
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed.to(dtype=torch.float32),
scale_factor=(float(height / math.sqrt(num_positions)), float(width / math.sqrt(num_positions))),
mode="bicubic",
align_corners=False,
).to(dtype=target_dtype)
if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
raise ValueError("Width or height does not match with the interpolated position embeddings")
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
batch_size, _, height, width = pixel_values.shape
target_dtype = self.patch_embeddings.projection.weight.dtype
embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
if bool_masked_pos is not None:
embeddings = torch.where(
bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
)
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
embeddings = self.dropout(embeddings)
return embeddings
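A small shape-tracing sketch for `Dinov2Embeddings` under the default configuration (224x224 input, patch size 16): 14x14 = 196 patch tokens plus the prepended CLS token give a sequence length of 197. This assumes the module can be instantiated standalone with random weights:

```
import torch
from transformers import Dinov2Config
from transformers.models.dinov2.modeling_dinov2 import Dinov2Embeddings

config = Dinov2Config()                  # image_size=224, patch_size=16, hidden_size=768
embeddings = Dinov2Embeddings(config)

pixel_values = torch.randn(1, 3, 224, 224)
out = embeddings(pixel_values)
print(out.shape)                         # torch.Size([1, 197, 768]) -> 14*14 patches + CLS token
```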
class Dinov2PatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
num_channels = pixel_values.shape[1]
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
f" Expected {self.num_channels} but got {num_channels}."
)
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
return embeddings
class Dinov2SelfAttention(nn.Module):
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class Dinov2SelfOutput(nn.Module):
"""
The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class Dinov2Attention(nn.Module):
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.attention = Dinov2SelfAttention(config)
self.output = Dinov2SelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads: Set[int]) -> None:
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_outputs = self.attention(hidden_states, head_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Implements the DropPath (stochastic depth) operation: randomly drops entire residual paths per sample during training to improve generalization.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
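`drop_path` zeroes whole samples with probability `drop_prob` and rescales the survivors by `1 / keep_prob`, so the expected activation is unchanged during training. A quick numerical check of that expectation (a sketch over a large synthetic batch):

```
import torch

torch.manual_seed(0)
x = torch.ones(10_000, 8)               # 10k "samples", feature dim 8
drop_prob, keep_prob = 0.3, 0.7

# 1 with probability keep_prob, 0 otherwise (same trick as drop_path above)
mask = (keep_prob + torch.rand(x.shape[0], 1)).floor_()
out = x.div(keep_prob) * mask

print(mask.mean().item())               # ~0.7: fraction of kept samples
print(out.mean().item())                # ~1.0: expectation matches the input
```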
class Dinov2DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class Dinov2MLP(nn.Module):
def __init__(self, config) -> None:
super().__init__()
in_features = out_features = config.hidden_size
hidden_features = int(config.hidden_size * config.mlp_ratio)
self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
if isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.fc1(hidden_state)
hidden_state = self.activation(hidden_state)
hidden_state = self.fc2(hidden_state)
return hidden_state
class Dinov2SwiGLUFFN(nn.Module):
def __init__(self, config) -> None:
super().__init__()
in_features = out_features = config.hidden_size
hidden_features = int(config.hidden_size * config.mlp_ratio)
hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.weights_in(hidden_state)
x1, x2 = hidden_state.chunk(2, dim=-1)
hidden = nn.functional.silu(x1) * x2
return self.weights_out(hidden)
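For the default `hidden_size=768` and `mlp_ratio=4`, the SwiGLU branch shrinks the intermediate width to roughly two thirds of 3072 and rounds up to a multiple of 8, so `weights_in` maps 768 -> 2 * 2048 and `weights_out` maps 2048 -> 768. A worked check of that arithmetic using the same formula as above:

```
hidden_size, mlp_ratio = 768, 4
hidden_features = int(hidden_size * mlp_ratio)             # 3072
hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8

print(hidden_features)                                      # 2048
```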
class Dinov2Layer(nn.Module):
"""This corresponds to the Block class in the original implementation."""
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.attention = Dinov2Attention(config)
self.layer_scale1 = Dinov2LayerScale(config)
self.drop_path = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
if config.use_swiglu_ffn:
self.mlp = Dinov2SwiGLUFFN(config)
else:
self.mlp = Dinov2MLP(config)
self.layer_scale2 = Dinov2LayerScale(config)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
self_attention_outputs = self.attention(
self.norm1(hidden_states),
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
attention_output = self.layer_scale1(attention_output)
outputs = self_attention_outputs[1:]
hidden_states = self.drop_path(attention_output) + hidden_states
layer_output = self.norm2(hidden_states)
layer_output = self.mlp(layer_output)
layer_output = self.layer_scale2(layer_output)
layer_output = self.drop_path(layer_output) + hidden_states
outputs = (layer_output,) + outputs
return outputs
class Dinov2Encoder(nn.Module):
def __init__(self, config: Dinov2Config) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([Dinov2Layer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
layer_head_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class Dinov2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Dinov2Config
base_model_prefix = "dinov2"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data = nn.init.trunc_normal_(
module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
).to(module.weight.dtype)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, Dinov2Embeddings):
module.position_embeddings.data = nn.init.trunc_normal_(
module.position_embeddings.data.to(torch.float32),
mean=0.0,
std=self.config.initializer_range,
).to(module.position_embeddings.dtype)
module.cls_token.data = nn.init.trunc_normal_(
module.cls_token.data.to(torch.float32),
mean=0.0,
std=self.config.initializer_range,
).to(module.cls_token.dtype)
DINOV2_BASE_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.preprocess`] for details.
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
Boolean masked positions. Indicates which patches are masked (1) and which are not (0). Only relevant for pre-training.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
DINOV2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.preprocess`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.",
DINOV2_START_DOCSTRING,
)
class Dinov2Model(Dinov2PreTrainedModel):
"""
DINOv2 模型类,继承自预训练模型基类 Dinov2PreTrainedModel。
"""
def __init__(self, config: Dinov2Config):
"""
初始化方法,设置模型配置信息。
Args:
config (Dinov2Config): 模型的配置对象。
"""
super().__init__(config)
self.config = config
self.embeddings = Dinov2Embeddings(config)
self.encoder = Dinov2Encoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_init()
def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
"""
返回输入嵌入层对象。
Returns:
Dinov2PatchEmbeddings: 输入嵌入层对象。
"""
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
"""
对模型的注意力头进行修剪。
Args:
heads_to_prune (Dict[int, List[int]]): 要在每层修剪的注意力头的字典。
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(DINOV2_BASE_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
bool_masked_pos: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
encoder_outputs = self.encoder(
embedding_output,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
pooled_output = sequence_output[:, 0, :]
if not return_dict:
head_outputs = (sequence_output, pooled_output)
return head_outputs + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
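A usage sketch for the bare model, mirroring the sample that `add_code_sample_docstrings` generates for the `facebook/dinov2-base` checkpoint (needs network access to download the weights and the test image):

```
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, Dinov2Model

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = Dinov2Model.from_pretrained("facebook/dinov2-base")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # expected torch.Size([1, 257, 768]) for a 224x224 crop
print(outputs.pooler_output.shape)      # torch.Size([1, 768]) -- the CLS token after layernorm
```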
@add_start_docstrings(
"""
Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
of the [CLS] token) e.g. for ImageNet.
""",
DINOV2_START_DOCSTRING,
)
class Dinov2ForImageClassification(Dinov2PreTrainedModel):
def __init__(self, config: Dinov2Config) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.dinov2 = Dinov2Model(config)
self.classifier = (
nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.dinov2(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
cls_token = sequence_output[:, 0]
patch_tokens = sequence_output[:, 1:]
linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
logits = self.classifier(linear_input)
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
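Note that the classification head receives the concatenation of the CLS token and the mean of the patch tokens, which is why it was constructed as `nn.Linear(config.hidden_size * 2, config.num_labels)`. A toy shape check of that concatenation (random tensors, no real forward pass):

```
import torch

batch_size, seq_len, hidden_size = 2, 257, 768
sequence_output = torch.randn(batch_size, seq_len, hidden_size)

cls_token = sequence_output[:, 0]        # (2, 768)
patch_tokens = sequence_output[:, 1:]    # (2, 256, 768)
linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)

print(linear_input.shape)                # torch.Size([2, 1536]) == hidden_size * 2
```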
@add_start_docstrings(
"""
Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
""",
DINOV2_START_DOCSTRING,
)
class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
def __init__(self, config):
super().__init__(config)
super()._init_backbone(config)
self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
self.embeddings = Dinov2Embeddings(config)
self.encoder = Dinov2Encoder(config)
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.post_init()
def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
return self.embeddings.patch_embeddings
@add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.Tensor,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
"""
定义方法签名和返回类型注解,方法返回类型为 BackboneOutput。
返回方法的输出结果,通常用于说明方法的功能。
Examples: 示例用法,展示如何使用此方法的代码片段。
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
>>> model = AutoBackbone.from_pretrained(
... "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
... )
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 768, 16, 16]
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
embedding_output = self.embeddings(pixel_values)
outputs = self.encoder(
embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict
)
hidden_states = outputs.hidden_states if return_dict else outputs[1]
feature_maps = ()
for stage, hidden_state in zip(self.stage_names, hidden_states):
if stage in self.out_features:
if self.config.apply_layernorm:
hidden_state = self.layernorm(hidden_state)
if self.config.reshape_hidden_states:
hidden_state = hidden_state[:, 1:]
batch_size, _, height, width = pixel_values.shape
patch_size = self.config.patch_size
hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
feature_maps += (hidden_state,)
if not return_dict:
if output_hidden_states:
output = (feature_maps,) + outputs[1:]
else:
output = (feature_maps,) + outputs[2:]
return output
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions if output_attentions else None,
)
.\models\dinov2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
_import_structure = {
"configuration_dinov2": ["DINOV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Dinov2Config", "Dinov2OnnxConfig"]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_dinov2"] = [
"DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST",
"Dinov2ForImageClassification",
"Dinov2Model",
"Dinov2PreTrainedModel",
"Dinov2Backbone",
]
if TYPE_CHECKING:
from .configuration_dinov2 import DINOV2_PRETRAINED_CONFIG_ARCHIVE_MAP, Dinov2Config, Dinov2OnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_dinov2 import (
DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST,
Dinov2Backbone,
Dinov2ForImageClassification,
Dinov2Model,
Dinov2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\distilbert\configuration_distilbert.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/config.json",
"distilbert-base-uncased-distilled-squad": (
"https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/config.json"
),
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/config.json",
"distilbert-base-cased-distilled-squad": (
"https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/config.json"
),
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/config.json",
"distilbert-base-multilingual-cased": (
"https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/config.json"
),
"distilbert-base-uncased-finetuned-sst-2-english": (
"https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json"
),
}
class DistilBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DistilBertModel`] or a [`TFDistilBertModel`]. It
is used to instantiate a DistilBERT model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the DistilBERT
[distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
pass
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented when calling [`DistilBertModel`] or [`TFDistilBertModel`].
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512, 1024 or 2048) just in case.
sinusoidal_pos_embds (`boolean`, *optional*, defaults to `False`):
Whether to use sinusoidal positional embeddings.
n_layers (`int`, *optional*, defaults to 6):
Number of hidden layers in the Transformer encoder.
n_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
dim (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
hidden_dim (`int`, *optional*, defaults to 3072):
The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for initializing all weight matrices.
qa_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability used in the question answering model [`DistilBertForQuestionAnswering`].
seq_classif_dropout (`float`, *optional*, defaults to 0.2):
The dropout probability used in the sequence classification and multiple choice model [`DistilBertForSequenceClassification`].
Examples:
```
>>> from transformers import DistilBertConfig, DistilBertModel
>>> # Initializing a DistilBERT configuration
>>> configuration = DistilBertConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = DistilBertModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "distilbert"
attribute_map = {
"hidden_size": "dim",
"num_attention_heads": "n_heads",
"num_hidden_layers": "n_layers",
}
def __init__(
self,
vocab_size=30522,
max_position_embeddings=512,
sinusoidal_pos_embds=False,
n_layers=6,
n_heads=12,
dim=768,
hidden_dim=4 * 768,
dropout=0.1,
attention_dropout=0.1,
activation="gelu",
initializer_range=0.02,
qa_dropout=0.1,
seq_classif_dropout=0.2,
pad_token_id=0,
**kwargs,
):
super().__init__(**kwargs, pad_token_id=pad_token_id)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.sinusoidal_pos_embds = sinusoidal_pos_embds
self.n_layers = n_layers
self.n_heads = n_heads
self.dim = dim
self.hidden_dim = hidden_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation = activation
self.initializer_range = initializer_range
self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout
class DistilBertOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
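Thanks to `attribute_map`, the DistilBERT-specific names (`dim`, `n_heads`, `n_layers`) are also readable through the generic `hidden_size` / `num_attention_heads` / `num_hidden_layers` properties used elsewhere in the library. A quick sketch:

```
from transformers import DistilBertConfig

config = DistilBertConfig()
print(config.dim, config.hidden_size)              # 768 768
print(config.n_heads, config.num_attention_heads)  # 12 12
print(config.n_layers, config.num_hidden_layers)   # 6 6
```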
.\models\distilbert\modeling_distilbert.py
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
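A toy illustration of what `_get_unpad_data` computes for a padded batch of two sequences with real lengths 3 and 1 (the same operations as the helper above, spelled out on concrete tensors):

```
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 0, 0, 0]])

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)              # tensor([3, 1])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()   # tensor([0, 1, 2, 4])
max_seqlen_in_batch = seqlens_in_batch.max().item()                           # 3
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # tensor([0, 3, 4])

print(indices, cu_seqlens, max_seqlen_in_batch)
```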
def create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor):
if is_deepspeed_zero3_enabled():
import deepspeed
with deepspeed.zero.GatheredParameters(out, modifier_rank=0):
if torch.distributed.get_rank() == 0:
_create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out)
else:
_create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out)
def _create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor):
position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
out.requires_grad = False
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
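A quick sanity check of the sinusoidal table built by `_create_sinusoidal_embeddings`: even columns hold sines and odd columns hold cosines, so position 0 comes out as alternating 0 and 1. A self-contained sketch that repeats the same construction on a tiny table:

```
import numpy as np
import torch

n_pos, dim = 4, 6
out = torch.empty(n_pos, dim)

position_enc = np.array(
    [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
)
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))

print(out[0])  # tensor([0., 1., 0., 1., 0., 1.]) -- sin(0) and cos(0) alternate
```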
class Embeddings(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
if config.sinusoidal_pos_embds:
create_sinusoidal_embeddings(
n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
)
self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
self.dropout = nn.Dropout(config.dropout)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids: torch.Tensor, input_embeds: Optional[torch.Tensor] = None) -> torch.Tensor:
"""
Parameters:
input_ids (torch.Tensor):
torch.tensor(bs, max_seq_length) The token ids to embed.
input_embeds (*optional*, torch.Tensor):
The pre-computed word embeddings. Can only be passed if the input ids are `None`.
Returns:
torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings)
"""
if input_ids is not None:
input_embeds = self.word_embeddings(input_ids)
seq_length = input_embeds.size(1)
if hasattr(self, "position_ids"):
position_ids = self.position_ids[:, :seq_length]
else:
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
position_embeddings = self.position_embeddings(position_ids)
embeddings = input_embeds + position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class MultiHeadSelfAttention(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.config = config
self.n_heads = config.n_heads
self.dim = config.dim
self.dropout = nn.Dropout(p=config.attention_dropout)
self.is_causal = False
if self.dim % self.n_heads != 0:
raise ValueError(f"self.n_heads: {self.n_heads} must divide self.dim: {self.dim} evenly")
self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.pruned_heads: Set[int] = set()
self.attention_head_size = self.dim // self.n_heads
def prune_heads(self, heads: List[int]):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.attention_head_size, self.pruned_heads
)
self.q_lin = prune_linear_layer(self.q_lin, index)
self.k_lin = prune_linear_layer(self.k_lin, index)
self.v_lin = prune_linear_layer(self.v_lin, index)
self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.dim = self.attention_head_size * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, ...]:
"""
Parameters:
query: torch.tensor(bs, seq_length, dim)
key: torch.tensor(bs, seq_length, dim)
value: torch.tensor(bs, seq_length, dim)
mask: torch.tensor(bs, seq_length)
Returns:
weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights
context: torch.tensor(bs, seq_length, dim) Contextualized layer.
Optional: only if `output_attentions=True`
"""
bs, q_length, dim = query.size()
k_length = key.size(1)
dim_per_head = self.dim // self.n_heads
mask_reshp = (bs, 1, 1, k_length)
def shape(x: torch.Tensor) -> torch.Tensor:
"""将输入张量重塑以便多头注意力"""
return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
def unshape(x: torch.Tensor) -> torch.Tensor:
"""将多头注意力结果合并"""
return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
q = shape(self.q_lin(query))
k = shape(self.k_lin(key))
v = shape(self.v_lin(value))
q = q / math.sqrt(dim_per_head)
scores = torch.matmul(q, k.transpose(2, 3))
mask = (mask == 0).view(mask_reshp).expand_as(scores)
scores = scores.masked_fill(mask, torch.tensor(torch.finfo(scores.dtype).min))
weights = nn.functional.softmax(scores, dim=-1)
weights = self.dropout(weights)
if head_mask is not None:
weights = weights * head_mask
context = torch.matmul(weights, v)
context = unshape(context)
context = self.out_lin(context)
if output_attentions:
return (context, weights)
else:
return (context,)
class DistilBertFlashAttention2(MultiHeadSelfAttention):
"""
DistilBERT flash-attention module. It inherits from `MultiHeadSelfAttention`, so the weights stay untouched; only
the forward pass changes to call the Flash Attention API.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, ...]:
"""
Parameters:
query: torch.tensor(bs, seq_length, dim)  # Query tensor of shape (batch_size, seq_length, dim)
key: torch.tensor(bs, seq_length, dim)  # Key tensor of shape (batch_size, seq_length, dim)
value: torch.tensor(bs, seq_length, dim)  # Value tensor of shape (batch_size, seq_length, dim)
mask: torch.tensor(bs, seq_length)  # Padding mask of shape (batch_size, seq_length)
Returns:
weights: torch.tensor(bs, n_heads, seq_length, seq_length)  # Attention weights, only returned if `output_attentions=True`
context: torch.tensor(bs, seq_length, dim)  # Contextualized layer
"""
batch_size, q_length, dim = query.size()
dim_per_head = self.dim // self.n_heads
def reshape(x: torch.Tensor) -> torch.Tensor:
"""将张量重新形状为(batch_size, seq_length, n_heads, dim_per_head)"""
return x.view(batch_size, -1, self.n_heads, dim_per_head)
query_states = reshape(self.q_lin(query))
key_states = reshape(self.k_lin(key))
value_states = reshape(self.v_lin(value))
attn_dropout = self.config.attention_dropout if self.training else 0.0
if query_states.dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_lin.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_weights = self._flash_attention_forward(
query_states, key_states, value_states, mask, q_length, dropout=attn_dropout
)
attn_weights_reshaped = attn_weights.reshape(batch_size, q_length, self.n_heads * dim_per_head)
attn_output = self.out_lin(attn_weights_reshaped)
if output_attentions:
return (attn_output, attn_weights)
else:
return (attn_output,)
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and query_length != 1
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.n_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
)
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
class FFN(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.dropout = nn.Dropout(p=config.dropout)
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
self.activation = get_activation(config.activation)
def forward(self, input: torch.Tensor) -> torch.Tensor:
return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
def ff_chunk(self, input: torch.Tensor) -> torch.Tensor:
x = self.lin1(input)
x = self.activation(x)
x = self.lin2(x)
x = self.dropout(x)
return x
DISTILBERT_ATTENTION_CLASSES = {
"eager": MultiHeadSelfAttention,
"flash_attention_2": DistilBertFlashAttention2,
}
class TransformerBlock(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
if config.dim % config.n_heads != 0:
raise ValueError(f"config.n_heads {config.n_heads} must divide config.dim {config.dim} evenly")
self.attention = DISTILBERT_ATTENTION_CLASSES[config._attn_implementation](config)
self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
self.ffn = FFN(config)
self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
def forward(
self,
x: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, ...]:
"""
Parameters:
x: torch.tensor(bs, seq_length, dim)
attn_mask: torch.tensor(bs, seq_length)
Returns:
sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) 注意力权重
ffn_output: torch.tensor(bs, seq_length, dim) Transformer 块的输出
"""
sa_output = self.attention(
query=x,
key=x,
value=x,
mask=attn_mask,
head_mask=head_mask,
output_attentions=output_attentions,
)
if output_attentions:
sa_output, sa_weights = sa_output
else:
if type(sa_output) != tuple:
raise TypeError(f"sa_output must be a tuple but it is {type(sa_output)} type")
sa_output = sa_output[0]
sa_output = self.sa_layer_norm(sa_output + x)
ffn_output = self.ffn(sa_output)
ffn_output: torch.Tensor = self.output_layer_norm(ffn_output + sa_output)
output = (ffn_output,)
if output_attentions:
output = (sa_weights,) + output
return output
class Transformer(nn.Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.n_layers = config.n_layers
self.layer = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)])
self.gradient_checkpointing = False
def forward(
self,
x: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: Optional[bool] = None,
) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]:
"""
Parameters:
x: torch.tensor(bs, seq_length, dim) Input sequence embedded.
输入的嵌入序列张量,形状为 (bs, seq_length, dim)
attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.
序列的注意力掩码张量,形状为 (bs, seq_length)
Returns:
hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) layer
最后(顶部)层的隐藏状态序列,形状为 (bs, seq_length, dim)
all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
一个包含每层隐藏状态的元组,形状为 (n_layers, bs, seq_length, dim)
可选项:仅在 output_hidden_states=True 时返回
all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
每层的注意力权重张量的元组,形状为 (n_layers, bs, n_heads, seq_length, seq_length)
可选项:仅在 output_attentions=True 时返回
"""
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_state = x
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_state,
attn_mask,
head_mask[i],
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_state,
attn_mask,
head_mask[i],
output_attentions,
)
hidden_state = layer_outputs[-1]
if output_attentions:
if len(layer_outputs) != 2:
raise ValueError(f"The length of the layer_outputs should be 2, but it is {len(layer_outputs)}")
attentions = layer_outputs[0]
all_attentions = all_attentions + (attentions,)
else:
if len(layer_outputs) != 1:
raise ValueError(f"The length of the layer_outputs should be 1, but it is {len(layer_outputs)}")
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
if not return_dict:
return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
)
class DistilBertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DistilBertConfig
load_tf_weights = None
base_model_prefix = "distilbert"
supports_gradient_checkpointing = True
_supports_flash_attn_2 = True
def _init_weights(self, module: nn.Module):
"""Initialize the weights."""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
DISTILBERT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
Describes the inputs to the DistilBERT model and how to prepare them.
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Define a DistilBERT model for encoding text using the Transformer architecture.
@add_start_docstrings(
"The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
)
class DistilBertModel(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
super().__init__(config)
self.embeddings = Embeddings(config) # Embeddings
self.transformer = Transformer(config) # Encoder
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
# Initialize weights and apply final processing
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
return self.embeddings.position_embeddings
"""
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
size will add correct vectors at the end following the position encoding algorithm, whereas reducing
the size will remove vectors from the end.
"""
# 计算新旧位置嵌入矩阵长度之差
num_position_embeds_diff = new_num_position_embeddings - self.config.max_position_embeddings
# 如果长度没有变化,则无需调整
if num_position_embeds_diff == 0:
return
# 记录信息:设置 `config.max_position_embeddings` 的新值
logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...")
self.config.max_position_embeddings = new_num_position_embeddings
# 备份旧的位置嵌入权重
old_position_embeddings_weight = self.embeddings.position_embeddings.weight.clone()
# 根据新的 `max_position_embeddings` 大小重新创建位置嵌入层
self.embeddings.position_embeddings = nn.Embedding(self.config.max_position_embeddings, self.config.dim)
# 如果使用正弦位置嵌入,根据新的大小重新创建正弦位置嵌入
if self.config.sinusoidal_pos_embds:
create_sinusoidal_embeddings(
n_pos=self.config.max_position_embeddings, dim=self.config.dim, out=self.position_embeddings.weight
)
else:
with torch.no_grad():
# 根据位置嵌入大小的变化,重新设置位置嵌入权重
if num_position_embeds_diff > 0:
self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = nn.Parameter(
old_position_embeddings_weight
)
else:
self.embeddings.position_embeddings.weight = nn.Parameter(
old_position_embeddings_weight[:num_position_embeds_diff]
)
# 将更新后的位置嵌入层移动到正确的设备上
self.embeddings.position_embeddings.to(self.device)
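根据上面的实现,`resize_position_embeddings` 在扩大位置嵌入时保留旧权重、在末尾追加新初始化的向量,并同步更新 `config.max_position_embeddings`。下面是一个假设性的用法示例(使用随机初始化的模型,仅演示形状与配置的变化):

```python
from transformers import DistilBertConfig, DistilBertModel

config = DistilBertConfig(max_position_embeddings=512, sinusoidal_pos_embds=False)
model = DistilBertModel(config)  # 随机权重,仅作演示

print(model.get_position_embeddings().weight.shape)  # torch.Size([512, 768])

# 扩大位置嵌入到 1024:前 512 行保留旧权重,后 512 行为新初始化的向量
model.resize_position_embeddings(1024)
print(model.get_position_embeddings().weight.shape)  # torch.Size([1024, 768])
print(model.config.max_position_embeddings)          # 1024
```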
def get_input_embeddings(self) -> nn.Embedding:
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings: nn.Embedding):
# 设置输入词嵌入层的新权重
self.embeddings.word_embeddings = new_embeddings
def _prune_heads(self, heads_to_prune: Dict[int, List[List[int]]]):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# 遍历需要修剪的层和头部,进行修剪操作
for layer, heads in heads_to_prune.items():
self.transformer.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC, # 添加代码示例文档字符串,指定检查点为给定的文档检查点
output_type=BaseModelOutput, # 指定输出类型为BaseModelOutput类
config_class=_CONFIG_FOR_DOC, # 指定配置类为给定的配置类
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的token IDs张量,可选
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码张量,可选
head_mask: Optional[torch.Tensor] = None, # 头部掩码张量,可选
inputs_embeds: Optional[torch.Tensor] = None, # 嵌入输入张量,可选
output_attentions: Optional[bool] = None, # 是否输出注意力权重,可选
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态,可选
return_dict: Optional[bool] = None, # 是否返回字典格式的输出,可选
) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]: # 返回值可以是BaseModelOutput或者张量元组
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果未指定output_attentions,则使用self.config中的默认值
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未指定output_hidden_states,则使用self.config中的默认值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果未指定return_dict,则使用self.config中的默认值
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
# 检查输入参数的有效性,确保只能同时指定input_ids或inputs_embeds,并获取输入的形状
device = input_ids.device if input_ids is not None else inputs_embeds.device
# 获取输入所在的设备
# Prepare head mask if needed
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
# 如果需要,准备头部掩码
embeddings = self.embeddings(input_ids, inputs_embeds) # (bs, seq_length, dim)
# 生成输入的嵌入表示
if self._use_flash_attention_2:
attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
else:
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length)
# 根据self._use_flash_attention_2的条件设置注意力掩码,如果未提供则使用全1的默认掩码
return self.transformer(
x=embeddings,
attn_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 调用transformer模块进行前向传播,传入嵌入表示、注意力掩码、头部掩码等参数,并返回结果
# 使用装饰器添加文档字符串,描述此类是在DistilBert模型基础上增加了遮盖语言建模头部的模型
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top.""",
DISTILBERT_START_DOCSTRING,
)
# 定义DistilBertForMaskedLM类,继承自DistilBertPreTrainedModel
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
# 定义_tied_weights_keys属性,指定需要绑定权重的键名
_tied_weights_keys = ["vocab_projector.weight"]
# 初始化函数,接收一个PretrainedConfig类型的config对象作为参数
def __init__(self, config: PretrainedConfig):
# 调用父类的初始化函数
super().__init__(config)
# 根据配置中指定的激活函数名称,获取对应的激活函数
self.activation = get_activation(config.activation)
# 创建DistilBertModel模型对象,并赋值给self.distilbert
self.distilbert = DistilBertModel(config)
# 创建一个线性层,用于词汇转换,输入和输出维度均为config.dim
self.vocab_transform = nn.Linear(config.dim, config.dim)
# 创建一个LayerNorm层,用于词汇层的归一化,输入维度为config.dim
self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
# 创建一个线性层,用于将模型的输出映射到词汇表大小的向量
self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
# 执行初始化权重操作和最终处理
self.post_init()
# 定义模型的损失函数为交叉熵损失函数
self.mlm_loss_fct = nn.CrossEntropyLoss()
# 获取位置嵌入的方法,返回DistilBert模型中的位置嵌入
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
return self.distilbert.get_position_embeddings()
# 调整位置嵌入的方法,根据新的位置嵌入数量调整模型的位置嵌入矩阵
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
size will add correct vectors at the end following the position encoding algorithm, whereas reducing
the size will remove vectors from the end.
"""
self.distilbert.resize_position_embeddings(new_num_position_embeddings)
# 获取输出嵌入的方法,返回词汇投影层对象
def get_output_embeddings(self) -> nn.Module:
return self.vocab_projector
# 设置输出嵌入的方法,用新的嵌入层对象替换词汇投影层
def set_output_embeddings(self, new_embeddings: nn.Module):
self.vocab_projector = new_embeddings
# 使用装饰器添加文档字符串到模型前向传播方法,描述输入参数和输出类型
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
# 使用代码示例装饰器添加文档字符串,提供模型前向传播的示例和其他相关信息
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
# 模型的前向传播方法,接收多个输入参数,返回一个输出对象或字典
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[MaskedLMOutput, Tuple[torch.Tensor, ...]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
# 根据函数声明,定义了输入参数和返回类型,包括可选的标签用于计算MLM损失
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用DistilBERT模型,获取输出结果
dlbrt_output = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取DistilBERT模型的隐藏状态
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
# 将隐藏状态转换为预测的对数概率
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
# 应用激活函数到预测的对数概率
prediction_logits = self.activation(prediction_logits) # (bs, seq_length, dim)
# 对预测的对数概率进行层归一化
prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
# 使用投影层将预测的对数概率映射到词汇表大小的空间
prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size)
mlm_loss = None
# 如果提供了标签,计算MLM损失
if labels is not None:
mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))
# 如果不要求返回字典格式的输出,构建输出元组
if not return_dict:
output = (prediction_logits,) + dlbrt_output[1:]
return ((mlm_loss,) + output) if mlm_loss is not None else output
# 返回MaskedLMOutput对象,包括损失、预测的对数概率、隐藏状态和注意力权重
return MaskedLMOutput(
loss=mlm_loss,
logits=prediction_logits,
hidden_states=dlbrt_output.hidden_states,
attentions=dlbrt_output.attentions,
)
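上面的 MLM 头把隐藏状态依次经过 vocab_transform、激活函数、LayerNorm 和 vocab_projector,得到词表大小的 logits。下面是一个最小的掩码填充示例(假设可以下载 `distilbert-base-uncased` 检查点):

```python
import torch
from transformers import AutoTokenizer, DistilBertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

text = f"The capital of France is {tokenizer.mask_token}."
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # (bs, seq_length, vocab_size)

# 找到 [MASK] 的位置,取该位置 logits 的 argmax 作为预测 token
mask_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))  # 期望输出类似 "paris"
```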
"""
DistilBert模型转换器,顶部带有序列分类/回归头(即顶部的线性层,用于池化输出),例如用于GLUE任务。
"""
@add_start_docstrings(
"""
DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
"""
初始化方法,配置DistilBert序列分类/回归模型。
Arguments:
config (:class:`~transformers.PretrainedConfig`):
包含模型配置信息的预训练配置对象。
"""
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
# DistilBert模型实例化
self.distilbert = DistilBertModel(config)
# 预分类器,线性层
self.pre_classifier = nn.Linear(config.dim, config.dim)
# 分类器,线性层
self.classifier = nn.Linear(config.dim, config.num_labels)
# Dropout层
self.dropout = nn.Dropout(config.seq_classif_dropout)
# 初始化权重并应用最终处理
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
返回位置嵌入
"""
return self.distilbert.get_position_embeddings()
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
如果`new_num_position_embeddings != config.max_position_embeddings`,调整模型的位置嵌入。
Arguments:
new_num_position_embeddings (`int`):
新的位置嵌入矩阵数量。如果位置嵌入是学习的,则增加大小将在末尾添加新初始化的向量,
而减小大小将从末尾删除向量。如果位置嵌入不是学习的(例如正弦位置嵌入),
增加大小将按照位置编码算法在末尾添加正确的向量,而减小大小将从末尾删除向量。
"""
self.distilbert.resize_position_embeddings(new_num_position_embeddings)
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[SequenceClassifierOutput, Tuple[torch.Tensor, ...]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 根据是否需要返回字典来确定返回值
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 将输入传递给DistilBERT模型,获取输出
distilbert_output = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 提取DistilBERT输出的隐藏状态
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
# 提取池化后的输出,取每个序列的第一个标记
pooled_output = hidden_state[:, 0] # (bs, dim)
# 应用预分类器(一个线性层)到池化输出
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
# 应用ReLU激活函数到预分类器输出
pooled_output = nn.ReLU()(pooled_output) # (bs, dim)
# 应用dropout操作到ReLU输出
pooled_output = self.dropout(pooled_output) # (bs, dim)
# 将池化后的输出传递给分类器,得到logits
logits = self.classifier(pooled_output) # (bs, num_labels)
# 初始化损失值为None
loss = None
# 如果有标签输入
if labels is not None:
# 如果问题类型尚未确定
if self.config.problem_type is None:
# 根据标签数量确定问题类型为回归或分类
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# 根据问题类型计算损失值
if self.config.problem_type == "regression":
# 使用均方误差损失函数
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
# 使用交叉熵损失函数
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
# 使用带logits的二元交叉熵损失函数
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# 如果不需要返回字典,则只返回logits和可能的其他输出
if not return_dict:
output = (logits,) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典,则返回包含损失、logits和其他输出的SequenceClassifierOutput对象
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
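上面的损失计算根据 `config.problem_type` 在均方误差、交叉熵和带 logits 的二元交叉熵之间切换。下面用几个小张量示意这三种情形各自期望的 logits / labels 形状(数值均为随机,仅作演示):

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

bs, num_labels = 4, 3
logits = torch.randn(bs, num_labels)

# regression:num_labels == 1 时,logits 与 labels squeeze 后都是 (bs,)
reg_logits = torch.randn(bs, 1)
reg_labels = torch.randn(bs)
reg_loss = MSELoss()(reg_logits.squeeze(), reg_labels.squeeze())

# single_label_classification:labels 为类别索引,形状 (bs,)
cls_labels = torch.randint(0, num_labels, (bs,))
cls_loss = CrossEntropyLoss()(logits.view(-1, num_labels), cls_labels.view(-1))

# multi_label_classification:labels 为 0/1 浮点多热向量,形状 (bs, num_labels)
multi_labels = torch.randint(0, 2, (bs, num_labels)).float()
multi_loss = BCEWithLogitsLoss()(logits, multi_labels)

print(reg_loss.item(), cls_loss.item(), multi_loss.item())
```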
@add_start_docstrings(
"""
DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
DISTILBERT_START_DOCSTRING,
)
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
super().__init__(config)
# 初始化 DistilBERT 模型
self.distilbert = DistilBertModel(config)
# 线性层用于输出 span 开始和结束的逻辑回归
self.qa_outputs = nn.Linear(config.dim, config.num_labels)
# 检查标签数是否为2,否则引发错误
if config.num_labels != 2:
raise ValueError(f"config.num_labels should be 2, but it is {config.num_labels}")
# Dropout 层
self.dropout = nn.Dropout(config.qa_dropout)
# 初始化权重并进行最终处理
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
return self.distilbert.get_position_embeddings()
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
size will add correct vectors at the end following the position encoding algorithm, whereas reducing
the size will remove vectors from the end.
"""
self.distilbert.resize_position_embeddings(new_num_position_embeddings)
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
@add_start_docstrings(
"""
DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
super().__init__(config)
self.num_labels = config.num_labels # 从配置中获取标签数量
self.distilbert = DistilBertModel(config) # 初始化 DistilBERT 模型
self.dropout = nn.Dropout(config.dropout) # 根据配置添加 dropout 层
self.classifier = nn.Linear(config.hidden_size, config.num_labels) # 添加线性分类器
# 初始化权重并应用最终处理
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
return self.distilbert.get_position_embeddings() # 返回 DistilBERT 模型的位置嵌入
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`):
The number of new position embedding matrix. If position embeddings are learned, increasing the size
will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
size will add correct vectors at the end following the position encoding algorithm, whereas reducing
the size will remove vectors from the end.
"""
self.distilbert.resize_position_embeddings(new_num_position_embeddings) # 调整 DistilBERT 模型的位置嵌入
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[TokenClassifierOutput, Tuple[torch.Tensor, ...]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# 根据函数定义,返回值可以是 TokenClassifierOutput 对象或者元组形式的 Tensor
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 DistilBERT 模型进行前向传播,获取输出结果
outputs = self.distilbert(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从输出结果中提取序列输出
sequence_output = outputs[0]
# 应用 dropout 操作
sequence_output = self.dropout(sequence_output)
# 使用分类器对序列输出进行分类,得到分类 logits
logits = self.classifier(sequence_output)
# 初始化损失为 None
loss = None
# 如果有标签输入,则计算交叉熵损失
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# 如果不需要返回字典,则按元组形式返回输出结果
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典形式的输出,构造 TokenClassifierOutput 对象返回
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
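`DistilBertForTokenClassification` 对每个 token 输出 `num_labels` 维的 logits,并在给定 labels 时计算交叉熵。下面是一个最小示例(使用随机初始化的权重,`num_labels=5` 与全零标签均为假设,仅演示形状):

```python
import torch
from transformers import AutoTokenizer, DistilBertConfig, DistilBertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
config = DistilBertConfig(num_labels=5)
model = DistilBertForTokenClassification(config)  # 随机权重,仅演示形状

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
labels = torch.zeros_like(inputs["input_ids"])  # 假设每个 token 都标为类别 0

outputs = model(**inputs, labels=labels)
print(outputs.logits.shape)  # (1, sequence_length, 5)
print(outputs.loss.item())   # 交叉熵损失(标量)
```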
"""
DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
"""
# 导入必要的库和模块
import torch
import torch.nn as nn
from .configuration_distilbert import DistilBertConfig
from .modeling_distilbert import DistilBertModel, DistilBertPreTrainedModel
from .file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from typing import Optional
from transformers.file_utils import ModelOutput, PretrainedConfig
# 定义 DistilBertForMultipleChoice 类,继承自 DistilBertPreTrainedModel
@add_start_docstrings(
"""
DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
def __init__(self, config: PretrainedConfig):
super().__init__(config)
# 初始化 DistilBert 模型
self.distilbert = DistilBertModel(config)
# 多选分类任务的预分类器
self.pre_classifier = nn.Linear(config.dim, config.dim)
# 用于二分类的线性层
self.classifier = nn.Linear(config.dim, 1)
# Dropout 层,用于防止过拟合
self.dropout = nn.Dropout(config.seq_classif_dropout)
# 初始化权重并应用最终处理
self.post_init()
def get_position_embeddings(self) -> nn.Embedding:
"""
Returns the position embeddings
"""
# 调用 DistilBertModel 的方法获取位置嵌入
return self.distilbert.get_position_embeddings()
def resize_position_embeddings(self, new_num_position_embeddings: int):
"""
Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.
Arguments:
new_num_position_embeddings (`int`)
The number of new position embeddings. If position embeddings are learned, increasing the size will add
newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
add correct vectors at the end following the position encoding algorithm, whereas reducing the size
will remove vectors from the end.
"""
# 调整 DistilBertModel 的位置嵌入
self.distilbert.resize_position_embeddings(new_num_position_embeddings)
@add_start_docstrings_to_model_forward(
DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> ModelOutput:
"""
Forward pass for DistilBertForMultipleChoice.
Args:
input_ids (Optional[torch.Tensor], optional):
Indices of input sequence tokens in the vocabulary.
attention_mask (Optional[torch.Tensor], optional):
Mask to avoid performing attention on padding token indices.
head_mask (Optional[torch.Tensor], optional):
Mask to nullify selected heads of the self-attention modules.
inputs_embeds (Optional[torch.Tensor], optional):
Optionally, instead of passing `input_ids`, you can directly pass an embedded representation.
labels (Optional[torch.LongTensor], optional):
Labels for computing the multiple choice classification loss.
output_attentions (Optional[bool], optional):
Whether to return attentions weights.
output_hidden_states (Optional[bool], optional):
Whether to return hidden states.
return_dict (Optional[bool], optional):
Whether to return a dictionary instead of a tuple.
**kwargs:
Additional keyword arguments for the DistilBertModel forward method.
Returns:
ModelOutput: A namedtuple with the model outputs: last_hidden_state, (optional) hidden_states, (optional) attentions.
"""
# 调用 DistilBertModel 的 forward 方法进行前向传播
return self.distilbert(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
**kwargs,
)
.\models\distilbert\modeling_flax_distilbert.py
import math
from typing import Callable, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import (
FlaxBaseModelOutput,
FlaxMaskedLMOutput,
FlaxMultipleChoiceModelOutput,
FlaxQuestionAnsweringModelOutput,
FlaxSequenceClassifierOutput,
FlaxTokenClassifierOutput,
)
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_distilbert import DistilBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "distilbert-base-uncased"
_CONFIG_FOR_DOC = "DistilBertConfig"
FLAX_DISTILBERT_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`numpy.ndarray` of shape `({0})`):
# 输入序列中的token索引数组,索引对应词汇表中的token。
# 可以使用`AutoTokenizer`获取这些索引。参见`PreTrainedTokenizer.encode`和`PreTrainedTokenizer.__call__`获取详细信息。
# [什么是input IDs?](../glossary#input-ids)
attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
# 避免对填充token索引执行注意力计算的掩码。掩码的取值范围为`[0, 1]`:
# - 1表示**不被掩盖**的token,
# - 0表示**被掩盖**的token。
# [什么是attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。查看返回的张量中`attentions`获取更多细节。
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。查看返回的张量中`hidden_states`获取更多细节。
return_dict (`bool`, *optional*):
# 是否返回`~utils.ModelOutput`而不是普通的元组。
"""
def get_angles(pos, i, d_model):
angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
return pos * angle_rates
"""
# 根据位置、索引和模型维度计算角度率,用于位置编码中的角度计算
"""
def positional_encoding(position, d_model):
angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
pos_encoding = angle_rads[np.newaxis, ...]
return jnp.array(pos_encoding)
"""
# 根据位置和模型维度生成位置编码的正弦和余弦模式
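下面用纯 NumPy 复述上面的正弦位置编码逻辑,演示输出形状以及"偶数维为 sin、奇数维为 cos"的模式(仅作示意,与上面的 JAX 版本逻辑一致):

```python
import numpy as np

def get_angles(pos, i, d_model):
    # 角度率:不同维度使用不同频率
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])  # 偶数维取 sin
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])  # 奇数维取 cos
    return angle_rads[np.newaxis, ...]

pe = positional_encoding(position=50, d_model=16)
print(pe.shape)        # (1, 50, 16)
print(pe[0, 0, 0::2])  # pos=0 时偶数维为 sin(0)=0
print(pe[0, 0, 1::2])  # pos=0 时奇数维为 cos(0)=1
```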
class FlaxEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self):
self.word_embeddings = nn.Embed(
self.config.vocab_size,
self.config.dim,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
if not self.config.sinusoidal_pos_embds:
self.position_embeddings = nn.Embed(
self.config.max_position_embeddings,
self.config.dim,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
else:
self.pos_encoding = positional_encoding(self.config.max_position_embeddings, self.config.dim)
self.LayerNorm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.dropout)
def __call__(self, input_ids, deterministic: bool = True):
# Embed
batch_size, seq_length = input_ids.shape
inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
if not self.config.sinusoidal_pos_embds:
position_ids = jnp.arange(seq_length).astype("i4")
position_ids = jnp.broadcast_to(position_ids, shape=(batch_size, seq_length))
position_embeds = self.position_embeddings(position_ids.astype("i4"))
else:
position_embeds = self.pos_encoding[:, :seq_length, :]
# explicitly cast the positions here, since self.embed_positions are not registered as parameters
position_embeds = position_embeds.astype(inputs_embeds.dtype)
# Sum all embeddings
hidden_states = inputs_embeds + position_embeds
# Layer Norm
hidden_states = self.LayerNorm(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
return hidden_states
class FlaxMultiHeadSelfAttention(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self):
self.n_heads = self.config.n_heads # 从配置中获取注意力头的数量
self.dim = self.config.dim # 从配置中获取模型维度
self.dropout = nn.Dropout(rate=self.config.attention_dropout) # 根据配置设置注意力机制中的dropout
if not (self.dim % self.n_heads == 0):
raise ValueError(f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}") # 检查隐藏层大小是否可以被注意力头的数量整除
self.q_lin = nn.Dense(
self.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 初始化用于query的线性层,输入维度为dim,输出维度为dim
self.k_lin = nn.Dense(
self.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 初始化用于key的线性层,输入维度为dim,输出维度为dim
self.v_lin = nn.Dense(
self.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 初始化用于value的线性层,输入维度为dim,输出维度为dim
self.out_lin = nn.Dense(
self.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 初始化输出层线性层,输入维度为dim,输出维度为dim
def __call__(
self,
query,
key,
value,
mask,
deterministic: bool = True,
output_attentions: bool = False,
):
bs, q_len, dim = query.shape # 获取query的形状信息,bs为batch size,q_len为query的长度,dim为维度
k_len = key.shape[1] # 获取key的长度
dim_per_head = self.dim // self.n_heads # 计算每个注意力头的维度
mask_reshp = (bs, 1, 1, k_len) # 重塑mask的形状用于后续操作
def shape(x):
"""分离头部"""
return x.reshape(bs, -1, self.n_heads, dim_per_head).transpose(0, 2, 1, 3) # 重塑张量x以分离注意力头
def unshape(x):
"""合并头部"""
return x.transpose(0, 2, 1, 3).reshape(bs, -1, self.n_heads * dim_per_head) # 重塑张量x以合并注意力头
q = shape(self.q_lin(query)) # 通过query的线性层进行形状分离,得到 (bs, n_heads, q_len, dim_per_head)
k = shape(self.k_lin(key)) # 通过key的线性层进行形状分离,得到 (bs, n_heads, k_len, dim_per_head)
v = shape(self.v_lin(value)) # 通过value的线性层进行形状分离,得到 (bs, n_heads, k_len, dim_per_head)
q = q / math.sqrt(dim_per_head) # 对query进行缩放,以便更好地计算注意力权重 (bs, n_heads, q_len, dim_per_head)
scores = jnp.matmul(q, k.transpose(0, 1, 3, 2)) # 计算注意力分数,形状为 (bs, n_heads, q_len, k_len)
mask = jnp.reshape(mask, mask_reshp) # 调整mask的形状以匹配注意力分数
mask = mask.astype(scores.dtype) # 将mask转换为与scores相同的数据类型
scores = scores - 1e30 * (1.0 - mask) # 将mask应用于scores,增加无效位置的大负数
weights = nn.softmax(scores, axis=-1) # 计算注意力权重,形状为 (bs, n_heads, q_len, k_len)
weights = self.dropout(weights, deterministic=deterministic) # 应用dropout到注意力权重
context = jnp.matmul(weights, v) # 计算上下文向量,形状为 (bs, n_heads, q_len, dim_per_head)
context = unshape(context) # 合并注意力头,形状为 (bs, q_len, dim)
context = self.out_lin(context) # 应用输出层线性层,形状为 (bs, q_len, dim)
if output_attentions:
return (context, weights) # 如果需要输出注意力权重,返回上下文向量和权重
else:
return (context,) # 否则只返回上下文向量
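`shape` / `unshape` 负责在注意力头维度上拆分与合并张量,mask 则通过减去一个大数(1e30)把无效位置的注意力权重压到接近 0。下面是一个独立的小示例(形状均为假设),验证拆分/合并互逆以及加性掩码的效果:

```python
import jax
import jax.numpy as jnp
import numpy as np

bs, seq_len, n_heads, dim_per_head = 2, 5, 4, 8
dim = n_heads * dim_per_head
x = jnp.asarray(np.random.randn(bs, seq_len, dim))

def shape(t):
    # (bs, len, dim) -> (bs, n_heads, len, dim_per_head):拆分注意力头
    return t.reshape(bs, -1, n_heads, dim_per_head).transpose(0, 2, 1, 3)

def unshape(t):
    # (bs, n_heads, len, dim_per_head) -> (bs, len, dim):合并注意力头
    return t.transpose(0, 2, 1, 3).reshape(bs, -1, n_heads * dim_per_head)

print(jnp.allclose(unshape(shape(x)), x))  # True:拆分与合并互逆

# 加性掩码:对无效位置减去 1e30,softmax 后这些位置的权重约等于 0
scores = jnp.zeros((bs, n_heads, seq_len, seq_len))
mask = jnp.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 0]])          # (bs, k_len)
mask = mask.reshape(bs, 1, 1, seq_len).astype(scores.dtype)
weights = jax.nn.softmax(scores - 1e30 * (1.0 - mask), axis=-1)
print(weights[0, 0, 0])  # 前 3 个位置约为 1/3,其余约为 0
```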
class FlaxFFN(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
def setup(self):
self.dropout = nn.Dropout(rate=self.config.dropout) # 设置dropout层
self.chunk_size_feed_forward = self.config.chunk_size_feed_forward # 前馈层的块大小
self.seq_len_dim = 1 # 序列长度维度为1
self.lin1 = nn.Dense(
self.config.hidden_dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 第一个全连接层,使用正态分布初始化权重
self.lin2 = nn.Dense(
self.config.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
) # 第二个全连接层,使用正态分布初始化权重
self.activation = ACT2FN[self.config.activation] # 激活函数
def __call__(self, hidden_states, deterministic: bool = True):
hidden_states = self.lin1(hidden_states) # 第一个全连接层的计算
hidden_states = self.activation(hidden_states) # 激活函数的应用
hidden_states = self.lin2(hidden_states) # 第二个全连接层的计算
hidden_states = self.dropout(hidden_states, deterministic=deterministic) # dropout操作
return hidden_states
class FlaxTransformerBlock(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
def setup(self):
assert (
self.config.dim % self.config.n_heads == 0
), f"Hidden size {self.config.dim} not dividable by number of heads {self.config.n_heads}" # 断言,确保隐藏大小可以被头数整除
self.attention = FlaxMultiHeadSelfAttention(self.config, dtype=self.dtype) # 多头自注意力机制
self.sa_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) # 自注意力层的LayerNorm
self.ffn = FlaxFFN(self.config, dtype=self.dtype) # 前馈网络
self.output_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) # 输出层的LayerNorm
def __call__(
self,
hidden_states,
attn_mask,
output_attentions: bool = False,
deterministic: bool = True,
):
# 自注意力
sa_output = self.attention(
query=hidden_states,
key=hidden_states,
value=hidden_states,
mask=attn_mask,
output_attentions=output_attentions,
deterministic=deterministic,
)
if output_attentions:
sa_output, sa_weights = sa_output # 如果需要输出注意力权重,则获取权重
else:
assert type(sa_output) == tuple
sa_output = sa_output[0] # 否则,获取自注意力的输出
sa_output = self.sa_layer_norm(sa_output + hidden_states) # 应用LayerNorm
# 前馈网络
ffn_output = self.ffn(sa_output, deterministic=deterministic) # 前馈网络的计算
ffn_output = self.output_layer_norm(ffn_output + sa_output) # 应用LayerNorm
output = (ffn_output,) # 输出结果为元组
if output_attentions:
output = (sa_weights,) + output # 如果需要输出注意力权重,则将权重添加到输出中
return output
class FlaxTransformer(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算的数据类型
def setup(self):
self.layers = [
FlaxTransformerBlock(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.n_layers)
] # 创建多个TransformerBlock层的列表
# 定义一个可调用的方法,用于执行模型的前向传播
def __call__(
self,
hidden_states,
attention_mask,
output_attentions: bool = False,
output_hidden_states: bool = False,
deterministic: bool = True,
return_dict: bool = False,
):
# 如果输出隐藏状态,初始化存储所有隐藏状态的元组,否则为None
all_hidden_states = () if output_hidden_states else None
# 如果输出注意力权重,初始化存储所有注意力权重的元组,否则为None
all_attentions = () if output_attentions else None
# 遍历所有的层模块
for layer_module in self.layers:
# 如果需要输出隐藏状态,将当前的隐藏状态添加到所有隐藏状态的元组中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 调用当前层模块的前向传播方法,获取该层的输出
layer_outputs = layer_module(
hidden_states=hidden_states,
attn_mask=attention_mask,
output_attentions=output_attentions,
deterministic=deterministic,
)
# 更新隐藏状态为当前层的输出的最后一个值
hidden_states = layer_outputs[-1]
# 如果需要输出注意力权重
if output_attentions:
# 确保当前层的输出包含两个元素(注意力权重和其他)
assert len(layer_outputs) == 2
# 获取注意力权重,并添加到所有注意力权重的元组中
attentions = layer_outputs[0]
all_attentions = all_attentions + (attentions,)
else:
# 确保当前层的输出只包含一个元素(隐藏状态)
assert len(layer_outputs) == 1
# 添加最后一层的隐藏状态到所有隐藏状态的元组中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 如果不需要返回字典,则返回包含非None值的元组
if not return_dict:
return tuple(v for v in [hidden_states, all_attentions, all_hidden_states] if v is not None)
# 如果需要返回字典,则创建并返回FlaxBaseModelOutput对象
return FlaxBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
class FlaxTransformerEncoder(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self):
self.layer = FlaxTransformer(self.config, dtype=self.dtype)
# 初始化 FlaxTransformer 层,使用给定的配置和数据类型
def __call__(
self,
hidden_states,
attention_mask,
output_attentions: bool = False,
output_hidden_states: bool = False,
deterministic: bool = True,
return_dict: bool = False,
):
return self.layer(
hidden_states=hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
deterministic=deterministic,
return_dict=return_dict,
)
# 调用 FlaxTransformer 层,传递输入参数并返回结果
class FlaxDistilBertLMDecoder(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
def setup(self):
self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
# 初始化偏置参数 self.bias,大小为词汇表大小,使用 bias_init 初始化器
def __call__(self, inputs, kernel):
inputs = jnp.asarray(inputs, self.dtype)
kernel = jnp.asarray(kernel, self.dtype)
y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())))
# 执行高效的矩阵乘法操作,inputs 和 kernel 是输入张量
bias = jnp.asarray(self.bias, self.dtype)
y = y + bias
# 将偏置加到输出 y 上
return y
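`FlaxDistilBertLMDecoder` 用 `lax.dot_general` 把隐藏状态与转置后的共享词嵌入相乘,再加上偏置。上述维度约定等价于普通矩阵乘法,下面用随机小张量验证这一点(形状均为假设):

```python
import numpy as np
import jax.numpy as jnp
from jax import lax

bs, seq_len, dim, vocab_size = 2, 4, 8, 11
inputs = jnp.asarray(np.random.randn(bs, seq_len, dim))
# 共享词嵌入矩阵形状为 (vocab_size, dim),解码时使用其转置 (dim, vocab_size)
kernel = jnp.asarray(np.random.randn(vocab_size, dim)).T

# 与 FlaxDistilBertLMDecoder 相同的维度约定:收缩 inputs 的最后一维与 kernel 的第 0 维
y_dot_general = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())))
y_matmul = inputs @ kernel  # 等价的普通矩阵乘法

print(y_dot_general.shape)                    # (2, 4, 11)
print(jnp.allclose(y_dot_general, y_matmul))  # True
```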
class FlaxDistilBertPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DistilBertConfig
base_model_prefix = "distilbert"
module_class: nn.Module = None
def __init__(
self,
config: DistilBertConfig,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 使用给定的配置和数据类型初始化模块
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化权重函数
input_ids = jnp.zeros(input_shape, dtype="i4")
attention_mask = jnp.ones_like(input_ids)
# 创建输入张量和注意力掩码,使用默认值
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
# 分割随机数生成器以用于参数初始化和 dropout
random_params = self.module.init(rngs, input_ids, attention_mask, return_dict=False)["params"]
# 使用随机数初始化模块的参数
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
# 如果有指定的参数,则将缺失的键补充为随机初始化的参数,并返回完整的参数字典
else:
return random_params
# 否则,直接返回随机初始化的参数
# 添加模型调用的前向传播文档字符串,描述输入参数为批大小和序列长度
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# 定义调用方法,接受多个输入参数用于模型推理
def __call__(
self,
input_ids, # 输入的token IDs序列
attention_mask=None, # 注意力掩码,指示哪些位置是有效的
head_mask=None, # 头掩码,控制不同的注意力头的掩码
params: dict = None, # 参数字典,用于加载模型参数
dropout_rng: jax.random.PRNGKey = None, # 随机数生成器密钥,用于Dropout操作
train: bool = False, # 指示是否为训练模式
output_attentions: Optional[bool] = None, # 是否输出注意力权重
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态
return_dict: Optional[bool] = None, # 是否返回字典格式的输出
):
# 如果未提供attention_mask,则默认为全1,即所有位置都是有效的
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
# 处理可能需要的任何随机数生成器
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
# 调用模型的apply方法进行前向传播
return self.module.apply(
{"params": params or self.params}, # 使用给定的参数或默认的模型参数
jnp.array(input_ids, dtype="i4"), # 转换输入token IDs为JAX数组
jnp.array(attention_mask, dtype="i4"), # 转换注意力掩码为JAX数组
not train, # 转换训练标志为相反值,用于控制模型是否在推理模式下运行
output_attentions, # 是否输出注意力权重
output_hidden_states, # 是否输出隐藏状态
return_dict, # 是否返回字典格式的输出
rngs=rngs, # 传递随机数生成器密钥到模型的apply方法中
)
class FlaxDistilBertModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 初始化嵌入层对象,使用给定的配置和数据类型
self.embeddings = FlaxEmbeddings(self.config, dtype=self.dtype)
# 初始化变换器编码器对象,使用给定的配置和数据类型
self.transformer = FlaxTransformerEncoder(self.config, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 如果输出注意力权重未指定,则使用配置中的默认设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果输出隐藏状态未指定,则使用配置中的默认设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果返回字典未指定,则使用配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 获取输入的嵌入表示
input_embeds = self.embeddings(input_ids, deterministic=deterministic)
# 调用变换器编码器进行处理
return self.transformer(
hidden_states=input_embeds,
attention_mask=attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
@add_start_docstrings(
"输出原始隐藏状态的DistilBert模型变换器,没有特定的输出头部。",
FLAX_DISTILBERT_START_DOCSTRING,
)
class FlaxDistilBertModel(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertModule
append_call_sample_docstring(FlaxDistilBertModel, _CHECKPOINT_FOR_DOC, None, _CONFIG_FOR_DOC)
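下面是 `FlaxDistilBertModel` 的一个最小调用示例(假设可以下载 `distilbert-base-uncased` 的 Flax 权重):

```python
from transformers import AutoTokenizer, FlaxDistilBertModel

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = FlaxDistilBertModel.from_pretrained("distilbert-base-uncased")

# return_tensors="np" 生成 numpy 数组,可直接传给 Flax 模型
inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (1, sequence_length, 768)
```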
class FlaxDistilBertForMaskedLMModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32 # 计算时使用的数据类型
def setup(self):
# 初始化DistilBert模型对象,使用给定的配置和数据类型
self.distilbert = FlaxDistilBertModule(self.config, dtype=self.dtype)
# 初始化词汇变换层,使用给定的维度和正态分布的初始化方式
self.vocab_transform = nn.Dense(
self.config.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
# 初始化词汇层归一化,设定epsilon为1e-12,使用给定的数据类型
self.vocab_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype)
# 如果需要绑定词嵌入,则初始化DistilBert语言模型解码器
if self.config.tie_word_embeddings:
self.vocab_projector = FlaxDistilBertLMDecoder(
self.config,
dtype=self.dtype,
)
else:
# 否则初始化普通的Dense层作为词汇投影器
self.vocab_projector = nn.Dense(
self.config.vocab_size,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 如果 return_dict 为 None,则根据配置决定是否使用 self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 DistilBERT 模型处理输入,获取输出
dlbrt_output = self.distilbert(
input_ids=input_ids, # 输入的 token IDs
attention_mask=attention_mask, # 注意力掩码
output_attentions=output_attentions, # 是否输出注意力权重
output_hidden_states=output_hidden_states, # 是否输出隐藏状态
deterministic=deterministic, # 是否确定性运行
return_dict=return_dict, # 是否返回字典形式的输出
)
# 获取隐藏状态作为预测的 logits
hidden_states = dlbrt_output[0]
# 使用 vocab_transform 对隐藏状态进行转换得到预测 logits
prediction_logits = self.vocab_transform(hidden_states)
# 根据配置中的激活函数对 logits 进行激活
prediction_logits = ACT2FN[self.config.activation](prediction_logits)
# 对激活后的 logits 进行 layer normalization
prediction_logits = self.vocab_layer_norm(prediction_logits)
# 如果配置指定共享词嵌入,则使用 distilbert 中的词嵌入与 logits 进行投影
if self.config.tie_word_embeddings:
shared_embedding = self.distilbert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
prediction_logits = self.vocab_projector(prediction_logits, shared_embedding.T)
else:
prediction_logits = self.vocab_projector(prediction_logits)
# 如果不需要以字典形式返回结果,则返回 logits 与其它输出
if not return_dict:
output = (prediction_logits,) + dlbrt_output[1:] # 构建输出元组
return output
# 以 FlaxMaskedLMOutput 类型返回输出结果,包含 logits、隐藏状态和注意力权重
return FlaxMaskedLMOutput(
logits=prediction_logits, # 预测 logits
hidden_states=dlbrt_output.hidden_states, # 隐藏状态
attentions=dlbrt_output.attentions, # 注意力权重
)
@add_start_docstrings("""DistilBert Model with a `language modeling` head on top.""", FLAX_DISTILBERT_START_DOCSTRING)
class FlaxDistilBertForMaskedLM(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForMaskedLMModule
# 定义了一个基于 FlaxDistilBertPreTrainedModel 的 FlaxDistilBertForMaskedLM 类,它带有一个 `language modeling` 头部。
append_call_sample_docstring(FlaxDistilBertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC)
# 调用 append_call_sample_docstring 函数,为 FlaxDistilBertForMaskedLM 类添加文档字符串示例。
class FlaxDistilBertForSequenceClassificationModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
self.pre_classifier = nn.Dense(
self.config.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout)
self.classifier = nn.Dense(
self.config.num_labels,
dtype=self.dtype,
)
# 定义了一个 FlaxDistilBertForSequenceClassificationModule 类,继承自 nn.Module,用于序列分类任务。在 setup 方法中初始化了 DistilBERT 模块、预分类器、Dropout 和分类器。
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 定义了 __call__ 方法,实现了对输入数据的处理和前向传播,支持不同的返回格式选项。
distilbert_output = self.distilbert(
input_ids,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 调用 self.distilbert 对输入进行处理,得到 DistilBERT 模型的输出。
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
pooled_output = ACT2FN["relu"](pooled_output)
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
logits = self.classifier(pooled_output) # (bs, num_labels)
# 对DistilBERT模型的输出进行处理:提取池化输出、通过预分类器和ReLU激活、应用Dropout,最后由分类器得到logits。
if not return_dict:
return (logits,) + distilbert_output[1:]
return FlaxSequenceClassifierOutput(
logits=logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
# 根据 return_dict 的设置决定返回的结果格式,可以选择返回元组或者包含 logits、隐藏状态和注意力的 FlaxSequenceClassifierOutput 对象。
@add_start_docstrings(
"""
DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
FLAX_DISTILBERT_START_DOCSTRING,
)
class FlaxDistilBertForSequenceClassification(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForSequenceClassificationModule
# 定义了一个 FlaxDistilBertForSequenceClassification 类,继承自 FlaxDistilBertPreTrainedModel,是带有序列分类/回归头部的 DistilBERT 模型。
append_call_sample_docstring(
FlaxDistilBertForSequenceClassification,
_CHECKPOINT_FOR_DOC,
FlaxSequenceClassifierOutput,
_CONFIG_FOR_DOC,
)
# 调用 append_call_sample_docstring 函数,为 FlaxDistilBertForSequenceClassification 类添加文档字符串示例。
class FlaxDistilBertForMultipleChoiceModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32
# 定义了一个 FlaxDistilBertForMultipleChoiceModule 类,继承自 nn.Module,用于多项选择任务。
# 初始化模型的各个组件,包括DistilBERT模块、预分类器、Dropout层和分类器
def setup(self):
self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
self.pre_classifier = nn.Dense(
self.config.dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout)
self.classifier = nn.Dense(
1,
dtype=self.dtype,
)
# 模型的调用方法,接收输入的token IDs和attention mask,并返回多项选择任务的结果
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 根据参数设定是否使用配置中指定的返回字典方式
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 计算多项选择的数量
num_choices = input_ids.shape[1]
# 将输入的token IDs重新调整形状以便传递给模型
input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
# 将输入的attention mask重新调整形状以便传递给模型
attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
# 使用DistilBERT模型处理输入,返回模型的输出
outputs = self.distilbert(
input_ids,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型的隐藏状态
hidden_state = outputs[0]
# 从隐藏状态中提取池化输出,一般是第一个位置的隐藏状态
pooled_output = hidden_state[:, 0]
# 通过预分类器处理池化输出
pooled_output = self.pre_classifier(pooled_output)
# 应用ReLU激活函数到处理后的池化输出
pooled_output = ACT2FN["relu"](pooled_output)
# 使用Dropout层对处理后的池化输出进行随机失活
pooled_output = self.dropout(pooled_output, deterministic=deterministic)
# 使用分类器计算最终的logits
logits = self.classifier(pooled_output)
# 将logits重新调整形状以适应多项选择的格式
reshaped_logits = logits.reshape(-1, num_choices)
# 如果不使用返回字典的方式,则返回调整形状后的logits和额外的隐藏状态
if not return_dict:
return (reshaped_logits,) + outputs[2:]
# 如果使用返回字典的方式,则返回FlaxMultipleChoiceModelOutput对象
return FlaxMultipleChoiceModelOutput(
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
FLAX_DISTILBERT_START_DOCSTRING,
)
class FlaxDistilBertForMultipleChoice(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForMultipleChoiceModule
overwrite_call_docstring(
FlaxDistilBertForMultipleChoice, DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
append_call_sample_docstring(
FlaxDistilBertForMultipleChoice,
_CHECKPOINT_FOR_DOC,
FlaxMultipleChoiceModelOutput,
_CONFIG_FOR_DOC,
)
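多项选择任务要求输入形状为 (batch_size, num_choices, sequence_length)。下面的示意(检查点与文本均为演示假设)展示了如何构造这一输入:
```
import numpy as np
from transformers import AutoTokenizer, FlaxDistilBertForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = FlaxDistilBertForMultipleChoice.from_pretrained("distilbert-base-uncased")

prompt = "The cat sat on"
choices = ["the mat.", "the moon."]
encoding = tokenizer([prompt, prompt], choices, return_tensors="np", padding=True)
# 补出 num_choices 维度: (batch_size=1, num_choices=2, seq_length)
inputs = {k: np.expand_dims(v, 0) for k, v in encoding.items()}
logits = model(**inputs).logits  # 形状为 (batch_size, num_choices)
```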
class FlaxDistilBertForTokenClassificationModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.dropout)
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Model
outputs = self.distilbert(
input_ids,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
logits = self.classifier(hidden_states)
if not return_dict:
return (logits,) + outputs[1:]
return FlaxTokenClassifierOutput(
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
FLAX_DISTILBERT_START_DOCSTRING,
)
class FlaxDistilBertForTokenClassification(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForTokenClassificationModule
append_call_sample_docstring(
FlaxDistilBertForTokenClassification,
_CHECKPOINT_FOR_DOC,
FlaxTokenClassifierOutput,
_CONFIG_FOR_DOC,
)
class FlaxDistilBertForQuestionAnsweringModule(nn.Module):
config: DistilBertConfig
dtype: jnp.dtype = jnp.float32
# 初始化模型的方法,设置各个组件
def setup(self):
# 创建一个 DistilBERT 模型实例,使用给定的配置和数据类型
self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype)
# 创建一个全连接层,输出维度为 num_labels,用于计算答案起始/结束位置的 logits
self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
# 断言模型需要输出的类别数为2
assert self.config.num_labels == 2
# 创建一个 Dropout 层,用于在训练过程中随机丢弃部分输入以防止过拟合
self.dropout = nn.Dropout(rate=self.config.qa_dropout)
# 模型调用方法,接受输入并返回模型预测结果
def __call__(
self,
input_ids,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 根据参数设置是否返回字典形式的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用 DistilBERT 模型进行前向传播
distilbert_output = self.distilbert(
input_ids,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型输出中的隐藏状态
hidden_states = distilbert_output[0]
# 使用 Dropout 层对隐藏状态进行随机丢弃
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# 将处理后的隐藏状态输入到全连接层中,得到最终的分类 logits
logits = self.qa_outputs(hidden_states)
# 将 logits 按照类别数目分割成起始和结束 logits
start_logits, end_logits = logits.split(self.config.num_labels, axis=-1)
# 去除不必要的维度
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
# 根据是否需要返回字典形式的输出进行处理并返回
if not return_dict:
# 如果不返回字典,则返回元组形式的输出
return (start_logits, end_logits) + distilbert_output[1:]
# 返回 FlaxQuestionAnsweringModelOutput 类的实例,包含起始 logits、结束 logits、隐藏状态和注意力权重
return FlaxQuestionAnsweringModelOutput(
start_logits=start_logits,
end_logits=end_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
@add_start_docstrings(
"""
DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
FLAX_DISTILBERT_START_DOCSTRING,
)
这部分代码是一个装饰器函数调用,用于给 `FlaxDistilBertForQuestionAnswering` 类添加文档字符串。文档字符串描述了该类的作用,说明它是基于 DistilBert 模型的,具有用于提取式问答任务(如 SQuAD)的分类头部(在隐藏状态输出的基础上进行线性层计算,生成 `span start logits` 和 `span end logits`)。
class FlaxDistilBertForQuestionAnswering(FlaxDistilBertPreTrainedModel):
module_class = FlaxDistilBertForQuestionAnsweringModule
定义了一个新的类 `FlaxDistilBertForQuestionAnswering`,继承自 `FlaxDistilBertPreTrainedModel`。`module_class` 被设置为 `FlaxDistilBertForQuestionAnsweringModule`,用于模型内部的模块处理。
append_call_sample_docstring(
FlaxDistilBertForQuestionAnswering,
_CHECKPOINT_FOR_DOC,
FlaxQuestionAnsweringModelOutput,
_CONFIG_FOR_DOC,
)
这是一个函数调用,用于向 `FlaxDistilBertForQuestionAnswering` 类添加调用示例的文档字符串。它会附加一个关于模型如何调用的示例文档字符串,包括 `_CHECKPOINT_FOR_DOC`(用于模型检查点)、`FlaxQuestionAnsweringModelOutput`(模型输出)和 `_CONFIG_FOR_DOC`(模型配置)。
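下面给出一个抽取式问答的用法示意(检查点与问答文本均为演示假设):
```
from transformers import AutoTokenizer, FlaxDistilBertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = FlaxDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

question = "Who wrote Hamlet?"
context = "Hamlet is a tragedy written by William Shakespeare."
inputs = tokenizer(question, context, return_tensors="np")
outputs = model(**inputs)
start_index = outputs.start_logits.argmax(-1)  # 答案起始位置
end_index = outputs.end_logits.argmax(-1)      # 答案结束位置
```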
.\models\distilbert\modeling_tf_distilbert.py
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_distilbert import DistilBertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "distilbert-base-uncased"
_CONFIG_FOR_DOC = "DistilBertConfig"
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased",
"distilbert-base-uncased-distilled-squad",
"distilbert-base-cased",
"distilbert-base-cased-distilled-squad",
"distilbert-base-multilingual-cased",
"distilbert-base-uncased-finetuned-sst-2-english",
]
class TFEmbeddings(keras.layers.Layer):
"""构建由单词、位置和标记类型嵌入组成的嵌入层。"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.dim = config.dim
self.initializer_range = config.initializer_range
self.max_position_embeddings = config.max_position_embeddings
self.LayerNorm = keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.dropout)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.dim],
initializer=get_initializer(initializer_range=self.initializer_range),
)
with tf.name_scope("position_embeddings"):
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.dim],
initializer=get_initializer(initializer_range=self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.dim])
def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if position_ids is None:
position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
final_embeddings = inputs_embeds + position_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
class TFMultiHeadSelfAttention(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.n_heads = config.n_heads
self.dim = config.dim
self.dropout = keras.layers.Dropout(config.attention_dropout)
self.output_attentions = config.output_attentions
assert self.dim % self.n_heads == 0, f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}"
self.q_lin = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin"
)
self.k_lin = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin"
)
self.v_lin = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin"
)
self.out_lin = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin"
)
self.pruned_heads = set()
self.config = config
def prune_heads(self, heads):
raise NotImplementedError
def call(self, query, key, value, mask, head_mask, output_attentions, training=False):
"""
Parameters:
query: tf.Tensor(bs, seq_length, dim)
key: tf.Tensor(bs, seq_length, dim)
value: tf.Tensor(bs, seq_length, dim)
mask: tf.Tensor(bs, seq_length)
Returns:
weights: tf.Tensor(bs, n_heads, seq_length, seq_length) Attention weights context: tf.Tensor(bs,
seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
"""
bs, q_length, dim = shape_list(query)
k_length = shape_list(key)[1]
dim_per_head = int(self.dim / self.n_heads)
dim_per_head = tf.cast(dim_per_head, dtype=tf.int32)
mask_reshape = [bs, 1, 1, k_length]
def shape(x):
"""将张量按照注意力头分离"""
return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
def unshape(x):
"""将张量的注意力头重新组合"""
return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
q = shape(self.q_lin(query))
k = shape(self.k_lin(key))
v = shape(self.v_lin(value))
q = tf.cast(q, dtype=tf.float32)
q = tf.multiply(q, tf.math.rsqrt(tf.cast(dim_per_head, dtype=tf.float32)))
k = tf.cast(k, dtype=q.dtype)
scores = tf.matmul(q, k, transpose_b=True)
mask = tf.reshape(mask, mask_reshape)
mask = tf.cast(mask, dtype=scores.dtype)
scores = scores - 1e30 * (1.0 - mask)
weights = stable_softmax(scores, axis=-1)
weights = self.dropout(weights, training=training)
if head_mask is not None:
weights = weights * head_mask
context = tf.matmul(weights, v)
context = unshape(context)
context = self.out_lin(context)
if output_attentions:
return (context, weights)
else:
return (context,)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q_lin", None) is not None:
with tf.name_scope(self.q_lin.name):
self.q_lin.build([None, None, self.config.dim])
if getattr(self, "k_lin", None) is not None:
with tf.name_scope(self.k_lin.name):
self.k_lin.build([None, None, self.config.dim])
if getattr(self, "v_lin", None) is not None:
with tf.name_scope(self.v_lin.name):
self.v_lin.build([None, None, self.config.dim])
if getattr(self, "out_lin", None) is not None:
with tf.name_scope(self.out_lin.name):
self.out_lin.build([None, None, self.config.dim])
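为了直观理解上面 shape()/unshape() 两个辅助函数对注意力头的拆分与合并,这里给出一个独立的小示例(其中的张量尺寸均为演示假设):
```
import tensorflow as tf

bs, seq_len, dim, n_heads = 2, 5, 12, 3
dim_per_head = dim // n_heads
x = tf.random.normal((bs, seq_len, dim))

# 对应 shape():拆分出注意力头 -> (bs, n_heads, seq_len, dim_per_head)
split = tf.transpose(tf.reshape(x, (bs, -1, n_heads, dim_per_head)), perm=(0, 2, 1, 3))
# 对应 unshape():重新合并注意力头 -> (bs, seq_len, dim)
merged = tf.reshape(tf.transpose(split, perm=(0, 2, 1, 3)), (bs, -1, n_heads * dim_per_head))
print(split.shape, merged.shape)  # (2, 3, 5, 4) (2, 5, 12)
```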
class TFFFN(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dropout = keras.layers.Dropout(config.dropout)
self.lin1 = keras.layers.Dense(
config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
)
self.lin2 = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2"
)
self.activation = get_tf_activation(config.activation)
self.config = config
def call(self, input, training=False):
x = self.lin1(input)
x = self.activation(x)
x = self.lin2(x)
x = self.dropout(x, training=training)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "lin1", None) is not None:
with tf.name_scope(self.lin1.name):
self.lin1.build([None, None, self.config.dim])
if getattr(self, "lin2", None) is not None:
with tf.name_scope(self.lin2.name):
self.lin2.build([None, None, self.config.hidden_dim])
class TFTransformerBlock(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.n_heads = config.n_heads
self.dim = config.dim
self.hidden_dim = config.hidden_dim
self.dropout = keras.layers.Dropout(config.dropout)
self.activation = config.activation
self.output_attentions = config.output_attentions
assert (
config.dim % config.n_heads == 0
), f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}"
self.attention = TFMultiHeadSelfAttention(config, name="attention")
self.sa_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
self.ffn = TFFFN(config, name="ffn")
self.output_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
self.config = config
def call(self, x, attn_mask, head_mask, output_attentions, training=False):
"""
Parameters:
x: tf.Tensor(bs, seq_length, dim)
输入张量,形状为(batch_size, 序列长度, 维度)
attn_mask: tf.Tensor(bs, seq_length)
注意力掩码张量,形状为(batch_size, 序列长度),用于屏蔽无效位置的注意力
head_mask: Not used in this function
该参数在本函数中未使用
output_attentions: bool
是否输出注意力权重张量
training: bool, optional
是否处于训练模式,默认为False
Outputs:
sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
注意力权重张量,形状为(batch_size, 注意力头数, 序列长度, 序列长度)
ffn_output: tf.Tensor(bs, seq_length, dim)
变换器块的输出张量,形状为(batch_size, 序列长度, 维度)
"""
sa_output = self.attention(x, x, x, attn_mask, head_mask, output_attentions, training=training)
if output_attentions:
sa_output, sa_weights = sa_output
else:
sa_output = sa_output[0]
sa_output = self.sa_layer_norm(sa_output + x)
ffn_output = self.ffn(sa_output, training=training)
ffn_output = self.output_layer_norm(ffn_output + sa_output)
output = (ffn_output,)
if output_attentions:
output = (sa_weights,) + output
return output
def build(self, input_shape=None):
"""
构建模型的方法,用于初始化相关层的参数和变量。
Parameters:
input_shape: Not used in this function
该参数在本函数中未使用
"""
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "sa_layer_norm", None) is not None:
with tf.name_scope(self.sa_layer_norm.name):
self.sa_layer_norm.build([None, None, self.config.dim])
if getattr(self, "ffn", None) is not None:
with tf.name_scope(self.ffn.name):
self.ffn.build(None)
if getattr(self, "output_layer_norm", None) is not None:
with tf.name_scope(self.output_layer_norm.name):
self.output_layer_norm.build([None, None, self.config.dim])
class TFTransformer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.n_layers = config.n_layers
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.layer = [TFTransformerBlock(config, name=f"layer_._{i}") for i in range(config.n_layers)]
def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict, training=False):
"""
Parameters:
x: tf.Tensor(bs, seq_length, dim) 输入序列的嵌入表示
attn_mask: tf.Tensor(bs, seq_length) 序列的注意力掩码
Returns:
hidden_state: tf.Tensor(bs, seq_length, dim)
最后(顶层)层的隐藏状态序列
all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
长度为n_layers的元组,包含每一层的隐藏状态序列
可选:仅在output_hidden_states=True时返回
all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
长度为n_layers的元组,包含每一层的注意力权重
可选:仅在output_attentions=True时返回
"""
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_state = x
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
layer_outputs = layer_module(hidden_state, attn_mask, head_mask[i], output_attentions, training=training)
hidden_state = layer_outputs[-1]
if output_attentions:
assert len(layer_outputs) == 2
attentions = layer_outputs[0]
all_attentions = all_attentions + (attentions,)
else:
assert len(layer_outputs) == 1, f"Incorrect number of outputs {len(layer_outputs)} instead of 1"
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
if not return_dict:
return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFDistilBertMainLayer(keras.layers.Layer):
config_class = DistilBertConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.num_hidden_layers = config.num_hidden_layers
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.embeddings = TFEmbeddings(config, name="embeddings")
self.transformer = TFTransformer(config, name="transformer")
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, value):
self.embeddings.weight = value
self.embeddings.vocab_size = value.shape[0]
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = tf.ones(input_shape)
attention_mask = tf.cast(attention_mask, dtype=tf.float32)
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds)
tfmr_output = self.transformer(
embedding_output,
attention_mask,
head_mask,
output_attentions,
output_hidden_states,
return_dict,
training=training,
)
return tfmr_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
class TFDistilBertPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DistilBertConfig
base_model_prefix = "distilbert"
DISTILBERT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
# Args: 输入参数说明开始
input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
# 输入序列标记在词汇表中的索引
Indices of input sequence tokens in the vocabulary.
# 通过 [`AutoTokenizer`] 可以获取输入的索引。参见 [`PreTrainedTokenizer.__call__`] 和 [`PreTrainedTokenizer.encode`] 获取详细信息。
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
[`PreTrainedTokenizer.encode`] for details.
# [What are input IDs?](../glossary#input-ids) 输入 ID 是什么?
attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
# 注意力掩码,用于避免对填充的标记索引执行注意力操作。掩码值在 `[0, 1]` 之间:
# - 1 表示**未掩码**的标记,
# - 0 表示**已掩码**的标记。
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
# [What are attention masks?](../glossary#attention-mask) 注意力掩码是什么?
head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 自注意力模块中要屏蔽的头部的掩码。掩码值在 `[0, 1]` 之间:
# - 1 表示**未掩码**的头部,
# - 0 表示**已掩码**的头部。
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
# 可选项,可以直接传递嵌入表示而不是传递 `input_ids`。如果希望更好地控制如何将 `input_ids` 索引转换为关联向量,这很有用。
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。查看返回张量中的 `attentions` 获取更多细节。此参数仅在 eager 模式下使用,在图模式下将使用配置中的值。
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。查看返回张量中的 `hidden_states` 获取更多细节。此参数仅在 eager 模式下使用,在图模式下将使用配置中的值。
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
# 是否返回 [`~utils.ModelOutput`] 而不是普通元组。此参数可以在 eager 模式下使用,在图模式下该值将始终为 True。
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
# 是否使用模型处于训练模式(某些模块如 dropout 在训练和评估之间有不同的行为)。
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
TFDistilBertModel 类定义了一个基于 DistilBERT 模型的编码器/转换器,不添加特定的输出头部。
@parameters:config - DistilBERT 模型的配置
*inputs - 输入参数
**kwargs - 额外的关键字参数
@returns:DistilBERT 模型的输出结果
"""
@add_start_docstrings(
"The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
outputs = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
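下面是 TFDistilBertModel 的一个最小用法示意(检查点与输入文本均为演示假设):
```
from transformers import AutoTokenizer, TFDistilBertModel

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state  # 形状为 (batch_size, seq_length, dim)
```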
class TFDistilBertLMHead(keras.layers.Layer):
"""
TFDistilBertLMHead 类定义了 DistilBERT 的语言模型头部。
@parameters:config - DistilBERT 的配置
input_embeddings - 输入的嵌入层
"""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.dim = config.dim
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape):
"""
建立语言模型头部的权重。
@parameters:input_shape - 输入形状
"""
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def get_output_embeddings(self):
"""
获取输出嵌入层。
@returns:输入嵌入层
"""
return self.input_embeddings
def set_output_embeddings(self, value):
"""
设置输出嵌入层。
@parameters:value - 新的嵌入层权重
"""
self.input_embeddings.weight = value
self.input_embeddings.vocab_size = shape_list(value)[0]
def get_bias(self):
"""
获取偏置项。
@returns:偏置项字典
"""
return {"bias": self.bias}
def set_bias(self, value):
"""
设置偏置项。
@parameters:value - 新的偏置项值
"""
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
# 定义一个方法 `call`,接受 `hidden_states` 参数
def call(self, hidden_states):
# 获取 `hidden_states` 张量的序列长度
seq_length = shape_list(tensor=hidden_states)[1]
# 将 `hidden_states` 张量重塑为二维张量,形状为 [-1, self.dim]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim])
# 对重塑后的张量 `hidden_states` 与模型的输入嵌入权重矩阵进行矩阵乘法,转置模型的输入嵌入权重矩阵
hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
# 将矩阵乘法的结果重新塑形为三维张量,形状为 [-1, seq_length, self.config.vocab_size]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
# 在张量 `hidden_states` 上添加偏置项,偏置项为模型的偏置
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
# 返回处理后的张量 `hidden_states`
return hidden_states
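下面用一个独立的小例子演示上述 call 中"与输入嵌入矩阵共享权重"的投影方式(vocab_size、dim 等数值仅为演示假设):
```
import tensorflow as tf

vocab_size, dim, seq_len = 10, 4, 3
embedding_weight = tf.random.normal((vocab_size, dim))  # 对应 input_embeddings.weight
bias = tf.zeros((vocab_size,))                          # 对应 LM 头部的 bias
hidden_states = tf.random.normal((2, seq_len, dim))

flat = tf.reshape(hidden_states, [-1, dim])
logits = tf.matmul(flat, embedding_weight, transpose_b=True)  # 与嵌入矩阵的转置相乘
logits = tf.reshape(logits, [-1, seq_len, vocab_size])
logits = tf.nn.bias_add(logits, bias)
print(logits.shape)  # (2, 3, 10)
```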
# 添加模型文档字符串,描述该类为带有 `masked language modeling` 头部的 DistilBERT 模型
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top.""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.config = config
# 初始化 DistilBERT 主层
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
# 创建词汇转换层,用于预测词汇的分布
self.vocab_transform = keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform"
)
# 获取激活函数并应用于模型
self.act = get_tf_activation(config.activation)
# 添加词汇层归一化层
self.vocab_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
# 初始化 DistilBERT 语言模型头部
self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
def get_lm_head(self):
# 返回语言模型头部
return self.vocab_projector
def get_prefix_bias_name(self):
# 警告:方法 get_prefix_bias_name 已废弃,请使用 `get_bias` 替代
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
# 返回带有语言模型头部名字的前缀
return self.name + "/" + self.vocab_projector.name
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
# 神经网络模型的前向传播函数,用于执行推断或训练步骤
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
distilbert_output = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 提取DistilBERT模型的输出隐藏状态
hidden_states = distilbert_output[0] # (bs, seq_length, dim)
# 先通过 vocab_transform 对隐藏状态做线性变换(输出仍为 dim 维,尚未映射到词汇表大小)
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
# 应用激活函数到预测的logits
prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim)
# 对预测的logits进行层归一化
prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
# 投影到词汇表维度的空间
prediction_logits = self.vocab_projector(prediction_logits)
# 如果没有提供标签,则损失为None;否则使用预测的logits计算损失
loss = None if labels is None else self.hf_compute_loss(labels, prediction_logits)
# 如果不要求返回字典,则返回一组输出
if not return_dict:
output = (prediction_logits,) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
# 返回带有命名属性的TFMaskedLMOutput对象,包括损失、logits、隐藏状态和注意力权重
return TFMaskedLMOutput(
loss=loss,
logits=prediction_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
# 如果已经构建过模型,则直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果存在DistilBERT模型,则构建它
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
# 如果存在词汇转换层,则构建它
if getattr(self, "vocab_transform", None) is not None:
with tf.name_scope(self.vocab_transform.name):
self.vocab_transform.build([None, None, self.config.dim])
# 如果存在词汇层归一化,则构建它
if getattr(self, "vocab_layer_norm", None) is not None:
with tf.name_scope(self.vocab_layer_norm.name):
self.vocab_layer_norm.build([None, None, self.config.dim])
# 如果存在词汇投影层,则构建它
if getattr(self, "vocab_projector", None) is not None:
with tf.name_scope(self.vocab_projector.name):
self.vocab_projector.build(None)
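下面是 TFDistilBertForMaskedLM 的一个用法示意(检查点与文本均为演示假设,预测结果仅供参考):
```
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

inputs = tokenizer("Paris is the [MASK] of France.", return_tensors="tf")
logits = model(**inputs).logits
mask_index = tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0]
predicted_id = int(tf.argmax(logits[0, mask_index]))
print(tokenizer.decode([predicted_id]))  # 预期输出类似 "capital"
```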
@add_start_docstrings(
"""
DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels # 初始化分类标签数量
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # 初始化 DistilBERT 主层
# 预分类器,用于准备输入特征
self.pre_classifier = keras.layers.Dense(
config.dim,
kernel_initializer=get_initializer(config.initializer_range),
activation="relu",
name="pre_classifier",
)
# 分类器,用于分类任务,输出为 num_labels 个类别
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
# Dropout 层,用于防止过拟合
self.dropout = keras.layers.Dropout(config.seq_classif_dropout)
self.config = config # 保存配置信息
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 调用 DistilBERT 模型处理输入数据,获取模型输出
distilbert_output = self.distilbert(
input_ids=input_ids, # 输入的 token IDs
attention_mask=attention_mask, # 注意力掩码,指示哪些 token 是填充的
head_mask=head_mask, # 头部掩码,用于控制哪些注意力头部是有效的
inputs_embeds=inputs_embeds, # 嵌入的输入张量
output_attentions=output_attentions, # 是否输出注意力权重
output_hidden_states=output_hidden_states, # 是否输出隐藏状态
return_dict=return_dict, # 是否以字典形式返回结果
training=training, # 是否处于训练模式
)
hidden_state = distilbert_output[0] # 获取 DistilBERT 输出的隐藏状态 (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # 获取池化的输出 (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # 对池化输出进行预分类
pooled_output = self.dropout(pooled_output, training=training) # 对预分类输出进行 dropout 处理
logits = self.classifier(pooled_output) # 使用分类器获取 logits (bs, num_labels)
loss = None if labels is None else self.hf_compute_loss(labels, logits) # 计算损失,若无标签则为 None
if not return_dict:
output = (logits,) + distilbert_output[1:] # 如果不返回字典,则输出 logits 和其他 DistilBERT 输出
return ((loss,) + output) if loss is not None else output # 返回损失和输出或者仅输出
# 返回 TFSequenceClassifierOutput 对象,包括损失、logits、隐藏状态和注意力权重
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
if self.built:
return # 如果模型已经构建过,则直接返回
self.built = True # 标记模型已经构建
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None) # 构建 DistilBERT 模型
if getattr(self, "pre_classifier", None) is not None:
with tf.name_scope(self.pre_classifier.name):
self.pre_classifier.build([None, None, self.config.dim]) # 构建预分类器模型
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.dim]) # 构建分类器模型
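下面的用法示意使用了上面模型列表中给出的 SST-2 微调检查点(示例文本为演示假设):
```
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification

ckpt = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = TFDistilBertForSequenceClassification.from_pretrained(ckpt)

inputs = tokenizer("I really enjoyed this film!", return_tensors="tf")
logits = model(**inputs).logits       # 形状为 (batch_size, num_labels)
pred = int(tf.argmax(logits, axis=-1)[0])
print(model.config.id2label[pred])    # 预期输出 "POSITIVE"
```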
@add_start_docstrings(
"""
DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 设置分类任务的标签数量
self.num_labels = config.num_labels
# 初始化 DistilBERT 主层
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
# Dropout 层,用于防止过拟合
self.dropout = keras.layers.Dropout(config.dropout)
# 分类器,输出层,用于将隐藏状态输出映射到标签空间
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# 调用 DistilBERT 主层,获取模型的输出
outputs = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取序列输出
sequence_output = outputs[0]
# 应用 Dropout 层以防止过拟合
sequence_output = self.dropout(sequence_output, training=training)
# 将 Dropout 后的序列输出传入分类器,得到预测的 logits
logits = self.classifier(sequence_output)
# 如果有标签,计算损失值
loss = None if labels is None else self.hf_compute_loss(labels, logits)
# 如果不要求返回字典,则返回元组形式的输出
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# 如果要求返回字典,则返回 TFTokenClassifierOutput 格式的输出
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
# 如果模型已经构建完成,则直接返回,不进行重复构建
if self.built:
return
# 标记模型为已构建状态
self.built = True
# 如果模型包含名为'distilbert'的属性且不为None,则构建distilbert部分
if getattr(self, "distilbert", None) is not None:
# 在TensorFlow中使用名称作用域来管理命名空间,这里创建distilbert的名称作用域
with tf.name_scope(self.distilbert.name):
# 调用distilbert的build方法来构建模型
self.distilbert.build(None)
# 如果模型包含名为'classifier'的属性且不为None,则构建classifier部分
if getattr(self, "classifier", None) is not None:
# 在TensorFlow中使用名称作用域来管理命名空间,这里创建classifier的名称作用域
with tf.name_scope(self.classifier.name):
# 调用classifier的build方法来构建模型,传入输入形状作为参数
self.classifier.build([None, None, self.config.hidden_size])
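下面是 TFDistilBertForTokenClassification 的一个用法示意(num_labels=9 仅为演示假设,该检查点的分类头未经微调,预测结果无实际意义):
```
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# num_labels=9 仅为演示(例如 CoNLL-2003 的标签数),分类头权重为随机初始化
model = TFDistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=9)

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="tf")
logits = model(**inputs).logits        # 形状为 (batch_size, seq_length, num_labels)
pred_ids = tf.argmax(logits, axis=-1)  # 每个 token 的预测标签 id
```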
@add_start_docstrings(
"""
DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 初始化 DistilBERT 主层,作为模型的主体部分
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
# Dropout 层,用于随机断开输入神经元,防止过拟合
self.dropout = keras.layers.Dropout(config.seq_classif_dropout)
# 预分类器层,包含一个 Dense 层用于降维和激活函数为 ReLU
self.pre_classifier = keras.layers.Dense(
config.dim,
kernel_initializer=get_initializer(config.initializer_range),
activation="relu",
name="pre_classifier",
)
# 分类器层,输出为单个值,用于多选题的分类
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
# 存储配置信息
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(
DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
# 如果提供了 input_ids,则获取 num_choices 和 seq_length
if input_ids is not None:
num_choices = shape_list(input_ids)[1] # 获取 input_ids 的第二维大小,即选项数
seq_length = shape_list(input_ids)[2] # 获取 input_ids 的第三维大小,即序列长度
else:
num_choices = shape_list(inputs_embeds)[1] # 获取 inputs_embeds 的第二维大小,即选项数
seq_length = shape_list(inputs_embeds)[2] # 获取 inputs_embeds 的第三维大小,即序列长度
# 将 input_ids 展开成二维张量,如果 input_ids 不为 None
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
# 将 attention_mask 展开成二维张量,如果 attention_mask 不为 None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
# 将 inputs_embeds 展开成三维张量,如果 inputs_embeds 不为 None
flat_inputs_embeds = (
tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
# 调用 DistilBERT 模型进行前向传播,获取输出
distilbert_output = self.distilbert(
flat_input_ids,
flat_attention_mask,
head_mask,
flat_inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_state = distilbert_output[0] # 获取 DistilBERT 输出的隐藏状态 (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # 获取隐藏状态的首个位置,作为池化输出 (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # 经过预分类器处理 (bs, dim)
pooled_output = self.dropout(pooled_output, training=training) # 应用 dropout (bs, dim)
logits = self.classifier(pooled_output) # 经过分类器处理,得到预测 logits
reshaped_logits = tf.reshape(logits, (-1, num_choices)) # 重新调整 logits 的形状为 (batch_size, num_choices)
# 计算损失,如果提供了 labels
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
# 如果不需要返回字典形式的输出,则返回元组形式的输出
if not return_dict:
output = (reshaped_logits,) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
# 如果需要返回字典形式的输出,则返回 TFMultipleChoiceModelOutput 对象
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
# 如果已经构建过模型,则直接返回
if self.built:
return
self.built = True
# 如果模型中包含 DistilBERT 层,则构建 DistilBERT 层
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
# 如果模型中包含预分类器层,则构建预分类器层
if getattr(self, "pre_classifier", None) is not None:
with tf.name_scope(self.pre_classifier.name):
self.pre_classifier.build([None, None, self.config.dim])
# 如果模型中包含分类器层,则构建分类器层
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.dim])
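下面用一个独立的小示例说明多项选择任务中输入展开与 logits 还原的形状变化(数值均为演示假设):
```
import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 7
input_ids = tf.zeros((batch_size, num_choices, seq_length), dtype=tf.int32)

# 对应 call 中的展开:每个选项作为一条独立序列送入 DistilBERT
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))   # (8, 7)
# 分类器对每条序列输出一个分数,再还原成 (batch_size, num_choices)
logits = tf.zeros((batch_size * num_choices, 1))
reshaped_logits = tf.reshape(logits, (-1, num_choices))     # (2, 4)
print(flat_input_ids.shape, reshaped_logits.shape)
```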
@add_start_docstrings(
"""
DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
# 初始化 DistilBERT 主层,使用给定的配置和名称
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
# 初始化输出层,一个全连接层用于预测起始和结束位置的 logits
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
# 断言确保标签数目为2,用于检查是否正确配置了模型
assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2"
# 初始化 dropout 层,用于在训练时进行随机失活
self.dropout = keras.layers.Dropout(config.qa_dropout)
# 保存配置对象到实例中
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# 模型的前向传播方法,接受多个输入参数并返回输出
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# 获取 DistilBERT 的输出,包括隐藏状态和注意力权重等
distilbert_output = self.distilbert(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
# 对隐藏状态应用 dropout,用于防止过拟合
hidden_states = self.dropout(hidden_states, training=training) # (bs, max_query_len, dim)
# 通过线性层计算起始和结束位置的 logits
logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
# 如果给定了起始和结束位置,则计算损失
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
# 如果不要求返回字典,则根据是否存在损失返回相应的输出
if not return_dict:
output = (start_logits, end_logits) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
# 返回 TFQuestionAnsweringModelOutput 类的对象,包含损失、起始和结束 logits、隐藏状态和注意力权重
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=distilbert_output.hidden_states,
attentions=distilbert_output.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果 DistilBERT 存在,则构建其层次结构
if getattr(self, "distilbert", None) is not None:
with tf.name_scope(self.distilbert.name):
self.distilbert.build(None)
# 如果 QA 输出层存在,则构建其层次结构
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.dim])
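最后给出 TFDistilBertForQuestionAnswering 的一个用法示意,使用上面模型列表中给出的 SQuAD 蒸馏检查点(问答文本为演示假设):
```
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForQuestionAnswering

ckpt = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = TFDistilBertForQuestionAnswering.from_pretrained(ckpt)

question = "Who wrote Hamlet?"
context = "Hamlet is a tragedy written by William Shakespeare."
inputs = tokenizer(question, context, return_tensors="tf")
outputs = model(**inputs)
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))  # 预期输出类似 "william shakespeare"
```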