Transformers 源码解析(七十八)
.\models\mobilevit\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_mobilevit": ["MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileViTConfig", "MobileViTOnnxConfig"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_mobilevit"] = ["MobileViTFeatureExtractor"]
_import_structure["image_processing_mobilevit"] = ["MobileViTImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mobilevit"] = [
"MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"MobileViTForImageClassification",
"MobileViTForSemanticSegmentation",
"MobileViTModel",
"MobileViTPreTrainedModel",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_mobilevit"] = [
"TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFMobileViTForImageClassification",
"TFMobileViTForSemanticSegmentation",
"TFMobileViTModel",
"TFMobileViTPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mobilevit import MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileViTConfig, MobileViTOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_mobilevit import MobileViTFeatureExtractor
from .image_processing_mobilevit import MobileViTImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mobilevit import (
MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
MobileViTForImageClassification,
MobileViTForSemanticSegmentation,
MobileViTModel,
MobileViTPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_mobilevit import (
TF_MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFMobileViTForImageClassification,
TFMobileViTForSemanticSegmentation,
TFMobileViTModel,
TFMobileViTPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mobilevitv2\configuration_mobilevitv2.py
""" MobileViTV2 model configuration"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"apple/mobilevitv2-1.0": "https://huggingface.co/apple/mobilevitv2-1.0/resolve/main/config.json",
}
class MobileViTV2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MobileViTV2Model`]. It is used to instantiate a
MobileViTV2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the MobileViTV2
[apple/mobilevitv2-1.0](https://huggingface.co/apple/mobilevitv2-1.0) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
def __init__(self, **kwargs):
super().__init__(**kwargs)
Args:
num_channels (`int`, *optional*, defaults to 3):
输入通道数,默认为3。
image_size (`int`, *optional*, defaults to 256):
每张图片的分辨率大小,默认为256像素。
patch_size (`int`, *optional*, defaults to 2):
每个图块的分辨率大小,默认为2像素。
expand_ratio (`float`, *optional*, defaults to 2.0):
MobileNetv2层的扩展因子,默认为2.0。
hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
Transformer编码器和卷积层中的非线性激活函数(函数或字符串),默认为"swish"。
conv_kernel_size (`int`, *optional*, defaults to 3):
MobileViTV2层中卷积核的大小,默认为3。
output_stride (`int`, *optional*, defaults to 32):
输出空间分辨率与输入图像分辨率之比,默认为32。
classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
附加分类器的dropout比率,默认为0.1。
initializer_range (`float`, *optional*, defaults to 0.02):
用于初始化所有权重矩阵的截断正态初始化器的标准差,默认为0.02。
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
层归一化层使用的epsilon值,默认为1e-05。
aspp_out_channels (`int`, *optional*, defaults to 512):
语义分割中ASPP层使用的输出通道数,默认为512。
atrous_rates (`List[int]`, *optional*, defaults to `[6, 12, 18]`):
语义分割中ASPP层使用的扩张(空洞)率列表,默认为`[6, 12, 18]`。
aspp_dropout_prob (`float`, *optional*, defaults to 0.1):
语义分割中ASPP层的dropout比率,默认为0.1。
semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
语义分割模型损失函数中被忽略的索引,默认为255。
n_attn_blocks (`List[int]`, *optional*, defaults to `[2, 4, 3]`):
每个MobileViTV2Layer中注意力块的数量列表,默认为`[2, 4, 3]`。
base_attn_unit_dims (`List[int]`, *optional*, defaults to `[128, 192, 256]`):
每个MobileViTV2Layer中注意力块维度的基础乘数列表,默认为`[128, 192, 256]`。
width_multiplier (`float`, *optional*, defaults to 1.0):
MobileViTV2的宽度乘数,默认为1.0。
ffn_multiplier (`int`, *optional*, defaults to 2):
MobileViTV2的FFN乘数,默认为2。
attn_dropout (`float`, *optional*, defaults to 0.0):
注意力层中的dropout比率,默认为0.0。
ffn_dropout (`float`, *optional*, defaults to 0.0):
FFN层之间的dropout比率,默认为0.0。
Example:
```
>>> from transformers import MobileViTV2Config, MobileViTV2Model
>>>
>>> configuration = MobileViTV2Config()
model = MobileViTV2Model(configuration)
configuration = model.config
class MobileViTV2OnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict([("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"})])
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "image-classification":
return OrderedDict([("logits", {0: "batch"})])
else:
return OrderedDict([("last_hidden_state", {0: "batch"}), ("pooler_output", {0: "batch"})])
@property
def atol_for_validation(self) -> float:
return 1e-4
.\models\mobilevitv2\convert_mlcvnets_to_pytorch.py
"""Convert MobileViTV2 checkpoints from the ml-cvnets library."""
import argparse
import collections
import json
from pathlib import Path
import requests
import torch
import yaml
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
MobileViTImageProcessor,
MobileViTV2Config,
MobileViTV2ForImageClassification,
MobileViTV2ForSemanticSegmentation,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def load_orig_config_file(orig_cfg_file):
print("Loading config file...")
def flatten_yaml_as_dict(d, parent_key="", sep="."):
items = []
for k, v in d.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, collections.abc.MutableMapping):
items.extend(flatten_yaml_as_dict(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
return dict(items)
config = argparse.Namespace()
with open(orig_cfg_file, "r") as yaml_file:
try:
cfg = yaml.load(yaml_file, Loader=yaml.FullLoader)
flat_cfg = flatten_yaml_as_dict(cfg)
for k, v in flat_cfg.items():
setattr(config, k, v)
except yaml.YAMLError as exc:
logger.error("Error while loading config file: {}. Error message: {}".format(orig_cfg_file, str(exc)))
return config
def get_mobilevitv2_config(task_name, orig_cfg_file):
config = MobileViTV2Config()
is_segmentation_model = False
if task_name.startswith("imagenet1k_"):
config.num_labels = 1000
if int(task_name.strip().split("_")[-1]) == 384:
config.image_size = 384
else:
config.image_size = 256
filename = "imagenet-1k-id2label.json"
elif task_name.startswith("imagenet21k_to_1k_"):
config.num_labels = 21000
if int(task_name.strip().split("_")[-1]) == 384:
config.image_size = 384
else:
config.image_size = 256
filename = "imagenet-22k-id2label.json"
elif task_name.startswith("ade20k_"):
config.num_labels = 151
config.image_size = 512
filename = "ade20k-id2label.json"
is_segmentation_model = True
elif task_name.startswith("voc_"):
config.num_labels = 21
config.image_size = 512
filename = "pascal-voc-id2label.json"
is_segmentation_model = True
orig_config = load_orig_config_file(orig_cfg_file)
assert getattr(orig_config, "model.classification.name", -1) == "mobilevit_v2", "Invalid model"
config.width_multiplier = getattr(orig_config, "model.classification.mitv2.width_multiplier", 1.0)
assert (
getattr(orig_config, "model.classification.mitv2.attn_norm_layer", -1) == "layer_norm_2d"
), "Norm layers other than layer_norm_2d is not supported"
config.hidden_act = getattr(orig_config, "model.classification.activation.name", "swish")
if is_segmentation_model:
config.output_stride = getattr(orig_config, "model.segmentation.output_stride", 16)
if "_deeplabv3" in task_name:
config.atrous_rates = getattr(orig_config, "model.segmentation.deeplabv3.aspp_rates", [12, 24, 36])
config.aspp_out_channels = getattr(orig_config, "model.segmentation.deeplabv3.aspp_out_channels", 512)
config.aspp_dropout_prob = getattr(orig_config, "model.segmentation.deeplabv3.aspp_dropout", 0.1)
repo_id = "huggingface/label-files"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def create_rename_keys(state_dict, base_model=False):
if base_model:
model_prefix = ""
else:
model_prefix = "mobilevitv2."
rename_keys = []
return rename_keys
def remove_unused_keys(state_dict):
"""remove unused keys (e.g.: seg_head.aux_head)"""
keys_to_ignore = []
for k in state_dict.keys():
if k.startswith("seg_head.aux_head."):
keys_to_ignore.append(k)
for k in keys_to_ignore:
state_dict.pop(k, None)
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_mobilevitv2_checkpoint(task_name, checkpoint_path, orig_config_path, pytorch_dump_folder_path):
"""
Copy/paste/tweak model's weights to our MobileViTV2 structure.
"""
config = get_mobilevitv2_config(task_name, orig_config_path)
checkpoint = torch.load(checkpoint_path, map_location="cpu")
if task_name.startswith("ade20k_") or task_name.startswith("voc_"):
model = MobileViTV2ForSemanticSegmentation(config).eval()
base_model = False
else:
model = MobileViTV2ForImageClassification(config).eval()
base_model = False
state_dict = checkpoint
remove_unused_keys(state_dict)
rename_keys = create_rename_keys(state_dict, base_model=base_model)
for rename_key_src, rename_key_dest in rename_keys:
rename_key(state_dict, rename_key_src, rename_key_dest)
model.load_state_dict(state_dict)
image_processor = MobileViTImageProcessor(crop_size=config.image_size, size=config.image_size + 32)
encoding = image_processor(images=prepare_img(), return_tensors="pt")
outputs = model(**encoding)
if task_name.startswith("imagenet"):
logits = outputs.logits
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])
if task_name.startswith("imagenet1k_256") and config.width_multiplier == 1.0:
expected_logits = torch.tensor([-1.6336e00, -7.3204e-02, -5.1883e-01])
assert torch.allclose(logits[0, :3], expected_logits, atol=1e-4)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {task_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--task",
default="imagenet1k_256",
type=str,
help=(
"Name of the task for which the MobileViTV2 model you'd like to convert is trained on . "
"""
Classification (ImageNet-1k)
- MobileViTV2 (256x256) : imagenet1k_256
- MobileViTV2 (Trained on 256x256 and Finetuned on 384x384) : imagenet1k_384
- MobileViTV2 (Trained on ImageNet-21k and Finetuned on ImageNet-1k 256x256) :
imagenet21k_to_1k_256
- MobileViTV2 (Trained on ImageNet-21k, Finetuned on ImageNet-1k 256x256, and Finetuned on
ImageNet-1k 384x384) : imagenet21k_to_1k_384
Segmentation
- ADE20K Dataset : ade20k_deeplabv3
- Pascal VOC 2012 Dataset: voc_deeplabv3
"""
),
choices=[
"imagenet1k_256",
"imagenet1k_384",
"imagenet21k_to_1k_256",
"imagenet21k_to_1k_384",
"ade20k_deeplabv3",
"voc_deeplabv3",
],
)
parser.add_argument(
"--orig_checkpoint_path", required=True, type=str, help="Path to the original state dict (.pt file)."
)
parser.add_argument("--orig_config_path", required=True, type=str, help="Path to the original config file.")
parser.add_argument(
"--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory."
)
args = parser.parse_args()
convert_mobilevitv2_checkpoint(
args.task, args.orig_checkpoint_path, args.orig_config_path, args.pytorch_dump_folder_path
)
.\models\mobilevitv2\modeling_mobilevitv2.py
""" PyTorch MobileViTV2 model."""
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithNoAttention,
BaseModelOutputWithPoolingAndNoAttention,
ImageClassifierOutputWithNoAttention,
SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_mobilevitv2 import MobileViTV2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "MobileViTV2Config"
_CHECKPOINT_FOR_DOC = "apple/mobilevitv2-1.0-imagenet1k-256"
_EXPECTED_OUTPUT_SHAPE = [1, 512, 8, 8]
_IMAGE_CLASS_CHECKPOINT = "apple/mobilevitv2-1.0-imagenet1k-256"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"apple/mobilevitv2-1.0-imagenet1k-256"
]
def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int:
"""
Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
original TensorFlow repo. It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
"""
if min_value is None:
min_value = divisor
new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
if new_value < 0.9 * value:
new_value += divisor
return int(new_value)
def clip(value: float, min_val: float = float("-inf"), max_val: float = float("inf")) -> float:
"""
Clip the input `value` to ensure it falls within the specified range [`min_val`, `max_val`].
"""
return max(min_val, min(max_val, value))
class MobileViTV2ConvLayer(nn.Module):
"""
MobileViTV2 convolutional layer module that extends nn.Module.
"""
def __init__(
self,
config: MobileViTV2Config,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int = 1,
groups: int = 1,
bias: bool = False,
dilation: int = 1,
use_normalization: bool = True,
use_activation: Union[bool, str] = True,
) -> None:
super().__init__()
padding = int((kernel_size - 1) / 2) * dilation
if in_channels % groups != 0:
raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.")
if out_channels % groups != 0:
raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")
self.convolution = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
padding_mode="zeros",
)
if use_normalization:
self.normalization = nn.BatchNorm2d(
num_features=out_channels,
eps=1e-5,
momentum=0.1,
affine=True,
track_running_stats=True,
)
else:
self.normalization = None
if use_activation:
if isinstance(use_activation, str):
self.activation = ACT2FN[use_activation]
elif isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
else:
self.activation = None
def forward(self, features: torch.Tensor) -> torch.Tensor:
features = self.convolution(features)
if self.normalization is not None:
features = self.normalization(features)
if self.activation is not None:
features = self.activation(features)
return features
class MobileViTV2InvertedResidual(nn.Module):
"""
反向残差块(MobileNetv2):https://arxiv.org/abs/1801.04381
"""
def __init__(
self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int, dilation: int = 1
) -> None:
super().__init__()
expanded_channels = make_divisible(int(round(in_channels * config.expand_ratio)), 8)
if stride not in [1, 2]:
raise ValueError(f"Invalid stride {stride}.")
self.use_residual = (stride == 1) and (in_channels == out_channels)
self.expand_1x1 = MobileViTV2ConvLayer(
config, in_channels=in_channels, out_channels=expanded_channels, kernel_size=1
)
self.conv_3x3 = MobileViTV2ConvLayer(
config,
in_channels=expanded_channels,
out_channels=expanded_channels,
kernel_size=3,
stride=stride,
groups=expanded_channels,
dilation=dilation,
)
self.reduce_1x1 = MobileViTV2ConvLayer(
config,
in_channels=expanded_channels,
out_channels=out_channels,
kernel_size=1,
use_activation=False,
)
def forward(self, features: torch.Tensor) -> torch.Tensor:
residual = features
features = self.expand_1x1(features)
features = self.conv_3x3(features)
features = self.reduce_1x1(features)
return residual + features if self.use_residual else features
class MobileViTV2MobileNetLayer(nn.Module):
def __init__(
self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int = 1, num_stages: int = 1
) -> None:
super().__init__()
self.layer = nn.ModuleList()
for i in range(num_stages):
layer = MobileViTV2InvertedResidual(
config,
in_channels=in_channels,
out_channels=out_channels,
stride=stride if i == 0 else 1,
)
self.layer.append(layer)
in_channels = out_channels
def forward(self, features: torch.Tensor) -> torch.Tensor:
for layer_module in self.layer:
features = layer_module(features)
return features
class MobileViTV2LinearSelfAttention(nn.Module):
"""
这一层应用了MobileViTV2论文中描述的线性复杂度的自注意力机制:
https://arxiv.org/abs/2206.02680
Args:
config (`MobileVitv2Config`):
模型配置对象
embed_dim (`int`):
预期输入的通道数,尺寸为(batch_size, input_channels, height, width)
"""
def __init__(self, config: MobileViTV2Config, embed_dim: int) -> None:
super().__init__()
self.qkv_proj = MobileViTV2ConvLayer(
config=config,
in_channels=embed_dim,
out_channels=1 + (2 * embed_dim),
bias=True,
kernel_size=1,
use_normalization=False,
use_activation=False,
)
self.attn_dropout = nn.Dropout(p=config.attn_dropout)
self.out_proj = MobileViTV2ConvLayer(
config=config,
in_channels=embed_dim,
out_channels=embed_dim,
bias=True,
kernel_size=1,
use_normalization=False,
use_activation=False,
)
self.embed_dim = embed_dim
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
qkv = self.qkv_proj(hidden_states)
query, key, value = torch.split(qkv, split_size_or_sections=[1, self.embed_dim, self.embed_dim], dim=1)
context_scores = torch.nn.functional.softmax(query, dim=-1)
context_scores = self.attn_dropout(context_scores)
context_vector = key * context_scores
context_vector = torch.sum(context_vector, dim=-1, keepdim=True)
out = torch.nn.functional.relu(value) * context_vector.expand_as(value)
out = self.out_proj(out)
return out
class MobileViTV2FFN(nn.Module):
def __init__(
self,
config: MobileViTV2Config,
embed_dim: int,
ffn_latent_dim: int,
ffn_dropout: float = 0.0,
) -> None:
super().__init__()
self.conv1 = MobileViTV2ConvLayer(
config=config,
in_channels=embed_dim,
out_channels=ffn_latent_dim,
kernel_size=1,
stride=1,
bias=True,
use_normalization=False,
use_activation=True,
)
self.dropout1 = nn.Dropout(ffn_dropout)
self.conv2 = MobileViTV2ConvLayer(
config=config,
in_channels=ffn_latent_dim,
out_channels=embed_dim,
kernel_size=1,
stride=1,
bias=True,
use_normalization=False,
use_activation=False,
)
self.dropout2 = nn.Dropout(ffn_dropout)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.conv1(hidden_states)
hidden_states = self.dropout1(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = self.dropout2(hidden_states)
return hidden_states
class MobileViTV2TransformerLayer(nn.Module):
def __init__(
self,
config: MobileViTV2Config,
embed_dim: int,
ffn_latent_dim: int,
dropout: float = 0.0,
) -> None:
super().__init__()
self.layernorm_before = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=config.layer_norm_eps)
self.attention = MobileViTV2LinearSelfAttention(config, embed_dim)
self.dropout1 = nn.Dropout(p=dropout)
self.layernorm_after = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=config.layer_norm_eps)
self.ffn = MobileViTV2FFN(config, embed_dim, ffn_latent_dim, config.ffn_dropout)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
layernorm_1_out = self.layernorm_before(hidden_states)
attention_output = self.attention(layernorm_1_out)
hidden_states = attention_output + hidden_states
layer_output = self.layernorm_after(hidden_states)
layer_output = self.ffn(layer_output)
layer_output = layer_output + hidden_states
return layer_output
class MobileViTV2Transformer(nn.Module):
def __init__(self, config: MobileViTV2Config, n_layers: int, d_model: int) -> None:
super().__init__()
ffn_multiplier = config.ffn_multiplier
ffn_dims = [ffn_multiplier * d_model] * n_layers
ffn_dims = [int((d // 16) * 16) for d in ffn_dims]
self.layer = nn.ModuleList()
for block_idx in range(n_layers):
transformer_layer = MobileViTV2TransformerLayer(
config, embed_dim=d_model, ffn_latent_dim=ffn_dims[block_idx]
)
self.layer.append(transformer_layer)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
for layer_module in self.layer:
hidden_states = layer_module(hidden_states)
return hidden_states
class MobileViTV2Layer(nn.Module):
"""
MobileViTV2 layer: https://arxiv.org/abs/2206.02680
"""
def __init__(
self,
config: MobileViTV2Config,
in_channels: int,
out_channels: int,
attn_unit_dim: int,
n_attn_blocks: int = 2,
dilation: int = 1,
stride: int = 2,
) -> None:
super().__init__()
self.patch_width = config.patch_size
self.patch_height = config.patch_size
cnn_out_dim = attn_unit_dim
if stride == 2:
self.downsampling_layer = MobileViTV2InvertedResidual(
config,
in_channels=in_channels,
out_channels=out_channels,
stride=stride if dilation == 1 else 1,
dilation=dilation // 2 if dilation > 1 else 1,
)
in_channels = out_channels
else:
self.downsampling_layer = None
self.conv_kxk = MobileViTV2ConvLayer(
config,
in_channels=in_channels,
out_channels=in_channels,
kernel_size=config.conv_kernel_size,
groups=in_channels,
)
self.conv_1x1 = MobileViTV2ConvLayer(
config,
in_channels=in_channels,
out_channels=cnn_out_dim,
kernel_size=1,
use_normalization=False,
use_activation=False,
)
self.transformer = MobileViTV2Transformer(config, d_model=attn_unit_dim, n_layers=n_attn_blocks)
self.layernorm = nn.GroupNorm(num_groups=1, num_channels=attn_unit_dim, eps=config.layer_norm_eps)
self.conv_projection = MobileViTV2ConvLayer(
config,
in_channels=cnn_out_dim,
out_channels=in_channels,
kernel_size=1,
use_normalization=True,
use_activation=False,
)
def unfolding(self, feature_map: torch.Tensor) -> Tuple[torch.Tensor, Tuple[int, int]]:
batch_size, in_channels, img_height, img_width = feature_map.shape
patches = nn.functional.unfold(
feature_map,
kernel_size=(self.patch_height, self.patch_width),
stride=(self.patch_height, self.patch_width),
)
patches = patches.reshape(batch_size, in_channels, self.patch_height * self.patch_width, -1)
return patches, (img_height, img_width)
def folding(self, patches: torch.Tensor, output_size: Tuple[int, int]) -> torch.Tensor:
batch_size, in_dim, patch_size, n_patches = patches.shape
patches = patches.reshape(batch_size, in_dim * patch_size, n_patches)
feature_map = nn.functional.fold(
patches,
output_size=output_size,
kernel_size=(self.patch_height, self.patch_width),
stride=(self.patch_height, self.patch_width),
)
return feature_map
def forward(self, features: torch.Tensor) -> torch.Tensor:
if self.downsampling_layer:
features = self.downsampling_layer(features)
features = self.conv_kxk(features)
features = self.conv_1x1(features)
patches, output_size = self.unfolding(features)
patches = self.transformer(patches)
patches = self.layernorm(patches)
features = self.folding(patches, output_size)
features = self.conv_projection(features)
return features
class MobileViTV2Encoder(nn.Module):
def __init__(self, config: MobileViTV2Config) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList()
self.gradient_checkpointing = False
dilate_layer_4 = dilate_layer_5 = False
if config.output_stride == 8:
dilate_layer_4 = True
dilate_layer_5 = True
elif config.output_stride == 16:
dilate_layer_5 = True
dilation = 1
layer_0_dim = make_divisible(
clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16
)
layer_1_dim = make_divisible(64 * config.width_multiplier, divisor=16)
layer_2_dim = make_divisible(128 * config.width_multiplier, divisor=8)
layer_3_dim = make_divisible(256 * config.width_multiplier, divisor=8)
layer_4_dim = make_divisible(384 * config.width_multiplier, divisor=8)
layer_5_dim = make_divisible(512 * config.width_multiplier, divisor=8)
layer_1 = MobileViTV2MobileNetLayer(
config,
in_channels=layer_0_dim,
out_channels=layer_1_dim,
stride=1,
num_stages=1,
)
self.layer.append(layer_1)
layer_2 = MobileViTV2MobileNetLayer(
config,
in_channels=layer_1_dim,
out_channels=layer_2_dim,
stride=2,
num_stages=2,
)
self.layer.append(layer_2)
layer_3 = MobileViTV2Layer(
config,
in_channels=layer_2_dim,
out_channels=layer_3_dim,
attn_unit_dim=make_divisible(config.base_attn_unit_dims[0] * config.width_multiplier, divisor=8),
n_attn_blocks=config.n_attn_blocks[0],
)
self.layer.append(layer_3)
if dilate_layer_4:
dilation *= 2
layer_4 = MobileViTV2Layer(
config,
in_channels=layer_3_dim,
out_channels=layer_4_dim,
attn_unit_dim=make_divisible(config.base_attn_unit_dims[1] * config.width_multiplier, divisor=8),
n_attn_blocks=config.n_attn_blocks[1],
dilation=dilation,
)
self.layer.append(layer_4)
if dilate_layer_5:
dilation *= 2
layer_5 = MobileViTV2Layer(
config,
in_channels=layer_4_dim,
out_channels=layer_5_dim,
attn_unit_dim=make_divisible(config.base_attn_unit_dims[2] * config.width_multiplier, divisor=8),
n_attn_blocks=config.n_attn_blocks[2],
dilation=dilation,
)
self.layer.append(layer_5)
def forward(
self,
hidden_states: torch.Tensor,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutputWithNoAttention]:
all_hidden_states = () if output_hidden_states else None
for i, layer_module in enumerate(self.layer):
if self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
)
else:
hidden_states = layer_module(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
class MobileViTV2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = MobileViTV2Config
base_model_prefix = "mobilevitv2"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
MOBILEVITV2_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`MobileViTV2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MOBILEVITV2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`MobileViTImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare MobileViTV2 model outputting raw hidden-states without any specific head on top.",
MOBILEVITV2_START_DOCSTRING,
)
class MobileViTV2Model(MobileViTV2PreTrainedModel):
pass
def __init__(self, config: MobileViTV2Config, expand_output: bool = True):
super().__init__(config)
self.config = config
self.expand_output = expand_output
layer_0_dim = make_divisible(
clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16
)
self.conv_stem = MobileViTV2ConvLayer(
config,
in_channels=config.num_channels,
out_channels=layer_0_dim,
kernel_size=3,
stride=2,
use_normalization=True,
use_activation=True,
)
self.encoder = MobileViTV2Encoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
"""
for layer_index, heads in heads_to_prune.items():
mobilevitv2_layer = self.encoder.layer[layer_index]
if isinstance(mobilevitv2_layer, MobileViTV2Layer):
for transformer_layer in mobilevitv2_layer.transformer.layer:
transformer_layer.attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Performs forward pass of the MobileViTV2 model.
pixel_values: Optional[torch.Tensor], input pixel values of shape (batch_size, num_channels, height, width)
output_hidden_states: Optional[bool], whether to output hidden states
return_dict: Optional[bool], whether to return a dictionary as output
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.conv_stem(pixel_values)
encoder_outputs = self.encoder(
embedding_output,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if self.expand_output:
last_hidden_state = encoder_outputs[0]
pooled_output = torch.mean(last_hidden_state, dim=[-2, -1], keepdim=False)
else:
last_hidden_state = encoder_outputs[0]
pooled_output = None
if not return_dict:
output = (last_hidden_state, pooled_output) if pooled_output is not None else (last_hidden_state,)
return output + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
@add_start_docstrings(
"""
MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
""",
MOBILEVITV2_START_DOCSTRING,
)
class MobileViTV2ForImageClassification(MobileViTV2PreTrainedModel):
def __init__(self, config: MobileViTV2Config) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.mobilevitv2 = MobileViTV2Model(config)
out_channels = make_divisible(512 * config.width_multiplier, divisor=8)
self.classifier = (
nn.Linear(in_features=out_channels, out_features=config.num_labels)
if config.num_labels > 0
else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilevitv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
)
class MobileViTV2ASPPPooling(nn.Module):
def __init__(self, config: MobileViTV2Config, in_channels: int, out_channels: int) -> None:
super().__init__()
self.global_pool = nn.AdaptiveAvgPool2d(output_size=1)
self.conv_1x1 = MobileViTV2ConvLayer(
config,
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
use_normalization=True,
use_activation="relu",
)
def forward(self, features: torch.Tensor) -> torch.Tensor:
spatial_size = features.shape[-2:]
features = self.global_pool(features)
features = self.conv_1x1(features)
features = nn.functional.interpolate(features, size=spatial_size, mode="bilinear", align_corners=False)
return features
class MobileViTV2ASPP(nn.Module):
"""
ASPP 模块,由 DeepLab 论文定义:https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
"""
def __init__(self, config: MobileViTV2Config) -> None:
super().__init__()
encoder_out_channels = make_divisible(512 * config.width_multiplier, divisor=8)
in_channels = encoder_out_channels
out_channels = config.aspp_out_channels
if len(config.atrous_rates) != 3:
raise ValueError("Expected 3 values for atrous_rates")
self.convs = nn.ModuleList()
in_projection = MobileViTV2ConvLayer(
config,
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
use_activation="relu",
)
self.convs.append(in_projection)
self.convs.extend(
[
MobileViTV2ConvLayer(
config,
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
dilation=rate,
use_activation="relu",
)
for rate in config.atrous_rates
]
)
pool_layer = MobileViTV2ASPPPooling(config, in_channels, out_channels)
self.convs.append(pool_layer)
self.project = MobileViTV2ConvLayer(
config, in_channels=5 * out_channels, out_channels=out_channels, kernel_size=1, use_activation="relu"
)
self.dropout = nn.Dropout(p=config.aspp_dropout_prob)
def forward(self, features: torch.Tensor) -> torch.Tensor:
pyramid = []
for conv in self.convs:
pyramid.append(conv(features))
pyramid = torch.cat(pyramid, dim=1)
pooled_features = self.project(pyramid)
pooled_features = self.dropout(pooled_features)
return pooled_features
class MobileViTV2DeepLabV3(nn.Module):
"""
DeepLabv3 架构:https://arxiv.org/abs/1706.05587
"""
def __init__(self, config: MobileViTV2Config) -> None:
super().__init__()
self.aspp = MobileViTV2ASPP(config)
self.dropout = nn.Dropout2d(config.classifier_dropout_prob)
self.classifier = MobileViTV2ConvLayer(
config,
in_channels=config.aspp_out_channels,
out_channels=config.num_labels,
kernel_size=1,
use_normalization=False,
use_activation=False,
bias=True,
)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
features = self.aspp(hidden_states[-1])
features = self.dropout(features)
features = self.classifier(features)
return features
@add_start_docstrings(
"""
MobileViTV2 模型,顶部带有语义分割头,例如用于 Pascal VOC 数据集。
""",
MOBILEVITV2_START_DOCSTRING,
)
class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel):
def __init__(self, config: MobileViTV2Config) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.mobilevitv2 = MobileViTV2Model(config, expand_output=False)
self.segmentation_head = MobileViTV2DeepLabV3(config)
self.post_init()
@add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[SemanticSegmenterOutput, Tuple[torch.Tensor, ...]]:
return self.mobilevitv2(
pixel_values=pixel_values,
labels=labels,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
**kwargs,
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilevitv2(
pixel_values,
output_hidden_states=True,
return_dict=return_dict,
)
encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
logits = self.segmentation_head(encoder_hidden_states)
loss = None
if labels is not None:
if self.config.num_labels == 1:
raise ValueError("The number of labels should be greater than one")
else:
upsampled_logits = nn.functional.interpolate(
logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
)
loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
loss = loss_fct(upsampled_logits, labels)
if not return_dict:
if output_hidden_states:
output = (logits,) + outputs[1:]
else:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SemanticSegmenterOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=None,
)
.\models\mobilevitv2\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_mobilevitv2": [
"MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
"MobileViTV2Config",
"MobileViTV2OnnxConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mobilevitv2"] = [
"MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST",
"MobileViTV2ForImageClassification",
"MobileViTV2ForSemanticSegmentation",
"MobileViTV2Model",
"MobileViTV2PreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mobilevitv2 import (
MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
MobileViTV2Config,
MobileViTV2OnnxConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mobilevitv2 import (
MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST,
MobileViTV2ForImageClassification,
MobileViTV2ForSemanticSegmentation,
MobileViTV2Model,
MobileViTV2PreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\mpnet\configuration_mpnet.py
""" MPNet model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/mpnet-base": "https://huggingface.co/microsoft/mpnet-base/resolve/main/config.json",
}
class MPNetConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MPNetModel`] or a [`TFMPNetModel`]. It is used to
instantiate a MPNet model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the MPNet
[microsoft/mpnet-base](https://huggingface.co/microsoft/mpnet-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
def __init__(self, **kwargs):
super().__init__(**kwargs)
Args:
vocab_size (`int`, *optional*, defaults to 30527):
MPNet模型的词汇表大小,决定了在调用`MPNetModel`或`TFMPNetModel`时,`inputs_ids`可以表示的不同token数量。
hidden_size (`int`, *optional*, defaults to 768):
编码器层和池化层的维度。
num_hidden_layers (`int`, *optional*, defaults to 12):
Transformer编码器中的隐藏层数量。
num_attention_heads (`int`, *optional*, defaults to 12):
Transformer编码器中每个注意力层的注意力头数量。
intermediate_size (`int`, *optional*, defaults to 3072):
Transformer编码器中"中间"(通常称为前馈)层的维度。
hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
编码器和池化器中的非线性激活函数(函数或字符串)。支持的字符串有:"gelu"、"relu"、"silu"和"gelu_new"。
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
嵌入层、编码器和池化器中所有全连接层的dropout概率。
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
注意力概率的dropout比例。
max_position_embeddings (`int`, *optional*, defaults to 512):
模型可能使用的最大序列长度。通常设置为一个较大的值(例如512、1024或2048),以防万一。
initializer_range (`float`, *optional*, defaults to 0.02):
初始化所有权重矩阵的截断正态初始化器的标准差。
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
层归一化层使用的epsilon值。
relative_attention_num_buckets (`int`, *optional*, defaults to 32):
每个注意力层使用的桶的数量。
Examples:
>>> from transformers import MPNetModel, MPNetConfig
>>> configuration = MPNetConfig()
>>> model = MPNetModel(configuration)
>>> configuration = model.config
def __init__(
self,
vocab_size=30527,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
initializer_range=0.02,
layer_norm_eps=1e-12,
relative_attention_num_buckets=32,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.relative_attention_num_buckets = relative_attention_num_buckets
.\models\mpnet\modeling_mpnet.py
"""PyTorch MPNet 模型。"""
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_mpnet import MPNetConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "microsoft/mpnet-base"
_CONFIG_FOR_DOC = "MPNetConfig"
MPNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/mpnet-base",
]
class MPNetPreTrainedModel(PreTrainedModel):
config_class = MPNetConfig
pretrained_model_archive_map = MPNET_PRETRAINED_MODEL_ARCHIVE_LIST
base_model_prefix = "mpnet"
def _init_weights(self, module):
"""初始化模型参数"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class MPNetEmbeddings(nn.Module):
def __init__(self, config):
super().__init__()
self.padding_idx = 1
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, **kwargs):
if position_ids is None:
if input_ids is not None:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
embeddings = inputs_embeds + position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
class MPNetSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.q = nn.Linear(config.hidden_size, self.all_head_size)
self.k = nn.Linear(config.hidden_size, self.all_head_size)
self.v = nn.Linear(config.hidden_size, self.all_head_size)
self.o = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
position_bias=None,
output_attentions=False,
**kwargs,
):
q = self.q(hidden_states)
k = self.k(hidden_states)
v = self.v(hidden_states)
q = self.transpose_for_scores(q)
k = self.transpose_for_scores(k)
v = self.transpose_for_scores(v)
attention_scores = torch.matmul(q, k.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if position_bias is not None:
attention_scores += position_bias
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
c = torch.matmul(attention_probs, v)
c = c.permute(0, 2, 1, 3).contiguous()
new_c_shape = c.size()[:-2] + (self.all_head_size,)
c = c.view(*new_c_shape)
o = self.o(c)
outputs = (o, attention_probs) if output_attentions else (o,)
return outputs
class MPNetAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.attn = MPNetSelfAttention(config)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attn.num_attention_heads, self.attn.attention_head_size, self.pruned_heads
)
self.attn.q = prune_linear_layer(self.attn.q, index)
self.attn.k = prune_linear_layer(self.attn.k, index)
self.attn.v = prune_linear_layer(self.attn.v, index)
self.attn.o = prune_linear_layer(self.attn.o, index, dim=1)
self.attn.num_attention_heads = self.attn.num_attention_heads - len(heads)
self.attn.all_head_size = self.attn.attention_head_size * self.attn.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
position_bias=None,
output_attentions=False,
**kwargs,
):
self_outputs = self.attn(
hidden_states,
attention_mask,
head_mask,
position_bias,
output_attentions=output_attentions,
)
attention_output = self.LayerNorm(self.dropout(self_outputs[0]) + hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class MPNetIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class MPNetOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class MPNetLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.attention = MPNetAttention(config)
self.intermediate = MPNetIntermediate(config)
self.output = MPNetOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
position_bias=None,
output_attentions=False,
**kwargs,
):
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
position_bias=position_bias,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
outputs = (layer_output,) + outputs
return outputs
class MPNetEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.n_heads = config.num_attention_heads
self.layer = nn.ModuleList([MPNetLayer(config) for _ in range(config.num_hidden_layers)])
self.relative_attention_bias = nn.Embedding(config.relative_attention_num_buckets, self.n_heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = False,
**kwargs,
):
position_bias = self.compute_position_bias(hidden_states)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
hidden_states,
attention_mask,
head_mask[i],
position_bias,
output_attentions=output_attentions,
**kwargs,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
else:
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_attentions,
)
def compute_position_bias(self, x, position_ids=None, num_buckets=32):
bsz, qlen, klen = x.size(0), x.size(1), x.size(1)
if position_ids is not None:
context_position = position_ids[:, :, None]
memory_position = position_ids[:, None, :]
else:
context_position = torch.arange(qlen, dtype=torch.long)[:, None]
memory_position = torch.arange(klen, dtype=torch.long)[None, :]
relative_position = memory_position - context_position
rp_bucket = self.relative_position_bucket(relative_position, num_buckets=num_buckets)
rp_bucket = rp_bucket.to(x.device)
values = self.relative_attention_bias(rp_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0)
values = values.expand((bsz, -1, qlen, klen)).contiguous()
return values
@staticmethod
def relative_position_bucket(relative_position, num_buckets=32, max_distance=128):
ret = 0
n = -relative_position
num_buckets //= 2
ret += (n < 0).to(torch.long) * num_buckets
n = torch.abs(n)
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
"""
Args:
hidden_states (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Tensor containing the hidden states of the input sequences. Typically comes from the output of the
last layer of the model.
Returns:
:obj:`torch.Tensor`: The pooled output tensor of shape :obj:`(batch_size, hidden_size)`.
This tensor represents the "pooled" output, i.e., the output corresponding to the first token of each
input sequence in the batch.
"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
@add_start_docstrings(
"The bare MPNet Model transformer outputting raw hidden-states without any specific head on top.",
MPNET_START_DOCSTRING,
)
class MPNetModel(MPNetPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = MPNetEmbeddings(config)
self.encoder = MPNetEncoder(config)
self.pooler = MPNetPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class MPNetForMaskedLM(MPNetPreTrainedModel):
_tied_weights_keys = ["lm_head.decoder"]
def __init__(self, config):
super().__init__(config)
self.mpnet = MPNetModel(config, add_pooling_layer=False)
self.lm_head = MPNetLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mpnet(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
"""MPNet Head for masked and permuted language modeling."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
x = self.dense(features)
x = gelu(x)
x = self.layer_norm(x)
x = self.decoder(x)
return x
@add_start_docstrings(
"""
MPNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
MPNET_START_DOCSTRING,
)
class MPNetForSequenceClassification(MPNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mpnet = MPNetModel(config, add_pooling_layer=False)
self.classifier = MPNetClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mpnet(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
MPNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
MPNET_START_DOCSTRING,
)
class MPNetForMultipleChoice(MPNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.mpnet = MPNetModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = False,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.mpnet(
flat_input_ids,
position_ids=flat_position_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
else:
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
MPNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
MPNET_START_DOCSTRING,
)
class MPNetForTokenClassification(MPNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mpnet = MPNetModel(config, add_pooling_layer=False)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mpnet(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
MPNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MPNET_START_DOCSTRING,
)
class MPNetForQuestionAnswering(MPNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.mpnet = MPNetModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mpnet(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def create_position_ids_from_input_ids(input_ids, padding_idx):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`. :param torch.Tensor x: :return torch.Tensor:
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
return incremental_indices.long() + padding_idx
.\models\mpnet\modeling_tf_mpnet.py
from __future__ import annotations
import math
import warnings
from typing import Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_mpnet import MPNetConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "microsoft/mpnet-base"
_CONFIG_FOR_DOC = "MPNetConfig"
TF_MPNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/mpnet-base",
]
class TFMPNetPreTrainedModel(TFPreTrainedModel):
"""
一个处理权重初始化、下载和加载预训练模型的抽象类。
"""
config_class = MPNetConfig
base_model_prefix = "mpnet"
class TFMPNetEmbeddings(keras.layers.Layer):
"""从单词和位置嵌入构建嵌入层。"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.padding_idx = 1
self.config = config
self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.hidden_size],
initializer=get_initializer(initializer_range=self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
def create_position_ids_from_input_ids(self, input_ids):
"""
根据输入的 token ids 创建位置 ids。非填充符号用它们的位置数字替换。
位置编号从 padding_idx+1 开始,填充符号被忽略。这是从 fairseq 的 `utils.make_positions` 修改而来。
Args:
input_ids: tf.Tensor,输入的 token ids
Returns: tf.Tensor,位置 ids
"""
mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(mask, axis=1) * mask
return incremental_indices + self.padding_idx
def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False):
"""
根据输入张量应用嵌入。
Returns:
final_embeddings (`tf.Tensor`): 输出的嵌入张量。
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if position_ids is None:
if input_ids is not None:
position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
else:
position_ids = tf.expand_dims(
tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
final_embeddings = inputs_embeds + position_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
class TFMPNetPooler(keras.layers.Layer):
def __init__(self, config: MPNetConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFMPNetSelfAttention(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads}"
)
self.num_attention_heads = config.num_attention_heads
assert config.hidden_size % config.num_attention_heads == 0
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.q = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="q"
)
self.k = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="k"
)
self.v = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="v"
)
self.o = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o"
)
self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.config = config
def transpose_for_scores(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, hidden_states, attention_mask, head_mask, output_attentions, position_bias=None, training=False):
batch_size = shape_list(hidden_states)[0]
q = self.q(hidden_states)
k = self.k(hidden_states)
v = self.v(hidden_states)
q = self.transpose_for_scores(q, batch_size)
k = self.transpose_for_scores(k, batch_size)
v = self.transpose_for_scores(v, batch_size)
attention_scores = tf.matmul(q, k, transpose_b=True)
dk = tf.cast(shape_list(k)[-1], attention_scores.dtype)
attention_scores = attention_scores / tf.math.sqrt(dk)
if position_bias is not None:
attention_scores += position_bias
if attention_mask is not None:
attention_scores = attention_scores + attention_mask
attention_probs = stable_softmax(attention_scores, axis=-1)
attention_probs = self.dropout(attention_probs, training=training)
if head_mask is not None:
attention_probs = attention_probs * head_mask
c = tf.matmul(attention_probs, v)
c = tf.transpose(c, perm=[0, 2, 1, 3])
c = tf.reshape(c, (batch_size, -1, self.all_head_size))
o = self.o(c)
outputs = (o, attention_probs) if output_attentions else (o,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "q", None) is not None:
with tf.name_scope(self.q.name):
self.q.build([None, None, self.config.hidden_size])
if getattr(self, "k", None) is not None:
with tf.name_scope(self.k.name):
self.k.build([None, None, self.config.hidden_size])
if getattr(self, "v", None) is not None:
with tf.name_scope(self.v.name):
self.v.build([None, None, self.config.hidden_size])
if getattr(self, "o", None) is not None:
with tf.name_scope(self.o.name):
self.o.build([None, None, self.config.hidden_size])
class TFMPNetAttention(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.attn = TFMPNetSelfAttention(config, name="attn")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.config = config
def prune_heads(self, heads):
raise NotImplementedError
def call(self, input_tensor, attention_mask, head_mask, output_attentions, position_bias=None, training=False):
self_outputs = self.attn(
input_tensor, attention_mask, head_mask, output_attentions, position_bias=position_bias, training=training
)
attention_output = self.LayerNorm(self.dropout(self_outputs[0]) + input_tensor)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFMPNetIntermediate(keras.layers.Layer):
def __init__(self, config: MPNetConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFMPNetOutput(keras.layers.Layer):
def __init__(self, config: MPNetConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFMPNetLayer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.attention = TFMPNetAttention(config, name="attention")
self.intermediate = TFMPNetIntermediate(config, name="intermediate")
self.out = TFMPNetOutput(config, name="output")
def call(self, hidden_states, attention_mask, head_mask, output_attentions, position_bias=None, training=False):
self_attention_outputs = self.attention(
hidden_states, attention_mask, head_mask, output_attentions, position_bias=position_bias, training=training
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
intermediate_output = self.intermediate(attention_output)
layer_output = self.out(intermediate_output, attention_output, training=training)
outputs = (layer_output,) + outputs
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "out", None) is not None:
with tf.name_scope(self.out.name):
self.out.build(None)
class TFMPNetEncoder(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.n_heads = config.num_attention_heads
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.initializer_range = config.initializer_range
self.layer = [TFMPNetLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
self.relative_attention_num_buckets = config.relative_attention_num_buckets
def build(self, input_shape=None):
if self.built:
return
self.built = True
with tf.name_scope("relative_attention_bias"):
self.relative_attention_bias = self.add_weight(
name="embeddings",
shape=[self.relative_attention_num_buckets, self.n_heads],
initializer=get_initializer(self.initializer_range),
)
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
def call(
self,
hidden_states,
attention_mask,
head_mask,
output_attentions,
output_hidden_states,
return_dict,
training=False,
):
position_bias = self.compute_position_bias(hidden_states)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
hidden_states,
attention_mask,
head_mask[i],
output_attentions,
position_bias=position_bias,
training=training,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_attentions
)
@staticmethod
def _relative_position_bucket(relative_position, num_buckets=32, max_distance=128):
ret = 0
n = -relative_position
num_buckets //= 2
ret += tf.cast(tf.math.less(n, 0), dtype=relative_position.dtype) * num_buckets
n = tf.math.abs(n)
max_exact = num_buckets // 2
is_small = tf.math.less(n, max_exact)
val_if_large = max_exact + tf.cast(
tf.math.log(n / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact),
dtype=relative_position.dtype,
)
val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
ret += tf.where(is_small, n, val_if_large)
return ret
def compute_position_bias(self, x, position_ids=None):
"""Compute binned relative position bias"""
input_shape = shape_list(x)
qlen, klen = input_shape[1], input_shape[1]
if position_ids is not None:
context_position = position_ids[:, :, None]
memory_position = position_ids[:, None, :]
else:
context_position = tf.range(qlen)[:, None]
memory_position = tf.range(klen)[None, :]
relative_position = memory_position - context_position
rp_bucket = self._relative_position_bucket(
relative_position,
num_buckets=self.relative_attention_num_buckets,
)
values = tf.gather(self.relative_attention_bias, rp_bucket)
values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0)
return values
@keras_serializable
class TFMPNetMainLayer(keras.layers.Layer):
config_class = MPNetConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.num_hidden_layers = config.num_hidden_layers
self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.encoder = TFMPNetEncoder(config, name="encoder")
self.pooler = TFMPNetPooler(config, name="pooler")
self.embeddings = TFMPNetEmbeddings(config, name="embeddings")
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
def call(
self,
input_ids=None,
attention_mask=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
...
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
MPNET_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
"""
MPNET_INPUTS_DOCSTRING = r"""
Args:
input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
[`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@add_start_docstrings(
"The bare MPNet Model transformer outputting raw hidden-states without any specific head on top.",
MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")
)
MPNET_START_DOCSTRING,
注释:
)
class TFMPNetModel(TFMPNetPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.mpnet = TFMPNetMainLayer(config, name="mpnet")
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: Optional[Union[np.array, tf.Tensor]] = None,
position_ids: Optional[Union[np.array, tf.Tensor]] = None,
head_mask: Optional[Union[np.array, tf.Tensor]] = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
outputs = self.mpnet(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
class TFMPNetLMHead(keras.layers.Layer):
"""MPNet head for masked and permuted language modeling"""
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.config = config
self.hidden_size = config.hidden_size
self.dense = keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.act = get_tf_activation("gelu")
self.decoder = input_embeddings
def build(self, input_shape=None):
self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "layer_norm", None) is not None:
with tf.name_scope(self.layer_norm.name):
self.layer_norm.build([None, None, self.config.hidden_size])
def get_output_embeddings(self):
return self.decoder
def set_output_embeddings(self, value):
self.decoder.weight = value
self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self):
return {"bias": self.bias}
def set_bias(self, value):
self.bias = value["bias"]
self.config.vocab_size = shape_list(value["bias"])[0]
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.layer_norm(hidden_states)
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
return hidden_states
@add_start_docstrings("""MPNet Model with a `language modeling` head on top.""", MPNET_START_DOCSTRING)
class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss):
_keys_to_ignore_on_load_missing = [r"pooler"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.mpnet = TFMPNetMainLayer(config, name="mpnet")
self.lm_head = TFMPNetLMHead(config, self.mpnet.embeddings, name="lm_head")
def get_lm_head(self):
return self.lm_head
def get_prefix_bias_name(self):
warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
return self.name + "/" + self.lm_head.name
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: tf.Tensor | None = None,
training: bool = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
outputs = self.mpnet(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "lm_head", None) is not None:
with tf.name_scope(self.lm_head.name):
self.lm_head.build(None)
class TFMPNetClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.out_proj = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, features, training=False):
x = features[:, 0, :]
x = self.dropout(x, training=training)
x = self.dense(x)
x = self.dropout(x, training=training)
x = self.out_proj(x)
return x
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "out_proj", None) is not None:
with tf.name_scope(self.out_proj.name):
self.out_proj.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
MPNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
MPNET_START_DOCSTRING,
)
class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassificationLoss):
_keys_to_ignore_on_load_missing = [r"pooler"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.mpnet = TFMPNetMainLayer(config, name="mpnet")
self.classifier = TFMPNetClassificationHead(config, name="classifier")
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: Optional[Union[np.array, tf.Tensor]] = None,
position_ids: Optional[Union[np.array, tf.Tensor]] = None,
head_mask: Optional[Union[np.array, tf.Tensor]] = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: tf.Tensor | None = None,
training: bool = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs = self.mpnet(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output, training=training)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build(None)
"""
MPNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
@add_start_docstrings(
"""
MPNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
MPNET_START_DOCSTRING,
)
class TFMPNetForMultipleChoice(TFMPNetPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.mpnet = TFMPNetMainLayer(config, name="mpnet")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: tf.Tensor | None = None,
training: bool = False,
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: tf.Tensor | None = None,
training: bool = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs_embeds = (
tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
outputs = self.mpnet(
flat_input_ids,
flat_attention_mask,
flat_position_ids,
head_mask,
flat_inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
MPNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
MPNET_START_DOCSTRING,
)
class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificationLoss):
_keys_to_ignore_on_load_missing = [r"pooler"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.mpnet = TFMPNetMainLayer(config, name="mpnet")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: tf.Tensor | None = None,
training: bool = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
outputs = self.mpnet(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
MPNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MPNET_START_DOCSTRING,
)
class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLoss):
_keys_to_ignore_on_load_missing = [r"pooler"]
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.mpnet = TFMPNetMainLayer(config, name="mpnet")
self.qa_outputs = keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: Optional[Union[np.array, tf.Tensor]] = None,
position_ids: Optional[Union[np.array, tf.Tensor]] = None,
head_mask: Optional[Union[np.array, tf.Tensor]] = None,
inputs_embeds: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: tf.Tensor | None = None,
end_positions: tf.Tensor | None = None,
training: bool = False,
**kwargs,
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
outputs = self.mpnet(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions, "end_position": end_positions}
loss = self.hf_compute_loss(labels, (start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "mpnet", None) is not None:
with tf.name_scope(self.mpnet.name):
self.mpnet.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
.\models\mpnet\tokenization_mpnet.py
"""Tokenization classes for MPNet."""
import collections
import os
import unicodedata
from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/mpnet-base": "https://huggingface.co/microsoft/mpnet-base/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/mpnet-base": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/mpnet-base": {"do_lower_case": True},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class MPNetTokenizer(PreTrainedTokenizer):
"""
This tokenizer inherits from [`BertTokenizer`] which contains most of the methods. Users should refer to the
superclass for more information regarding methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="[UNK]",
pad_token="<pad>",
mask_token="<mask>",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
@property
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
vocab = self.added_tokens_encoder.copy()
vocab.update(self.vocab)
return vocab
def _tokenize(self, text):
"""
Tokenizes a single text string into a list of tokens using both basic and wordpiece tokenization.
Args:
text (str): The input text to be tokenized.
Returns:
List[str]: List of tokens after tokenization.
"""
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""
Converts a token (str) to its corresponding ID using the vocabulary.
Args:
token (str): The token to be converted.
Returns:
int: The ID corresponding to the token. Defaults to the ID of unknown token if not found.
"""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""
Converts an index (integer) to its corresponding token (str) using the vocabulary.
Args:
index (int): The index to be converted.
Returns:
str: The token corresponding to the index. Defaults to the unknown token if not found.
"""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""
Converts a sequence of tokens (list of str) into a single string, removing prefix '##'.
Args:
tokens (List[str]): List of tokens to be joined into a string.
Returns:
str: The concatenated string of tokens.
"""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Builds model input from sequences, adding special tokens for sequence classification tasks in MPNet format.
Args:
token_ids_0 (`List[int]`):
List of input IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional list of input IDs for the second sequence (for pair tasks).
Returns:
`List[int]`: List of input IDs with added special tokens appropriate for MPNet.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves a mask indicating the special tokens from input token IDs.
Args:
token_ids_0 (`List[int]`):
List of input IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional list of input IDs for the second sequence (for pair tasks).
already_has_special_tokens (`bool`):
Flag indicating whether the input already includes special tokens.
Returns:
`List[int]`: Mask indicating special tokens (1 for special tokens, 0 otherwise).
"""
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` methods.
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens