Transformers Source Code Analysis (Part 53)
.\models\glpn\convert_glpn_to_pytorch.py
import argparse
from collections import OrderedDict
from pathlib import Path
import requests
import torch
from PIL import Image
from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def rename_keys(state_dict):
new_state_dict = OrderedDict()
for key, value in state_dict.items():
if key.startswith("module.encoder"):
key = key.replace("module.encoder", "glpn.encoder")
if key.startswith("module.decoder"):
key = key.replace("module.decoder", "decoder.stages")
if "patch_embed" in key:
idx = key[key.find("patch_embed") + len("patch_embed")]
key = key.replace(f"patch_embed{idx}", f"patch_embeddings.{int(idx)-1}")
if "norm" in key:
key = key.replace("norm", "layer_norm")
if "glpn.encoder.layer_norm" in key:
idx = key[key.find("glpn.encoder.layer_norm") + len("glpn.encoder.layer_norm")]
key = key.replace(f"layer_norm{idx}", f"layer_norm.{int(idx)-1}")
if "layer_norm1" in key:
key = key.replace("layer_norm1", "layer_norm_1")
if "layer_norm2" in key:
key = key.replace("layer_norm2", "layer_norm_2")
if "block" in key:
idx = key[key.find("block") + len("block")]
key = key.replace(f"block{idx}", f"block.{int(idx)-1}")
if "attn.q" in key:
key = key.replace("attn.q", "attention.self.query")
if "attn.proj" in key:
key = key.replace("attn.proj", "attention.output.dense")
if "attn" in key:
key = key.replace("attn", "attention.self")
if "fc1" in key:
key = key.replace("fc1", "dense1")
if "fc2" in key:
key = key.replace("fc2", "dense2")
if "linear_pred" in key:
key = key.replace("linear_pred", "classifier")
if "linear_fuse" in key:
key = key.replace("linear_fuse.conv", "linear_fuse")
key = key.replace("linear_fuse.bn", "batch_norm")
if "linear_c" in key:
idx = key[key.find("linear_c") + len("linear_c")]
key = key.replace(f"linear_c{idx}", f"linear_c.{int(idx)-1}")
if "bot_conv" in key:
key = key.replace("bot_conv", "0.convolution")
if "skip_conv1" in key:
key = key.replace("skip_conv1", "1.convolution")
if "skip_conv2" in key:
key = key.replace("skip_conv2", "2.convolution")
if "fusion1" in key:
key = key.replace("fusion1", "1.fusion")
if "fusion2" in key:
key = key.replace("fusion2", "2.fusion")
if "fusion3" in key:
key = key.replace("fusion3", "3.fusion")
if "fusion" in key and "conv" in key:
key = key.replace("conv", "convolutional_layer")
if key.startswith("module.last_layer_depth"):
key = key.replace("module.last_layer_depth", "head.head")
new_state_dict[key] = value
return new_state_dict
def read_in_k_v(state_dict, config):
for i in range(config.num_encoder_blocks):
for j in range(config.depths[i]):
kv_weight = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.weight")
kv_bias = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.bias")
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.weight"] = kv_weight[: config.hidden_sizes[i], :]
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.bias"] = kv_bias[: config.hidden_sizes[i]]
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.weight"] = kv_weight[config.hidden_sizes[i] :, :]
state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.bias"] = kv_bias[config.hidden_sizes[i]:]
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
return image
@torch.no_grad()
def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub=False, model_name=None):
"""
Copy/paste/tweak the original checkpoint's weights into our GLPN structure.
"""
config = GLPNConfig(hidden_sizes=[64, 128, 320, 512], decoder_hidden_size=64, depths=[3, 8, 27, 3])
image_processor = GLPNImageProcessor()
image = prepare_img()
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
logger.info("Converting model...")
state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
state_dict = rename_keys(state_dict)
read_in_k_v(state_dict, config)
model = GLPNForDepthEstimation(config)
model.load_state_dict(state_dict)
model.eval()
outputs = model(pixel_values)
predicted_depth = outputs.predicted_depth
if model_name is not None:
if "nyu" in model_name:
expected_slice = torch.tensor(
[[4.4147, 4.0873, 4.0673], [3.7890, 3.2881, 3.1525], [3.7674, 3.5423, 3.4913]]
)
elif "kitti" in model_name:
expected_slice = torch.tensor(
[[3.4291, 2.7865, 2.5151], [3.2841, 2.7021, 2.3502], [3.1147, 2.4625, 2.2481]]
)
else:
raise ValueError(f"Unknown model name: {model_name}")
expected_shape = torch.Size([1, 480, 640])
assert predicted_depth.shape == expected_shape
assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4)
print("Looks ok!")
if push_to_hub:
logger.info("Pushing model and image processor to the hub...")
model.push_to_hub(
repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
organization="nielsr",
commit_message="Add model",
use_temp_dir=True,
)
image_processor.push_to_hub(
repo_path_or_name=Path(pytorch_dump_folder_path, model_name),
organization="nielsr",
commit_message="Add image processor",
use_temp_dir=True,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--checkpoint_path",
default=None,
type=str,
help="Path to the original PyTorch checkpoint (.pth file).",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to upload the model to the HuggingFace hub."
)
parser.add_argument(
"--model_name",
default="glpn-kitti",
type=str,
help="Name of the model in case you're pushing to the hub.",
)
args = parser.parse_args()
convert_glpn_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name)
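# Hypothetical invocation (paths are placeholders), run from the command line:
#   python convert_glpn_to_pytorch.py \
#       --checkpoint_path ./glpn_kitti.pth \
#       --pytorch_dump_folder_path ./glpn-kitti-converted \
#       --model_name glpn-kitti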
.\models\glpn\feature_extraction_glpn.py
"""GLPN 的特征提取器类。"""
import warnings
from ...utils import logging
from .image_processing_glpn import GLPNImageProcessor
logger = logging.get_logger(__name__)
class GLPNFeatureExtractor(GLPNImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class GLPNFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
" use GLPNImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\glpn\image_processing_glpn.py
"""GLPN 的图像处理类。"""
from typing import List, Optional, Union
import numpy as np
import PIL.Image
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class GLPNImageProcessor(BaseImageProcessor):
r"""
Constructs a GLPN image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions, rounding them down to the closest multiple of `size_divisor`.
Can be overridden by `do_resize` in `preprocess`.
size_divisor (`int`, *optional*, defaults to 32):
When `do_resize` is `True`, images are resized so their height and width are rounded down to the closest multiple of `size_divisor`.
Can be overridden by `size_divisor` in `preprocess`.
resample (`PIL.Image` resampling filter, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether or not to apply the scaling factor (to make pixel values floats between 0 and 1). Can be overridden by `do_rescale` in `preprocess`.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size_divisor: int = 32,
resample=PILImageResampling.BILINEAR,
do_rescale: bool = True,
**kwargs,
) -> None:
self.do_resize = do_resize
self.do_rescale = do_rescale
self.size_divisor = size_divisor
self.resample = resample
super().__init__(**kwargs)
self._valid_processor_keys = [
"images",
"do_resize",
"size_divisor",
"resample",
"do_rescale",
"return_tensors",
"data_format",
"input_data_format",
]
def resize(
self,
image: np.ndarray,
size_divisor: int,
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resizes the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor.
If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160).
Args:
image (`np.ndarray`):
The image to resize.
size_divisor (`int`):
The image is resized so its height and width are rounded down to the closest multiple of `size_divisor`.
resample:
`PIL.Image` resampling filter to use when resizing the image, e.g. `PILImageResampling.BILINEAR`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the output image. If `None`, the channel dimension format of the input image is used. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not set, the channel dimension format is inferred from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
Returns:
`np.ndarray`: The resized image.
"""
height, width = get_image_size(image, channel_dim=input_data_format)
new_h = height // size_divisor * size_divisor
new_w = width // size_divisor * size_divisor
image = resize(
image,
(new_h, new_w),
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image
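# Quick numeric check of the rounding used by `resize` above (illustrative, not part of the file):
size_divisor, height, width = 32, 260, 170
new_size = (height // size_divisor * size_divisor, width // size_divisor * size_divisor)  # (256, 160)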
def preprocess(
self,
images: Union["PIL.Image.Image", TensorType, List["PIL.Image.Image"], List[TensorType]],
do_resize: Optional[bool] = None,
size_divisor: Optional[int] = None,
resample=None,
do_rescale: Optional[bool] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\glpn\modeling_glpn.py
""" PyTorch GLPN模型。"""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_glpn import GLPNConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "GLPNConfig"
_CHECKPOINT_FOR_DOC = "vinvino02/glpn-kitti"
_EXPECTED_OUTPUT_SHAPE = [1, 512, 15, 20]
GLPN_PRETRAINED_MODEL_ARCHIVE_LIST = [
"vinvino02/glpn-kitti",
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in the main path of residual blocks).
Comment by Ross Wightman: this is the same as the DropConnect implementation I created for EfficientNet, etc.
networks; however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate
paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
... I've opted to change the layer and argument names to 'drop path' rather than mix DropConnect as a layer name
and use 'survival rate' as the argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
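# Small sanity sketch (not part of the file): scaling the surviving samples by 1/keep_prob keeps the
# expected value of the output equal to the input.
x = torch.ones(1000, 1)
out = drop_path(x, drop_prob=0.2, training=True)
print(out.mean())  # close to 1.0; roughly 80% of rows survive, each scaled by 1/0.8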
class GLPNDropPath(nn.Module):
"""每个样本(在残差块的主路径中应用时)丢弃路径(随机深度)。"""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class GLPNOverlapPatchEmbeddings(nn.Module):
"""Construct the overlapping patch embeddings."""
def __init__(self, patch_size, stride, num_channels, hidden_size):
super().__init__()
self.proj = nn.Conv2d(
num_channels,
hidden_size,
kernel_size=patch_size,
stride=stride,
padding=patch_size // 2,
)
self.layer_norm = nn.LayerNorm(hidden_size)
def forward(self, pixel_values):
embeddings = self.proj(pixel_values)
_, _, height, width = embeddings.shape
embeddings = embeddings.flatten(2).transpose(1, 2)
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
class GLPNEfficientSelfAttention(nn.Module):
"""SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
paper](https://arxiv.org/abs/2102.12122)."""
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
super().__init__()
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
f"heads ({self.num_attention_heads})"
)
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(self.hidden_size, self.all_head_size)
self.key = nn.Linear(self.hidden_size, self.all_head_size)
self.value = nn.Linear(self.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.sr_ratio = sequence_reduction_ratio
if sequence_reduction_ratio > 1:
self.sr = nn.Conv2d(
hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
)
self.layer_norm = nn.LayerNorm(hidden_size)
def transpose_for_scores(self, hidden_states):
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
hidden_states = hidden_states.view(new_shape)
return hidden_states.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
height,
width,
output_attentions=False,
):
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.sr_ratio > 1:
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = self.sr(hidden_states)
hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
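# Shape sketch (hypothetical numbers, not part of the file): with sequence_reduction_ratio=8 the
# key/value sequence shrinks by a factor of 8**2, which keeps attention affordable at high resolution.
height, width, sr_ratio = 120, 160, 8
query_len = height * width  # 19200 query positions
kv_len = (height // sr_ratio) * (width // sr_ratio)  # 300 key/value positions after self.sr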
class GLPNSelfOutput(nn.Module):
def __init__(self, config, hidden_size):
super().__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class GLPNAttention(nn.Module):
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
super().__init__()
self.self = GLPNEfficientSelfAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
)
self.output = GLPNSelfOutput(config, hidden_size=hidden_size)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, height, width, output_attentions=False):
self_outputs = self.self(hidden_states, height, width, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class GLPNDWConv(nn.Module):
def __init__(self, dim=768):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
def forward(self, hidden_states, height, width):
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
hidden_states = self.dwconv(hidden_states)
hidden_states = hidden_states.flatten(2).transpose(1, 2)
return hidden_states
class GLPNMixFFN(nn.Module):
def __init__(self, config, in_features, hidden_features=None, out_features=None):
super().__init__()
out_features = out_features or in_features
self.dense1 = nn.Linear(in_features, hidden_features)
self.dwconv = GLPNDWConv(hidden_features)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dense2 = nn.Linear(hidden_features, out_features)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, height, width):
hidden_states = self.dense1(hidden_states)
hidden_states = self.dwconv(hidden_states, height, width)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense2(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class GLPNLayer(nn.Module):
"""这对应于原始实现中的 Block 类。"""
def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
super().__init__()
self.layer_norm_1 = nn.LayerNorm(hidden_size)
self.attention = GLPNAttention(
config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequence_reduction_ratio=sequence_reduction_ratio,
)
self.drop_path = GLPNDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.layer_norm_2 = nn.LayerNorm(hidden_size)
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = GLPNMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
def forward(self, hidden_states, height, width, output_attentions=False):
self_attention_outputs = self.attention(
self.layer_norm_1(hidden_states),
height,
width,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
mlp_output = self.drop_path(mlp_output)
layer_output = mlp_output + hidden_states
outputs = (layer_output,) + outputs
return outputs
class GLPNEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
embeddings = []
for i in range(config.num_encoder_blocks):
embeddings.append(
GLPNOverlapPatchEmbeddings(
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
)
)
self.patch_embeddings = nn.ModuleList(embeddings)
blocks = []
cur = 0
for i in range(config.num_encoder_blocks):
layers = []
if i != 0:
cur += config.depths[i - 1]
for j in range(config.depths[i]):
layers.append(
GLPNLayer(
config,
hidden_size=config.hidden_sizes[i],
num_attention_heads=config.num_attention_heads[i],
drop_path=dpr[cur + j],
sequence_reduction_ratio=config.sr_ratios[i],
mlp_ratio=config.mlp_ratios[i],
)
)
blocks.append(nn.ModuleList(layers))
self.block = nn.ModuleList(blocks)
self.layer_norm = nn.ModuleList(
[nn.LayerNorm(config.hidden_sizes[i]) for i in range(config.num_encoder_blocks)]
)
def forward(
self,
pixel_values,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = pixel_values.shape[0]
hidden_states = pixel_values
for idx, x in enumerate(zip(self.patch_embeddings, self.block, self.layer_norm)):
embedding_layer, block_layer, norm_layer = x
hidden_states, height, width = embedding_layer(hidden_states)
for i, blk in enumerate(block_layer):
layer_outputs = blk(hidden_states, height, width, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
hidden_states = norm_layer(hidden_states)
hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
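# Resolution sketch, assuming the default GLPN strides (4, 2, 2, 2): each stage downsamples further,
# so a 480x640 input yields roughly these stage outputs (illustrative only, not part of the file):
height, width = 480, 640
print(height // 4, width // 4)    # stage 1: 120 x 160
print(height // 8, width // 8)    # stage 2: 60 x 80
print(height // 16, width // 16)  # stage 3: 30 x 40
print(height // 32, width // 32)  # stage 4: 15 x 20, consistent with _EXPECTED_OUTPUT_SHAPE above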
class GLPNPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GLPNConfig
base_model_prefix = "glpn"
main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
GLPN_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`GLPNConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GLPN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`GLPNImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare GLPN encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
GLPN_START_DOCSTRING,
)
class GLPNModel(GLPNPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = GLPNEncoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class GLPNSelectiveFeatureFusion(nn.Module):
"""
Selective Feature Fusion module, as explained in the [paper](https://arxiv.org/abs/2201.07436) (section 3.4). This
module adaptively selects and integrates local and global features by attaining an attention map for each feature.
"""
def __init__(self, in_channel=64):
super().__init__()
self.convolutional_layer1 = nn.Sequential(
nn.Conv2d(in_channels=int(in_channel * 2), out_channels=in_channel, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(in_channel),
nn.ReLU(),
)
self.convolutional_layer2 = nn.Sequential(
nn.Conv2d(in_channels=in_channel, out_channels=int(in_channel / 2), kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(int(in_channel / 2)),
nn.ReLU(),
)
self.convolutional_layer3 = nn.Conv2d(
in_channels=int(in_channel / 2), out_channels=2, kernel_size=3, stride=1, padding=1
)
self.sigmoid = nn.Sigmoid()
def forward(self, local_features, global_features):
features = torch.cat((local_features, global_features), dim=1)
features = self.convolutional_layer1(features)
features = self.convolutional_layer2(features)
features = self.convolutional_layer3(features)
attn = self.sigmoid(features)
hybrid_features = local_features * attn[:, 0, :, :].unsqueeze(1) + global_features * attn[
:, 1, :, :
].unsqueeze(1)
return hybrid_features
class GLPNDecoderStage(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
should_skip = in_channels == out_channels
self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1) if not should_skip else nn.Identity()
self.fusion = GLPNSelectiveFeatureFusion(out_channels)
self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
def forward(self, hidden_state, residual=None):
hidden_state = self.convolution(hidden_state)
if residual is not None:
hidden_state = self.fusion(hidden_state, residual)
hidden_state = self.upsample(hidden_state)
return hidden_state
class GLPNDecoder(nn.Module):
def __init__(self, config):
super().__init__()
reserved_hidden_sizes = config.hidden_sizes[::-1]
out_channels = config.decoder_hidden_size
self.stages = nn.ModuleList(
[GLPNDecoderStage(hidden_size, out_channels) for hidden_size in reserved_hidden_sizes]
)
self.stages[0].fusion = None
self.final_upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]:
stage_hidden_states = []
stage_hidden_state = None
for hidden_state, stage in zip(hidden_states[::-1], self.stages):
stage_hidden_state = stage(hidden_state, stage_hidden_state)
stage_hidden_states.append(stage_hidden_state)
stage_hidden_states[-1] = self.final_upsample(stage_hidden_state)
return stage_hidden_states
class SiLogLoss(nn.Module):
r"""
Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://arxiv.org/abs/1406.2283).
$$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}\right)^{2}$$ where $d_{i}=\log y_{i}-\log
y_{i}^{*}$.
"""
def __init__(self, lambd=0.5):
super().__init__()
self.lambd = lambd
def forward(self, pred, target):
valid_mask = (target > 0).detach()
diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask])
loss = torch.sqrt(torch.pow(diff_log, 2).mean() - self.lambd * torch.pow(diff_log.mean(), 2))
return loss
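# Tiny usage sketch (illustrative values, not part of the file): a perfect prediction yields zero
# loss, since every log-difference d_i is zero; lambd=0.5 corresponds to the 1/(2n^2) term above.
criterion = SiLogLoss()
target = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
print(criterion(target.clone(), target))  # tensor(0.)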
class GLPNDepthEstimationHead(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
channels = config.decoder_hidden_size
self.head = nn.Sequential(
nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1),
nn.ReLU(inplace=False),
nn.Conv2d(channels, 1, kernel_size=3, stride=1, padding=1),
)
def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor:
hidden_states = hidden_states[self.config.head_in_index]
hidden_states = self.head(hidden_states)
predicted_depth = torch.sigmoid(hidden_states) * self.config.max_depth
predicted_depth = predicted_depth.squeeze(dim=1)
return predicted_depth
@add_start_docstrings(
"""GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.""",
GLPN_START_DOCSTRING,
)
class GLPNForDepthEstimation(GLPNPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.glpn = GLPNModel(config)
self.decoder = GLPNDecoder(config)
self.head = GLPNDepthEstimationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(GLPN_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
labels: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs
) -> Union[Tuple[torch.Tensor], DepthEstimatorOutput]:
r"""
labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
Ground truth depth estimation maps for computing the loss.
Returns:
Depending on the configuration and inputs, returns either a tuple with loss and predicted depth,
or a `DepthEstimatorOutput` object containing loss, predicted depth, hidden states, and attentions.
Examples:
```
>>> from transformers import AutoImageProcessor, GLPNForDepthEstimation
>>> import torch
>>> import numpy as np
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
>>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")
>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
... outputs = model(**inputs)
... predicted_depth = outputs.predicted_depth
>>> # interpolate to original size
>>> prediction = torch.nn.functional.interpolate(
... predicted_depth.unsqueeze(1),
... size=image.size[::-1],
... mode="bicubic",
... align_corners=False,
... )
>>> # visualize the prediction
>>> output = prediction.squeeze().cpu().numpy()
>>> formatted = (output * 255 / np.max(output)).astype("uint8")
>>> depth = Image.fromarray(formatted)
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
outputs = self.glpn(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=True,
return_dict=return_dict,
)
hidden_states = outputs.hidden_states if return_dict else outputs[1]
out = self.decoder(hidden_states)
predicted_depth = self.head(out)
loss = None
if labels is not None:
loss_fct = SiLogLoss()
loss = loss_fct(predicted_depth, labels)
if not return_dict:
if output_hidden_states:
output = (predicted_depth,) + outputs[1:]
else:
output = (predicted_depth,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return DepthEstimatorOutput(
loss=loss,
predicted_depth=predicted_depth,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)
.\models\glpn\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_glpn": ["GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP", "GLPNConfig"]}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_glpn"] = ["GLPNFeatureExtractor"]
_import_structure["image_processing_glpn"] = ["GLPNImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_glpn"] = [
"GLPN_PRETRAINED_MODEL_ARCHIVE_LIST",
"GLPNForDepthEstimation",
"GLPNLayer",
"GLPNModel",
"GLPNPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_glpn import GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP, GLPNConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_glpn import GLPNFeatureExtractor
from .image_processing_glpn import GLPNImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_glpn import (
GLPN_PRETRAINED_MODEL_ARCHIVE_LIST,
GLPNForDepthEstimation,
GLPNLayer,
GLPNModel,
GLPNPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\gpt2\configuration_gpt2.py
""" OpenAI GPT-2 configuration"""
from collections import OrderedDict
from typing import Any, List, Mapping, Optional
from ... import PreTrainedTokenizer, TensorType, is_torch_available
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfigWithPast, PatchingSpec
from ...utils import logging
logger = logging.get_logger(__name__)
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"openai-community/gpt2": "https://huggingface.co/openai-community/gpt2/resolve/main/config.json",
"openai-community/gpt2-medium": "https://huggingface.co/openai-community/gpt2-medium/resolve/main/config.json",
"openai-community/gpt2-large": "https://huggingface.co/openai-community/gpt2-large/resolve/main/config.json",
"openai-community/gpt2-xl": "https://huggingface.co/openai-community/gpt2-xl/resolve/main/config.json",
"distilbert/distilgpt2": "https://huggingface.co/distilbert/distilgpt2/resolve/main/config.json",
}
class GPT2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the GPT-2
[openai-community/gpt2](https://huggingface.co/openai-community/gpt2) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import GPT2Config, GPT2Model
>>> # Initializing a GPT2 configuration
>>> configuration = GPT2Config()
>>> # Initializing a model (with random weights) from the configuration
>>> model = GPT2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "gpt2"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"hidden_size": "n_embd",
"max_position_embeddings": "n_positions",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size=50257,
n_positions=1024,
n_embd=768,
n_layer=12,
n_head=12,
n_inner=None,
activation_function="gelu_new",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
summary_type="cls_index",
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
scale_attn_weights=True,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
scale_attn_by_inverse_layer_idx=False,
reorder_and_upcast_attn=False,
**kwargs,
):
self.vocab_size = vocab_size
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_inner = n_inner
self.activation_function = activation_function
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
self.reorder_and_upcast_attn = reorder_and_upcast_attn
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
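# Quick check (illustrative, not part of the file) of the attribute_map defined above: the generic
# PretrainedConfig attribute names resolve to the GPT-2 specific ones.
config = GPT2Config(n_embd=768, n_layer=12, n_head=12)
assert config.hidden_size == config.n_embd == 768
assert config.num_hidden_layers == config.n_layer == 12
assert config.num_attention_heads == config.n_head == 12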
class GPT2OnnxConfig(OnnxConfigWithPast):
def __init__(
self,
config: PretrainedConfig,
task: str = "default",
patching_specs: List[PatchingSpec] = None,
use_past: bool = False,
):
super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
if not getattr(self._config, "pad_token_id", None):
self._config.pad_token_id = 0
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
else:
common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
return common_inputs
@property
def num_layers(self) -> int:
return self._config.n_layer
@property
def num_attention_heads(self) -> int:
return self._config.n_head
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
past_shape = (
batch,
self.num_attention_heads,
past_key_values_length,
self._config.hidden_size // self.num_attention_heads,
)
ordered_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
]
ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
if self.use_past:
mask_dtype = ordered_inputs["attention_mask"].dtype
ordered_inputs["attention_mask"] = torch.cat(
[ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
return ordered_inputs
@property
def default_onnx_opset(self) -> int:
return 13
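# Illustration (not part of the file, no actual ONNX export performed): with use_past=True the
# dynamic axes declared by `inputs` cover both the cached and the newly fed tokens.
config = GPT2Config()
onnx_config = GPT2OnnxConfig(config, use_past=True)
print(onnx_config.inputs["attention_mask"])  # {0: 'batch', 1: 'past_sequence + sequence'}
print(onnx_config.num_layers, onnx_config.num_attention_heads)  # 12 12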
.\models\gpt2\convert_gpt2_original_tf_checkpoint_to_pytorch.py
"""Convert OpenAI GPT checkpoint."""
import argparse
import torch
from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2
from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
logging.set_verbosity_info()
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
if gpt2_config_file == "":
config = GPT2Config()
else:
config = GPT2Config.from_json_file(gpt2_config_file)
model = GPT2Model(config)
load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
print(f"Save PyTorch model to {pytorch_weights_dump_path}")
torch.save(model.state_dict(), pytorch_weights_dump_path)
print(f"Save configuration file to {pytorch_config_dump_path}")
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--gpt2_config_file",
default="",
type=str,
help=(
"An optional config json file corresponding to the pre-trained OpenAI model. \n"
"This specifies the model architecture."
),
)
args = parser.parse_args()
convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path)
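# Hypothetical invocation (paths are placeholders):
#   python convert_gpt2_original_tf_checkpoint_to_pytorch.py \
#       --gpt2_checkpoint_path /path/to/tf_checkpoint \
#       --pytorch_dump_folder_path /path/to/output_dir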
.\models\gpt2\modeling_flax_gpt2.py
from typing import Any, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from ...modeling_flax_outputs import (
FlaxBaseModelOutputWithPastAndCrossAttentions,
FlaxCausalLMOutputWithCrossAttentions,
)
from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_gpt2 import GPT2Config
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
_CONFIG_FOR_DOC = "GPT2Config"
GPT2_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
Parameters:
config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
inference on GPUs or TPUs. If specified, all the computation will be performed with the given `dtype`.
**Note that this only specifies the dtype of the computation and does not influence the dtype of the model
parameters.**
If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
[`~FlaxPreTrainedModel.to_bf16`].
"""
class FlaxConv1D(nn.Module):
# GPT-2 style 1D convolution: effectively a dense layer whose kernel is stored transposed.
features: int
use_bias: bool = True
dtype: Any = jnp.float32
precision: Any = None
@nn.compact
def __call__(self, inputs):
# Cast the inputs to the compute dtype
inputs = jnp.asarray(inputs, self.dtype)
# Initialize the weight matrix with shape (features, input_shape[-1])
kernel = self.param("kernel", jax.nn.initializers.normal(stddev=0.02), (self.features, inputs.shape[-1]))
kernel = jnp.asarray(kernel.transpose(), self.dtype)
# Multiply the inputs with the (transposed) kernel via dot_general
y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())), precision=self.precision)
# Optionally add a bias term of shape (features,)
if self.use_bias:
bias = self.param("bias", jax.nn.initializers.zeros, (self.features,))
bias = jnp.asarray(bias, self.dtype)
y = y + bias
# Return the projected output
return y
class FlaxGPT2Attention(nn.Module):
# GPT-2 attention module: handles (causal) self-attention and, optionally, cross-attention.
# Model configuration
config: GPT2Config
# Computation dtype, defaults to float32
dtype: jnp.dtype = jnp.float32
# Whether to apply a causal (autoregressive) mask, defaults to True
causal: bool = True
# Whether this layer is used as cross-attention, defaults to False
is_cross_attention: bool = False
def setup(self):
# Read parameters from the configuration
config = self.config
# Embedding dimension equals the hidden size
self.embed_dim = config.hidden_size
# Number of attention heads from the configuration
self.num_heads = config.num_attention_heads
# Dimension of each attention head
self.head_dim = self.embed_dim // self.num_heads
if self.is_cross_attention:
# Cross-attention: c_attn produces key/value (2x embed_dim), q_attn produces the query
self.c_attn = FlaxConv1D(2 * self.embed_dim, dtype=self.dtype)
self.q_attn = FlaxConv1D(self.embed_dim, dtype=self.dtype)
else:
# Self-attention: a single projection produces query, key and value (3x embed_dim)
self.c_attn = FlaxConv1D(3 * self.embed_dim, dtype=self.dtype)
# Output projection back to embed_dim
self.c_proj = FlaxConv1D(self.embed_dim, dtype=self.dtype)
# Dropout applied to the residual branch, with the configured residual dropout probability
self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
if self.causal:
# Boolean causal mask of shape (1, max_position_embeddings)
self.causal_mask = make_causal_mask(
jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool"
)
# Split the last dimension of the hidden states into (num_heads, head_dim)
def _split_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
# Merge the per-head dimensions back into a single embed_dim axis
def _merge_heads(self, hidden_states):
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
# nn.compact lets this method define its own cache variables inline
@nn.compact
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# Detect whether the cache has already been initialized by checking for the "cached_key" variable
is_initialized = self.has_variable("cache", "cached_key")
# Create (or fetch) "cached_key", initialized to zeros with the same shape/dtype as key
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
# Create (or fetch) "cached_value", initialized to zeros with the same shape/dtype as value
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# Create (or fetch) "cache_index", initialized to the integer 0
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# Read batch dims, max sequence length, number of heads and per-head depth from the existing cache
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# Update the key and value caches with the new 1d spatial slice at the current index
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# Write the updated key and value back to the cache
cached_key.value = key
cached_value.value = value
# Advance the cache index by the number of cache vectors that were just written
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# Causal mask for cached decoder self-attention: the current query position may only attend to
# key positions that have already been generated and cached, not to the remaining zero entries.
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
# Combine the causal pad mask with the incoming attention mask
attention_mask = combine_masks(pad_mask, attention_mask)
# Return the updated key, value and attention mask
return key, value, attention_mask
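# Standalone sketch of the cache update above (hypothetical shapes, not part of the file): a single
# freshly projected key is written into a pre-allocated cache at the current index.
cached = jnp.zeros((1, 8, 2, 4))  # (batch, max_length, num_heads, head_dim)
new_key = jnp.ones((1, 1, 2, 4))  # one new token
updated = lax.dynamic_update_slice(cached, new_key, (0, 3, 0, 0))  # written at position 3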
# GPT-2 feed-forward (MLP) block
class FlaxGPT2MLP(nn.Module):
# Model configuration
config: GPT2Config
# Size of the intermediate (hidden) layer
intermediate_size: int
# Computation dtype, defaults to float32
dtype: jnp.dtype = jnp.float32
def setup(self):
# The embedding dimension equals the configured hidden size
embed_dim = self.config.hidden_size
# Up-projection to intermediate_size
self.c_fc = FlaxConv1D(self.intermediate_size, dtype=self.dtype)
# Down-projection back to embed_dim
self.c_proj = FlaxConv1D(embed_dim, dtype=self.dtype)
# Activation function selected from the configuration
self.act = ACT2FN[self.config.activation_function]
# Dropout with the configured residual dropout rate
self.dropout = nn.Dropout(rate=self.config.resid_pdrop)
def __call__(self, hidden_states, deterministic: bool = True):
# Up-project, activate, down-project, then apply dropout
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
return hidden_states
# A single GPT-2 transformer block
class FlaxGPT2Block(nn.Module):
# Model configuration
config: GPT2Config
# Computation dtype, defaults to float32
dtype: jnp.dtype = jnp.float32
def setup(self):
# Hidden size from the configuration
hidden_size = self.config.hidden_size
# Inner (MLP) dimension: n_inner if set, otherwise 4 * hidden_size
inner_dim = self.config.n_inner if self.config.n_inner is not None else 4 * hidden_size
# First LayerNorm, using the configured layer-norm epsilon
self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# (Causal) self-attention layer
self.attn = FlaxGPT2Attention(self.config, dtype=self.dtype)
# Second LayerNorm, applied before the MLP
self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
if self.config.add_cross_attention:
# Optional cross-attention layer: non-causal and marked as cross-attention
self.crossattention = FlaxGPT2Attention(
config=self.config, dtype=self.dtype, causal=False, is_cross_attention=True
)
# LayerNorm applied before the cross-attention layer
self.ln_cross_attn = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# Feed-forward block with the inner dimension computed above
self.mlp = FlaxGPT2MLP(self.config, inner_dim, dtype=self.dtype)
# Forward pass: self-attention, optional cross-attention, then the MLP, each with a residual connection
def __call__(
self,
hidden_states,
attention_mask=None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
):
residual = hidden_states  # keep the current hidden states as the start of the residual connection
hidden_states = self.ln_1(hidden_states)  # LayerNorm before self-attention
attn_outputs = self.attn(
hidden_states,
attention_mask=attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# Unpack the self-attention outputs
attn_output = attn_outputs[0]  # attention output, first element
outputs = attn_outputs[1:]  # remaining outputs, e.g. attention weights
# Residual connection
hidden_states = attn_output + residual
# Cross-Attention Block
if encoder_hidden_states is not None:
# Add the cross-attention block
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
"cross-attention layers by setting `config.add_cross_attention=True`"
)
residual = hidden_states  # keep the current hidden states for the residual connection
hidden_states = self.ln_cross_attn(hidden_states)  # LayerNorm before cross-attention
cross_attn_outputs = self.crossattention(
hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
)
# Unpack the cross-attention output
attn_output = cross_attn_outputs[0]
# Residual connection
hidden_states = residual + attn_output
# Append the cross-attention weights to the outputs (if attentions are returned)
outputs = outputs + cross_attn_outputs[1:]
residual = hidden_states  # keep the current hidden states for the residual connection
hidden_states = self.ln_2(hidden_states)  # LayerNorm before the MLP
feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
# Residual connection
hidden_states = residual + feed_forward_hidden_states
outputs = (hidden_states,) + outputs  # final hidden states plus any extra outputs
return outputs
# Abstract class, inheriting from FlaxPreTrainedModel, that handles weight initialization and provides
# the interface for downloading and loading pretrained models
class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
# Configuration class for this model
config_class = GPT2Config
# Prefix of the base model
base_model_prefix = "transformer"
# Concrete module class, set by subclasses
module_class: nn.Module = None
def __init__(
self,
config: GPT2Config,
input_shape: Tuple = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# Build the module with the given configuration and keyword arguments
module = self.module_class(config=config, dtype=dtype, **kwargs)
# Call the parent initializer with the config, module, input shape, seed, dtype and init flag
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# Initialize dummy input tensors
input_ids = jnp.zeros(input_shape, dtype="i4")
# Attention mask of ones with the same shape as input_ids
attention_mask = jnp.ones_like(input_ids)
# Position ids broadcast from the sequence length of input_ids
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
# Split the rng into one stream for parameters and one for dropout
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
if self.config.add_cross_attention:
# With cross-attention enabled, initialize encoder hidden states as zeros and reuse the attention mask
encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
encoder_attention_mask = attention_mask
# Initialize the module parameters with the encoder inputs included
module_init_outputs = self.module.init(
rngs,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states,
encoder_attention_mask,
return_dict=False,
)
else:
# Otherwise initialize the parameters with input_ids, attention_mask and position_ids only
module_init_outputs = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)
# Randomly initialized parameters
random_params = module_init_outputs["params"]
if params is not None:
# If pretrained params were provided, flatten/unfreeze both dicts and fill in any missing keys
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
# Return the frozen, unflattened parameter dict
return freeze(unflatten_dict(params))
else:
# Otherwise return the randomly initialized parameters
return random_params
def init_cache(self, batch_size, max_length):
r"""
Args:
batch_size (`int`):
batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
max_length (`int`):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
"""
# Build dummy input variables used only to retrieve the cache
# input_ids: a (batch_size, max_length) matrix of ones
input_ids = jnp.ones((batch_size, max_length))
# Attention mask of ones with the same shape as input_ids
attention_mask = jnp.ones_like(input_ids)
# Position ids broadcast from the sequence length, shaped like input_ids
position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
# Initialize the module variables with init_cache=True so the cache collection is created
init_variables = self.module.init(
jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
)
# Return the unfrozen cache collection
return unfreeze(init_variables["cache"])
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
def __call__(
self,
input_ids,
attention_mask=None,
position_ids=None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
params: dict = None,
past_key_values: dict = None,
dropout_rng: jax.random.PRNGKey = None,
train: bool = False,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# 如果未显式提供 `output_attentions`,则使用配置中的设定
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果未显式提供 `output_hidden_states`,则使用配置中的设定
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果未显式提供 `return_dict`,则使用配置中的设定
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 如果提供了 `encoder_hidden_states` 但未提供 `encoder_attention_mask`,则创建一个全为1的注意力掩码
if encoder_hidden_states is not None and encoder_attention_mask is None:
batch_size, sequence_length = encoder_hidden_states.shape[:2]
encoder_attention_mask = jnp.ones((batch_size, sequence_length))
# 获取输入张量的批量大小和序列长度
batch_size, sequence_length = input_ids.shape
# 如果未提供 `position_ids`
if position_ids is None:
# 如果提供了 `past_key_values` 但未提供 `position_ids`,则引发错误
if past_key_values is not None:
raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
# 使用序列长度创建广播后的位置编码
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 如果未提供 `attention_mask`,则创建一个全为1的注意力掩码
if attention_mask is None:
attention_mask = jnp.ones((batch_size, sequence_length))
# 处理任何需要的伪随机数生成器
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
# 准备输入参数字典
inputs = {"params": params or self.params}
# 如果提供了 `past_key_values`,则将其作为缓存传递给模块,同时确保缓存是可变的
if past_key_values:
inputs["cache"] = past_key_values
mutable = ["cache"]
else:
mutable = False
# 应用模块的前向传播函数,计算输出
outputs = self.module.apply(
inputs,
jnp.array(input_ids, dtype="i4"),
jnp.array(attention_mask, dtype="i4"),
jnp.array(position_ids, dtype="i4"),
encoder_hidden_states,
encoder_attention_mask,
not train,
False,
output_attentions,
output_hidden_states,
return_dict,
rngs=rngs,
mutable=mutable,
)
# When `past_key_values` is given and `return_dict` is set, attach the updated cache to the model output
if past_key_values is not None and return_dict:
outputs, past_key_values = outputs
outputs["past_key_values"] = unfreeze(past_key_values["cache"])
return outputs
# When `past_key_values` is given but `return_dict` is not set, splice the updated cache into the output tuple
elif past_key_values is not None and not return_dict:
outputs, past_key_values = outputs
outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
# Return the model outputs
return outputs
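The `mutable=["cache"]` handling above relies on a general Flax mechanism: when a variable collection is declared mutable, `Module.apply` returns the updated collection alongside the outputs. A toy, self-contained illustration of that mechanism (not GPT-2 code; the module and names are made up for the example):

```python
import jax
import jax.numpy as jnp
import flax.linen as nn

class Counter(nn.Module):
    @nn.compact
    def __call__(self, x):
        # A variable in the "cache" collection, like the attention key/value cache.
        count = self.variable("cache", "count", lambda: jnp.zeros((), jnp.int32))
        count.value = count.value + 1
        return x

variables = Counter().init(jax.random.PRNGKey(0), jnp.ones(3))
# With mutable=["cache"], apply() returns (outputs, updated_cache_collection).
y, updated = Counter().apply(variables, jnp.ones(3), mutable=["cache"])
print(updated["cache"]["count"])  # the cache collection comes back updated after every apply
```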
class FlaxGPT2BlockCollection(nn.Module):
config: GPT2Config
dtype: jnp.dtype = jnp.float32
# Set up the module: build the collection of hidden-layer blocks
def setup(self):
# Create one FlaxGPT2Block per layer, according to the configured number of hidden layers
self.blocks = [
FlaxGPT2Block(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
]
# Run the blocks over the input hidden states and return the outputs
def __call__(
self,
hidden_states,
attention_mask=None,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Initialize empty tuples to collect attentions, hidden states and cross-attentions when requested
all_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
# Iterate over the blocks and run the hidden states through each of them
for block in self.blocks:
# If hidden states are requested, record the current hidden states first
if output_hidden_states:
all_hidden_states += (hidden_states,)
# Call the current block and collect its outputs
layer_outputs = block(
hidden_states,
attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
)
# The first element of the block output becomes the new hidden states
hidden_states = layer_outputs[0]
# If attentions are requested, record the block's attention weights
if output_attentions:
all_attentions += (layer_outputs[1],)
# With encoder hidden states present, also record the block's cross-attentions
if encoder_hidden_states is not None:
all_cross_attentions += (layer_outputs[2],)
# Return the hidden states together with all hidden states, attentions and cross-attentions
# Note: this tuple may contain `None` values; `FlaxGPT2Module` will filter them out
outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
return outputs
class FlaxGPT2Module(nn.Module):
config: GPT2Config
dtype: jnp.dtype = jnp.float32
# Set up the module: token embeddings, position embeddings, dropout, the block collection and the final layer norm
def setup(self):
# The embedding dimension equals the hidden size
self.embed_dim = self.config.hidden_size
# Token embedding and position embedding layers
self.wte = nn.Embed(
self.config.vocab_size,
self.embed_dim,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
self.wpe = nn.Embed(
self.config.max_position_embeddings,
self.embed_dim,
embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
dtype=self.dtype,
)
# Embedding dropout
self.dropout = nn.Dropout(rate=self.config.embd_pdrop)
# Collection of transformer blocks
self.h = FlaxGPT2BlockCollection(self.config, dtype=self.dtype)
# Final layer normalization
self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
# The call method takes the following arguments:
# - input_ids: sequence of input token ids
# - attention_mask: attention mask
# - position_ids: position ids
# - encoder_hidden_states: hidden states of an encoder (optional)
# - encoder_attention_mask: attention mask of the encoder (optional)
# - deterministic: whether to run deterministically (no dropout), defaults to True
# - init_cache: whether to initialize the cache, defaults to False
# - output_attentions: whether to return attention weights, defaults to False
# - output_hidden_states: whether to return all hidden states, defaults to False
# - return_dict: whether to return the result as a dict, defaults to True
def __call__(
self,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic=True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Look up the token embeddings for the input ids
input_embeds = self.wte(input_ids.astype("i4"))
# Look up the position embeddings for the position ids
position_embeds = self.wpe(position_ids.astype("i4"))
# The initial hidden states are the sum of token and position embeddings
hidden_states = input_embeds + position_embeds
# Apply dropout to the hidden states; `deterministic` disables it at inference time
hidden_states = self.dropout(hidden_states, deterministic=deterministic)
# Run the hidden states through the block collection `h`
outputs = self.h(
hidden_states,
attention_mask,
encoder_hidden_states,
encoder_attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Take the new hidden states from the block outputs
hidden_states = outputs[0]
# Apply the final LayerNorm
hidden_states = self.ln_f(hidden_states)
# If all hidden states are requested, append the final hidden states to the collected ones
if output_hidden_states:
all_hidden_states = outputs[1] + (hidden_states,)
outputs = (hidden_states, all_hidden_states) + outputs[2:]
else:
outputs = (hidden_states,) + outputs[1:]
# Without `return_dict`, return all non-None results as a tuple
if not return_dict:
return tuple(v for v in outputs if v is not None)
# Otherwise return a structured output with the last hidden state, all hidden states, attentions and cross-attentions
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=outputs[1],
attentions=outputs[2],
cross_attentions=outputs[3],
)
# Decorator that adds a docstring describing the bare GPT2 transformer outputting raw hidden states without any specific head on top
@add_start_docstrings(
"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
)
# FlaxGPT2Model inherits from FlaxGPT2PreTrainedModel
class FlaxGPT2Model(FlaxGPT2PreTrainedModel):
# The module class is FlaxGPT2Module
module_class = FlaxGPT2Module
# Attach an example call docstring to FlaxGPT2Model
append_call_sample_docstring(
FlaxGPT2Model,
_CHECKPOINT_FOR_DOC, # checkpoint used in the example docstring
FlaxBaseModelOutputWithPastAndCrossAttentions, # output class shown in the example
_CONFIG_FOR_DOC, # config referenced in the example docstring
)
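A short usage sketch for the bare model (the public GPT-2 checkpoint is used only as an example):

```python
from transformers import AutoTokenizer, FlaxGPT2Model

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = FlaxGPT2Model.from_pretrained("openai-community/gpt2")

inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, hidden_size)
```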
# FlaxGPT2LMHeadModule, a plain nn.Module
class FlaxGPT2LMHeadModule(nn.Module):
config: GPT2Config # GPT2Config configuration object
dtype: jnp.dtype = jnp.float32 # computation dtype, defaults to float32
def setup(self):
# The transformer backbone is a FlaxGPT2Module
self.transformer = FlaxGPT2Module(self.config, dtype=self.dtype)
# The lm_head is a Dense layer projecting to the vocabulary size, without bias, using the configured dtype and initializer range
self.lm_head = nn.Dense(
self.config.vocab_size,
use_bias=False,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
)
def __call__(
self,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Run the transformer backbone and get its outputs
outputs = self.transformer(
input_ids,
attention_mask,
position_ids,
encoder_hidden_states,
encoder_attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0] # last hidden states of the transformer
if self.config.tie_word_embeddings:
# When word embeddings are tied, reuse the transposed token-embedding matrix from the transformer's parameters
shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
# Apply the lm_head with the shared kernel
lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
else:
# Otherwise use the lm_head's own parameters
lm_logits = self.lm_head(hidden_states)
if not return_dict:
# Without `return_dict`, return a plain tuple
return (lm_logits,) + outputs[1:]
# Return a FlaxCausalLMOutputWithCrossAttentions with the logits, hidden states, attentions and cross-attentions
return FlaxCausalLMOutputWithCrossAttentions(
logits=lm_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
# Decorator that adds a docstring describing the GPT2 transformer with a language modeling head on top (linear layer with weights tied to the input embeddings)
@add_start_docstrings(
"""
The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT2_START_DOCSTRING,
)
# 定义 FlaxGPT2LMHeadModel 类,继承自 FlaxGPT2PreTrainedModel
class FlaxGPT2LMHeadModel(FlaxGPT2PreTrainedModel):
module_class = FlaxGPT2LMHeadModule # the module class is FlaxGPT2LMHeadModule
# Prepare the inputs used during generation: initialize the cache and build the extended attention mask and position ids
def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
# Read the batch size and sequence length, then initialize the cache
batch_size, seq_length = input_ids.shape
# Create the key/value cache via `init_cache`
past_key_values = self.init_cache(batch_size, max_length)
# Build an extended attention mask of length `max_length`, initialized to all ones
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
# If an attention mask was provided, use it
if attention_mask is not None:
# Derive the position ids from the cumulative sum of the mask, minus one
position_ids = attention_mask.cumsum(axis=-1) - 1
# Copy the provided mask into the extended mask with a dynamic update slice
extended_attention_mask = lax.dynamic_update_slice(
extended_attention_mask, attention_mask.astype("i4"), (0, 0)
)
else:
# Without an attention mask, simply broadcast position ids over the input sequence length
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
# Return the prepared inputs: cache, extended attention mask and position ids
return {
"past_key_values": past_key_values,
"attention_mask": extended_attention_mask,
"position_ids": position_ids,
}
# Update the generation inputs after each step: refresh the cache and advance the position ids
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# Store the updated cache and shift the position ids forward by one
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
# Return the updated keyword arguments
return model_kwargs
# Attach an example call docstring to the class
append_call_sample_docstring(
# target class
FlaxGPT2LMHeadModel,
# checkpoint used in the example docstring
_CHECKPOINT_FOR_DOC,
# causal LM output class (with cross-attentions) shown in the example
FlaxCausalLMOutputWithCrossAttentions,
# config referenced in the example docstring
_CONFIG_FOR_DOC,
)
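Putting the pieces together, `generate()` is what drives `prepare_inputs_for_generation` and `update_inputs_for_generation` above. A greedy-decoding sketch (checkpoint and prompt are illustrative):

```python
from transformers import AutoTokenizer, FlaxGPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = FlaxGPT2LMHeadModel.from_pretrained("openai-community/gpt2")

inputs = tokenizer("Once upon a time", return_tensors="np")
out = model.generate(
    inputs["input_ids"], max_length=20, do_sample=False, pad_token_id=tokenizer.eos_token_id
)
print(tokenizer.decode(out.sequences[0], skip_special_tokens=True))
```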
.\models\gpt2\modeling_gpt2.py
"""PyTorch OpenAI GPT-2 model."""
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.cuda.amp import autocast
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
QuestionAnsweringModelOutput,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.model_parallel_utils import assert_device_map, get_device_map
from .configuration_gpt2 import GPT2Config
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
_CONFIG_FOR_DOC = "GPT2Config"
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-community/gpt2",
"openai-community/gpt2-medium",
"openai-community/gpt2-large",
"openai-community/gpt2-xl",
"distilbert/distilgpt2",
]
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
"""Load tf checkpoints in a pytorch model"""
try:
import re
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(gpt2_checkpoint_path)
logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array.squeeze())
for name, array in zip(names, arrays):
name = name[6:]
name = name.split("/")
pointer = model
for m_name in name:
if re.fullmatch(r"[A-Za-z]+\d+", m_name):
scope_names = re.split(r"(\d+)", m_name)
else:
scope_names = [m_name]
if scope_names[0] == "w" or scope_names[0] == "g":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "b":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "wpe" or scope_names[0] == "wte":
pointer = getattr(pointer, scope_names[0])
pointer = getattr(pointer, "weight")
else:
pointer = getattr(pointer, scope_names[0])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
try:
if pointer.shape != array.shape:
raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except ValueError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model
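A hypothetical usage sketch for this converter; the checkpoint path below is a placeholder, and in practice the official conversion script wraps this call:

```python
from transformers import GPT2Config, GPT2Model
from transformers.models.gpt2.modeling_gpt2 import load_tf_weights_in_gpt2

config = GPT2Config()
model = GPT2Model(config)
# "/path/to/gpt2/tf_checkpoint" is a placeholder for an original TF checkpoint directory.
model = load_tf_weights_in_gpt2(model, config, "/path/to/gpt2/tf_checkpoint")
```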
class GPT2Attention(nn.Module):
def __init__(self, config, is_cross_attention=False, layer_idx=None):
super().__init__()
max_positions = config.max_position_embeddings
self.register_buffer(
"bias",
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
persistent=False,
)
self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
self.split_size = self.embed_dim
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale_attn_weights = config.scale_attn_weights
self.is_cross_attention = is_cross_attention
self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
self.layer_idx = layer_idx
self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
if self.is_cross_attention:
self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
else:
self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
self.num_heads = self.num_heads - len(heads)
self.pruned_heads = self.pruned_heads.union(heads)
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
attn_weights = torch.matmul(query, key.transpose(-1, -2))
if self.scale_attn_weights:
attn_weights = attn_weights / torch.full(
[], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
)
if self.scale_attn_by_inverse_layer_idx:
attn_weights = attn_weights / float(self.layer_idx + 1)
if not self.is_cross_attention:
query_length, key_length = query.size(-2), key.size(-2)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
mask_value = torch.finfo(attn_weights.dtype).min
mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
attn_weights = attn_weights.type(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
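The core of `_attn` is a scaled dot-product with a causal mask that sets future positions to the dtype minimum before the softmax. A stand-alone numeric illustration with toy shapes (not the module itself):

```python
import torch

torch.manual_seed(0)
q = torch.randn(1, 1, 4, 8)  # (batch, heads, query_len, head_dim)
k = torch.randn(1, 1, 4, 8)
v = torch.randn(1, 1, 4, 8)

w = torch.matmul(q, k.transpose(-1, -2)) / (v.size(-1) ** 0.5)
causal = torch.tril(torch.ones(4, 4, dtype=torch.bool))
w = w.masked_fill(~causal, torch.finfo(w.dtype).min)
probs = w.softmax(dim=-1)
print(probs[0, 0])  # upper triangle is ~0: a token never attends to future positions
out = probs @ v
```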
def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
bsz, num_heads, q_seq_len, dk = query.size()
_, _, k_seq_len, _ = key.size()
attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
scale_factor = 1.0
if self.scale_attn_weights:
scale_factor /= float(value.size(-1)) ** 0.5
if self.scale_attn_by_inverse_layer_idx:
scale_factor /= float(self.layer_idx + 1)
with autocast(enabled=False):
q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
if not self.is_cross_attention:
query_length, key_length = query.size(-2), key.size(-2)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
mask_value = torch.finfo(attn_weights.dtype).min
mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
attn_weights = torch.where(causal_mask, attn_weights, mask_value)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
if attn_weights.dtype != torch.float32:
raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
attn_weights = attn_weights.type(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
def _split_heads(self, tensor, num_heads, attn_head_size):
"""
Splits hidden_size dim into attn_head_size and num_heads
"""
new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
tensor = tensor.view(new_shape)
return tensor.permute(0, 2, 1, 3)
def _merge_heads(self, tensor, num_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden_size
"""
tensor = tensor.permute(0, 2, 1, 3).contiguous()
new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
return tensor.view(new_shape)
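A quick shape walk-through of the `_split_heads` / `_merge_heads` round trip, assuming the gpt2-small layout of 12 heads of size 64:

```python
import torch

x = torch.randn(2, 5, 768)                        # (batch, seq_len, hidden)
split = x.view(2, 5, 12, 64).permute(0, 2, 1, 3)  # (batch, heads, seq_len, head_dim)
merged = split.permute(0, 2, 1, 3).contiguous().view(2, 5, 768)
print(torch.equal(x, merged))  # True: the two operations are inverses
```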
def forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
layer_past: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
if encoder_hidden_states is not None:
if not hasattr(self, "q_attn"):
raise ValueError(
"If class is used as cross attention, the weights `q_attn` have to be defined. "
"Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
)
query = self.q_attn(hidden_states)
key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
attention_mask = encoder_attention_mask
else:
query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
query = self._split_heads(query, self.num_heads, self.head_dim)
key = self._split_heads(key, self.num_heads, self.head_dim)
value = self._split_heads(value, self.num_heads, self.head_dim)
if layer_past is not None:
past_key, past_value = layer_past
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
if use_cache is True:
present = (key, value)
else:
present = None
if self.reorder_and_upcast_attn:
attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
else:
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
attn_output = self.c_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs
class GPT2MLP(nn.Module):
def __init__(self, intermediate_size, config):
super().__init__()
embed_dim = config.hidden_size
self.c_fc = Conv1D(intermediate_size, embed_dim)
self.c_proj = Conv1D(embed_dim, intermediate_size)
self.act = ACT2FN[config.activation_function]
self.dropout = nn.Dropout(config.resid_pdrop)
def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class GPT2Block(nn.Module):
def __init__(self, config, layer_idx=None):
super().__init__()
hidden_size = config.hidden_size
inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPT2Attention(config, layer_idx=layer_idx)
self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
if config.add_cross_attention:
self.crossattention = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx)
self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.mlp = GPT2MLP(inner_dim, config)
def forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
layer_past: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs = self.attn(
hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
hidden_states = attn_output + residual
if encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
"cross-attention layers by setting `config.add_cross_attention=True`"
)
residual = hidden_states
hidden_states = self.ln_cross_attn(hidden_states)
cross_attn_outputs = self.crossattention(
hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
)
attn_output = cross_attn_outputs[0]
hidden_states = residual + attn_output
outputs = outputs + cross_attn_outputs[2:]
residual = hidden_states
hidden_states = self.ln_2(hidden_states)
feed_forward_hidden_states = self.mlp(hidden_states)
hidden_states = residual + feed_forward_hidden_states
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
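GPT2Block applies the pre-LayerNorm residual pattern twice, once around attention and once around the MLP. A minimal sketch of that pattern with illustrative layer names (not the block's own attributes):

```python
import torch
from torch import nn

ln = nn.LayerNorm(768)
sublayer = nn.Linear(768, 768)  # stands in for the attention or MLP sublayer

x = torch.randn(2, 5, 768)
x = x + sublayer(ln(x))  # normalize first, then add the residual
```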
class GPT2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPT2Config
load_tf_weights = load_tf_weights_in_gpt2
base_model_prefix = "transformer"
is_parallelizable = True
supports_gradient_checkpointing = True
_no_split_modules = ["GPT2Block"]
_skip_keys_device_placement = "past_key_values"
def __init__(self, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, (nn.Linear, Conv1D)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
for name, p in module.named_parameters():
if name == "c_proj.weight":
p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
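The loop above implements the scaled initialization from the GPT-2 paper: every residual projection `c_proj.weight` is drawn with a standard deviation divided by sqrt(2 * n_layer). A quick empirical check on a randomly initialized model (nothing is downloaded):

```python
import math
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(n_layer=12, initializer_range=0.02)
model = GPT2LMHeadModel(config)
w = model.transformer.h[0].attn.c_proj.weight
print(w.std().item(), 0.02 / math.sqrt(2 * 12))  # empirical std is close to the scaled target
```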
@dataclass
class GPT2DoubleHeadsModelOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
"""
loss: Optional[torch.FloatTensor] = None
mc_loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mc_logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
GPT2_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT2_INPUTS_DOCSTRING = r"""
"""
PARALLELIZE_DOCSTRING = r"""
This is an experimental feature and is subject to change at a moment's notice.
Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
it will evenly distribute blocks across all devices.
Args:
device_map (`Dict[int, list]`, optional, defaults to None):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
following number of attention modules:
- openai-community/gpt2: 12
- openai-community/gpt2-medium: 24
- openai-community/gpt2-large: 36
- openai-community/gpt2-xl: 48
Example:
```
# Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl")
device_map = {
0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
}
model.parallelize(device_map)
```
"""
DEPARALLELIZE_DOCSTRING = r"""
Moves the model to cpu from a model parallel state.
Example:
```
# On a 4 GPU machine with openai-community/gpt2-large:
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
device_map = {
0: [0, 1, 2, 3, 4, 5, 6, 7],
1: [8, 9, 10, 11, 12, 13, 14, 15],
2: [16, 17, 18, 19, 20, 21, 22, 23],
3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
}
model.parallelize(device_map) # Splits the model across several devices
# Put the model back on CPU (leaving model-parallel mode) and free GPU memory by calling torch.cuda.empty_cache()
model.deparallelize()
"""
@add_start_docstrings(
"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
)
class GPT2Model(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.embed_dim = config.hidden_size
self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
self.drop = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)])
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
self.model_parallel = False
self.device_map = None
self.gradient_checkpointing = False
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
warnings.warn(
"`GPT2Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
" model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
" `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1,"
" ...}",
FutureWarning,
)
self.device_map = (
get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
)
assert_device_map(self.device_map, len(self.h))
self.model_parallel = True
self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
self.last_device = "cuda:" + str(max(self.device_map.keys()))
self.wte = self.wte.to(self.first_device)
self.wpe = self.wpe.to(self.first_device)
for k, v in self.device_map.items():
for block in v:
cuda_device = "cuda:" + str(k)
self.h[block] = self.h[block].to(cuda_device)
self.ln_f = self.ln_f.to(self.last_device)
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.model_parallel = False
self.device_map = None
self.first_device = "cpu"
self.last_device = "cpu"
self.wte = self.wte.to("cpu")
self.wpe = self.wpe.to("cpu")
for index in range(len(self.h)):
self.h[index] = self.h[index].to("cpu")
self.ln_f = self.ln_f.to("cpu")
torch.cuda.empty_cache()
def get_input_embeddings(self):
return self.wte
def set_input_embeddings(self, new_embeddings):
self.wte = new_embeddings
def _prune_heads(self, heads_to_prune):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
...
@add_start_docstrings(
"""
The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
GPT2_START_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = GPT2Model(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.model_parallel = False
self.device_map = None
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
warnings.warn(
"`GPT2LMHeadModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
" your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
" `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':"
" 0, 'transformer.h.1': 1, ...}",
FutureWarning,
)
self.device_map = (
get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
assert_device_map(self.device_map, len(self.transformer.h))
self.transformer.parallelize(self.device_map)
self.lm_head = self.lm_head.to(self.transformer.first_device)
self.model_parallel = True
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.transformer.deparallelize()
self.transformer = self.transformer.to("cpu")
self.lm_head = self.lm_head.to("cpu")
self.model_parallel = False
torch.cuda.empty_cache()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
else:
position_ids = None
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
)
return model_inputs
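During generation, these prepared inputs let the model consume only the newly generated token at each step. A sketch of doing the same loop by hand with `past_key_values` (checkpoint and prompt are illustrative):

```python
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").eval()

input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
past = None
with torch.no_grad():
    for _ in range(5):
        # After the first step, only the last token is fed; the cache carries the rest.
        out = model(input_ids if past is None else input_ids[:, -1:], past_key_values=past, use_cache=True)
        past = out.past_key_values
        next_token = out.logits[:, -1].argmax(-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_token], dim=-1)
print(tokenizer.decode(input_ids[0]))
```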
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
if self.model_parallel:
torch.cuda.set_device(self.transformer.first_device)
hidden_states = hidden_states.to(self.lm_head.weight.device)
lm_logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
labels = labels.to(lm_logits.device)
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
cross_attentions=transformer_outputs.cross_attentions,
)
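Because the labels are shifted inside `forward`, passing `labels=input_ids` is enough to obtain the standard next-token cross-entropy. A short sketch:

```python
from transformers import AutoTokenizer, GPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

enc = tokenizer("GPT-2 is a causal language model.", return_tensors="pt")
out = model(**enc, labels=enc["input_ids"])
print(out.loss)          # scalar cross-entropy over the shifted positions
print(out.logits.shape)  # (1, seq_len, vocab_size)
```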
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
@add_start_docstrings(
"""
The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
input embeddings, the classification head takes as input the input of a specified classification token index in the
input sequence).
""",
GPT2_START_DOCSTRING,
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
config.num_labels = 1
self.transformer = GPT2Model(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.multiple_choice_head = SequenceSummary(config)
self.model_parallel = False
self.device_map = None
self.post_init()
@add_start_docstrings(PARALLELIZE_DOCSTRING)
def parallelize(self, device_map=None):
warnings.warn(
"`GPT2DoubleHeadsModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should"
" load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your"
" own `device_map` but it needs to be a dictionary module_name to device, so for instance"
" {'transformer.h.0': 0, 'transformer.h.1': 1, ...}",
FutureWarning,
)
self.device_map = (
get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
assert_device_map(self.device_map, len(self.transformer.h))
self.transformer.parallelize(self.device_map)
self.lm_head = self.lm_head.to(self.transformer.first_device)
self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device)
self.model_parallel = True
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
def deparallelize(self):
warnings.warn(
"Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.transformer.deparallelize()
self.transformer = self.transformer.to("cpu")
self.lm_head = self.lm_head.to("cpu")
self.multiple_choice_head = self.multiple_choice_head.to("cpu")
self.model_parallel = False
torch.cuda.empty_cache()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
else:
position_ids = None
return {
"input_ids": input_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
mc_token_ids: Optional[torch.LongTensor] = None,
labels: Optional[torch.LongTensor] = None,
mc_labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
pass
@staticmethod
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
"""
The GPT2 Model transformer with a sequence classification head on top (linear layer).
[`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models (e.g. GPT-1) do.
Since it does classification on the last token, it requires knowing the position of the last token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (takes the last value in each row of the batch).
"""
@add_start_docstrings(
"""
GPT2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
""",
GPT2_START_DOCSTRING,
)
class GPT2ForTokenClassification(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPT2Model(config)
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
classifier_dropout = config.hidden_dropout
else:
classifier_dropout = 0.1
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.model_parallel = False
self.device_map = None
self.post_init()
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint="brad1141/gpt2-finetuned-comp2",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_loss=0.25,
expected_output=[
"Lead",
"Lead",
"Lead",
"Position",
"Lead",
"Lead",
"Lead",
"Lead",
"Lead",
"Lead",
"Lead",
"Lead",
],
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
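A usage sketch for the token-classification head; the checkpoint is the one referenced in the docstring example above, and any GPT-2 model fine-tuned for token classification works the same way:

```python
import torch
from transformers import AutoTokenizer, GPT2ForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("brad1141/gpt2-finetuned-comp2")
model = GPT2ForTokenClassification.from_pretrained("brad1141/gpt2-finetuned-comp2")

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
predicted = logits.argmax(-1)
labels = [model.config.id2label[i] for i in predicted[0].tolist()]  # one label per token
```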
@add_start_docstrings(
"""
The GPT-2 Model transformer with a span classification head on top for extractive question-answering tasks like
SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
GPT2_START_DOCSTRING,
)
class GPT2ForQuestionAnswering(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPT2Model(config)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
self.model_parallel = False
self.device_map = None
self.post_init()
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
real_checkpoint=_CHECKPOINT_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1).to(start_logits.device)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1).to(end_logits.device)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
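A span-extraction sketch; note that the base GPT-2 checkpoint has a randomly initialized QA head, so this only demonstrates the input and output shapes, not a meaningful answer:

```python
import torch
from transformers import AutoTokenizer, GPT2ForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = GPT2ForQuestionAnswering.from_pretrained("openai-community/gpt2")

question, context = "What is GPT-2?", "GPT-2 is a causal transformer language model."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)
start = out.start_logits.argmax(-1).item()
end = out.end_logits.argmax(-1).item()
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))
```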
.\models\gpt2\modeling_tf_gpt2.py
""" TF 2.0 OpenAI GPT-2 model."""
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
TFSequenceClassifierOutputWithPast,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFConv1D,
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
TFSequenceSummary,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_gpt2 import GPT2Config
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
_CONFIG_FOR_DOC = "GPT2Config"
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-community/gpt2",
"openai-community/gpt2-medium",
"openai-community/gpt2-large",
"openai-community/gpt2-xl",
"distilbert/distilgpt2",
]
class TFAttention(keras.layers.Layer):
def __init__(self, nx, config, scale=False, is_cross_attention=False, **kwargs):
super().__init__(**kwargs)
n_state = nx
assert n_state % config.n_head == 0
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.output_attentions = config.output_attentions
self.is_cross_attention = is_cross_attention
if self.is_cross_attention:
self.c_attn = TFConv1D(n_state * 2, nx, initializer_range=config.initializer_range, name="c_attn")
self.q_attn = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="q_attn")
else:
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = keras.layers.Dropout(config.resid_pdrop)
self.pruned_heads = set()
self.embed_dim = n_state
def prune_heads(self, heads):
pass
@staticmethod
def causal_attention_mask(nd, ns, dtype):
"""
1's in the lower triangle, counting from the lower-right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns - nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:, None]
j = tf.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)
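A quick check of `causal_attention_mask` with nd=3 query positions attending over ns=5 key positions (for example, 2 cached keys plus 3 new ones), showing the band anchored at the lower-right corner:

```python
import tensorflow as tf
from transformers.models.gpt2.modeling_tf_gpt2 import TFAttention

mask = TFAttention.causal_attention_mask(nd=3, ns=5, dtype=tf.float32)
print(mask.numpy())
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 0.]
#  [1. 1. 1. 1. 1.]]
```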
def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False):
w = tf.matmul(q, k, transpose_b=True)
if self.scale:
dk = tf.cast(shape_list(k)[-1], dtype=w.dtype)
w = w / tf.math.sqrt(dk)
if not self.is_cross_attention:
_, _, nd, ns = shape_list(w)
b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
b = tf.reshape(b, [1, 1, nd, ns])
w = w * b - 1e4 * (1 - b)
if attention_mask is not None:
attention_mask = tf.cast(attention_mask, dtype=w.dtype)
w = w + attention_mask
w = stable_softmax(w, axis=-1)
w = self.attn_dropout(w, training=training)
if head_mask is not None:
w = w * head_mask
outputs = [tf.matmul(w, v)]
if output_attentions:
outputs.append(w)
return outputs
def merge_heads(self, x):
x = tf.transpose(x, [0, 2, 1, 3])
x_shape = shape_list(x)
new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
return tf.reshape(x, new_x_shape)
def split_heads(self, x):
x_shape = shape_list(x)
new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3))
def call(
self,
x,
layer_past,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
use_cache,
output_attentions,
training=False,
):
if encoder_hidden_states is not None:
if not hasattr(self, "q_attn"):
raise ValueError(
"If class is used as cross attention, the weights `q_attn` have to be defined. "
"Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
)
query = self.q_attn(x)
kv_out = self.c_attn(encoder_hidden_states)
key, value = tf.split(kv_out, 2, axis=2)
attention_mask = encoder_attention_mask
else:
x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2)
query = self.split_heads(query)
key = self.split_heads(key)
value = self.split_heads(value)
if layer_past is not None:
past_key, past_value = tf.unstack(layer_past, axis=0, num=2)
key = tf.concat([past_key, key], axis=-2)
value = tf.concat([past_value, value], axis=-2)
if use_cache:
present = tf.stack([key, value], axis=0)
else:
present = (None,)
attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
a = attn_outputs[0]
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a, training=training)
outputs = [a, present] + attn_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if self.is_cross_attention:
c_attn_shape = 2 * self.embed_dim
else:
c_attn_shape = 3 * self.embed_dim
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.embed_dim])
if getattr(self, "c_attn", None) is not None:
with tf.name_scope(self.c_attn.name):
self.c_attn.build([None, None, c_attn_shape])
if getattr(self, "q_attn", None) is not None:
with tf.name_scope(self.q_attn.name):
self.q_attn.build([None, None, self.embed_dim])
class TFMLP(keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super().__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = get_tf_activation(config.activation_function)
self.dropout = keras.layers.Dropout(config.resid_pdrop)
self.intermediate_size = n_state
self.embed_dim = nx
def call(self, x, training=False):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
h2 = self.dropout(h2, training=training)
return h2
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "c_fc", None) is not None:
with tf.name_scope(self.c_fc.name):
self.c_fc.build([None, None, self.intermediate_size])
if getattr(self, "c_proj", None) is not None:
with tf.name_scope(self.c_proj.name):
self.c_proj.build([None, None, self.embed_dim])
class TFBlock(keras.layers.Layer):
def __init__(self, config, scale=False, **kwargs):
super().__init__(**kwargs)
nx = config.n_embd
inner_dim = config.n_inner if config.n_inner is not None else 4 * nx
self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.attn = TFAttention(nx, config, scale, name="attn")
self.ln_2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
if config.add_cross_attention:
self.crossattention = TFAttention(nx, config, scale, name="crossattention", is_cross_attention=True)
self.ln_cross_attn = keras.layers.LayerNormalization(
epsilon=config.layer_norm_epsilon, name="ln_cross_attn"
)
self.mlp = TFMLP(inner_dim, config, name="mlp")
self.hidden_size = config.hidden_size
def call(
self,
x,
layer_past,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
use_cache,
output_attentions,
training=False,
):
a = self.ln_1(x)
output_attn = self.attn(
a,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
use_cache=use_cache,
output_attentions=output_attentions,
training=training,
)
a = output_attn[0]
outputs = output_attn[1:]
x = x + a
if encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
"cross-attention layers by setting `config.add_cross_attention=True`"
)
ca = self.ln_cross_attn(x)
output_cross_attn = self.crossattention(
ca,
layer_past=None,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=False,
output_attentions=output_attentions,
training=training,
)
ca = output_cross_attn[0]
x = x + ca
outputs = outputs + output_cross_attn[2:]
m = self.ln_2(x)
m = self.mlp(m, training=training)
x = x + m
outputs = [x] + outputs
return outputs
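TFBlock normalizes before each sub-layer and adds the sub-layer output back onto the residual stream (pre-layer-norm ordering). A minimal sketch of that control flow with Dense layers standing in for attention and the MLP (hypothetical stand-ins, illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

ln_1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
ln_2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
attn_fn = tf.keras.layers.Dense(8)   # stand-in for the self-attention sub-layer
mlp_fn = tf.keras.layers.Dense(8)    # stand-in for the MLP sub-layer

x = tf.random.normal((1, 5, 8))      # (batch, seq_len, hidden)
x = x + attn_fn(ln_1(x))             # residual around the normed attention sub-layer
x = x + mlp_fn(ln_2(x))              # residual around the normed MLP sub-layer
print(x.shape)                       # (1, 5, 8)
# --- end sketch ---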
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "ln_1", None) is not None:
with tf.name_scope(self.ln_1.name):
self.ln_1.build([None, None, self.hidden_size])
if getattr(self, "attn", None) is not None:
with tf.name_scope(self.attn.name):
self.attn.build(None)
if getattr(self, "ln_2", None) is not None:
with tf.name_scope(self.ln_2.name):
self.ln_2.build([None, None, self.hidden_size])
if getattr(self, "mlp", None) is not None:
with tf.name_scope(self.mlp.name):
self.mlp.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
if getattr(self, "ln_cross_attn", None) is not None:
with tf.name_scope(self.ln_cross_attn.name):
self.ln_cross_attn.build([None, None, self.hidden_size])
@keras_serializable
class TFGPT2MainLayer(keras.layers.Layer):
config_class = GPT2Config
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
self.config = config
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.use_cache = config.use_cache
self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer
self.n_embd = config.n_embd
self.n_positions = config.n_positions
self.initializer_range = config.initializer_range
self.wte = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="wte",
)
self.wpe = keras.layers.Embedding(
input_dim=config.n_positions,
output_dim=config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name="wpe",
)
self.drop = keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]
self.ln_f = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
self.embed_dim = config.hidden_size
def get_input_embeddings(self):
return self.wte
def set_input_embeddings(self, new_embeddings):
self.wte = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "wte", None) is not None:
with tf.name_scope(self.wte.name):
self.wte.build(None)
if getattr(self, "wpe", None) is not None:
with tf.name_scope(self.wpe.name):
self.wpe.build(None)
if getattr(self, "ln_f", None) is not None:
with tf.name_scope(self.ln_f.name):
self.ln_f.build([None, None, self.embed_dim])
if getattr(self, "h", None) is not None:
for layer in self.h:
with tf.name_scope(layer.name):
layer.build(None)
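The call body of TFGPT2MainLayer is elided in this excerpt; conceptually, GPT-2 sums the `wte` token embeddings and the `wpe` position embeddings before feeding the blocks. A minimal sketch of that lookup with plain Keras layers (an assumption about the elided body, illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

vocab_size, n_positions, hidden = 50257, 1024, 8
wte = tf.keras.layers.Embedding(vocab_size, hidden)   # token embeddings
wpe = tf.keras.layers.Embedding(n_positions, hidden)  # learned position embeddings

input_ids = tf.constant([[10, 20, 30]])
position_ids = tf.range(tf.shape(input_ids)[-1])[tf.newaxis, :]   # [[0, 1, 2]]
hidden_states = wte(input_ids) + wpe(position_ids)                # (1, 3, hidden)
print(hidden_states.shape)
# --- end sketch ---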
class TFGPT2PreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = GPT2Config
base_model_prefix = "transformer"
_keys_to_ignore_on_load_unexpected = [r"h.\d+.attn.bias", r"h.\d+.crossattention.bias"]
@property
def input_signature(self):
return {
"input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
}
@dataclass
class TFGPT2DoubleHeadsModelOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
logits (`tf.Tensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
sequence_length, embed_size_per_head)`).
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
mc_logits: tf.Tensor = None
past_key_values: List[tf.Tensor] | None = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
GPT2_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Parameters:
config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
GPT2_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
)
class TFGPT2Model(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
# The decorator above adds a code-sample docstring, specifying the documentation checkpoint, output type, and config class.
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
r"""
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have
their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*, defaults to `True`):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past`). Set to `False` during training, `True` during generation
"""
outputs = self.transformer(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
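To illustrate the `past_key_values`/`use_cache` behaviour documented in the call above, a small sketch of two-step decoding with TFGPT2Model (assuming the "gpt2" checkpoint is available; illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2Model.from_pretrained("gpt2")

# First pass over the full prompt, asking for the cache
prompt = tokenizer("The quick brown fox", return_tensors="tf")
out = model(prompt["input_ids"], use_cache=True)
past = out.past_key_values

# Second pass: only the newest token is fed, the cache supplies the earlier positions
next_token = tf.constant([[tokenizer.encode(" jumps")[0]]])
out2 = model(next_token, past_key_values=past, use_cache=True)
print(out2.last_hidden_state.shape)  # (1, 1, hidden_size)
# --- end sketch ---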
"""
The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
"""
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer")
def get_output_embeddings(self):
return self.get_input_embeddings()
def set_output_embeddings(self, value):
self.set_input_embeddings(value)
def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
inputs = tf.expand_dims(inputs[:, -1], -1)
if token_type_ids is not None:
token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
position_ids = kwargs.get("position_ids", None)
attention_mask = kwargs.get("attention_mask", None)
if attention_mask is not None and position_ids is None:
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
if past_key_values:
position_ids = tf.expand_dims(position_ids[:, -1], -1)
return {
"input_ids": inputs,
"attention_mask": attention_mask,
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": use_cache,
"token_type_ids": token_type_ids,
}
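The exclusive cumulative sum above turns an attention mask into 0-based position ids that skip left padding; when a cache is present, only the last position is kept. A small numeric sketch (illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

attention_mask = tf.constant([[0, 0, 1, 1, 1]])                          # two left-padding tokens
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)   # [[0, 0, 0, 1, 2]]
print(position_ids.numpy())

# With past_key_values, only the position of the newest token is used
last_position = tf.expand_dims(position_ids[:, -1], -1)                  # [[2]]
print(last_position.numpy())
# --- end sketch ---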
@unpack_inputs
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFCausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
"""
通过在顶部添加多选分类头来扩展 GPT2 模型变换器,例如用于 RocStories/SWAG 任务。这两个头部都是线性层。语言建模头部将其权重绑定到输入嵌入,分类头部将输入分类令牌索引的输入序列。
""",
GPT2_START_DOCSTRING,
)
class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFGPT2MainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="multiple_choice_head"
)
@unpack_inputs
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
mc_token_ids: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
):
@property
def input_signature(self):
return {
"input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"),
"attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"),
"mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="mc_token_ids"),
}
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
if getattr(self, "multiple_choice_head", None) is not None:
with tf.name_scope(self.multiple_choice_head.name):
self.multiple_choice_head.build(None)
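The input signature above is rank 3 because the double-heads model scores `num_choices` candidate sequences per example, and `mc_token_ids` points at the classification token inside each candidate. A shape-only sketch of such inputs (random ids, illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = tf.random.uniform((batch_size, num_choices, seq_len), maxval=50257, dtype=tf.int32)
attention_mask = tf.ones((batch_size, num_choices, seq_len), dtype=tf.int32)
mc_token_ids = tf.fill((batch_size, num_choices), seq_len - 1)  # classify on the last token of each choice
# Fed to TFGPT2DoubleHeadsModel, these would yield lm logits of shape
# (batch_size, num_choices, seq_len, vocab_size) and mc_logits of shape (batch_size, num_choices).
# --- end sketch ---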
@add_start_docstrings(
"""
The GPT2 Model transformer with a sequence classification head on top (linear layer).
[`TFGPT2ForSequenceClassification`] uses the last token of the sequence in order to do the classification. Since it
does classification on the last token, it needs to know the position of that token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
GPT2_START_DOCSTRING,
)
# TFGPT2ForSequenceClassification inherits from TFGPT2PreTrainedModel and TFSequenceClassificationLoss
class TFGPT2ForSequenceClassification(TFGPT2PreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
# Initialize the parent class
super().__init__(config, *inputs, **kwargs)
# Number of classification labels
self.num_labels = config.num_labels
# Dense layer "score" that produces the classification logits
self.score = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="score",
use_bias=False,
)
# GPT-2 main layer that encodes the input sequence
self.transformer = TFGPT2MainLayer(config, name="transformer")
# Keep a reference to the config
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint="microsoft/DialogRPT-updown",
output_type=TFSequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass over the inputs
def call(
self,
input_ids: TFModelInputType | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutputWithPast, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
config.vocab_size - 1]`.
"""
# Run the transformer over the inputs
transformer_outputs = self.transformer(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# Hidden states of the last transformer layer
hidden_states = transformer_outputs[0]
# Project the hidden states to classification logits
logits = self.score(hidden_states)
# Shape of the logits
logits_shape = shape_list(logits)
# Logits gathered at the last non-padding position (filled in below)
in_logits = None
# No pad token defined: fall back to the last position of every row
if self.config.pad_token_id is None:
# Use the last position in each sequence
sequence_lengths = -1
else:
# With input_ids available, locate the last non-padding token
if input_ids is not None:
# Index of the last non-padding token in each row
sequence_lengths = (
tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
- 1
)
# Rows without any padding get the last position
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
# Gather the logits at those positions
in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
else:
# Only inputs_embeds were passed: fall back to the last position
sequence_lengths = -1
# Warn that padding tokens cannot be detected from inputs_embeds
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
# Classification loss (None unless labels are provided)
loss = None
# Compute the loss when labels are given
if labels is not None:
# Batch sizes > 1 require a pad token in order to locate the last token of each row
assert (
self.config.pad_token_id is not None or logits_shape[0] == 1
), "Cannot handle batch sizes > 1 if no padding token is defined."
# When sequence_lengths is a plain int (-1), slice the logits at the last position
if not tf.is_tensor(sequence_lengths):
in_logits = logits[0 : logits_shape[0], sequence_lengths]
# Cross-entropy loss over the pooled logits
loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(in_logits, [-1, self.num_labels]))
# Fall back to the full logits if none were gathered
pooled_logits = in_logits if in_logits is not None else logits
# Return a plain tuple when return_dict=False
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a TFSequenceClassifierOutputWithPast
return TFSequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
# Build the model's sub-layers, optionally given an input shape
def build(self, input_shape=None):
# Skip if the layers have already been built
if self.built:
return
self.built = True
# Build the classification head
if getattr(self, "score", None) is not None:
with tf.name_scope(self.score.name):
self.score.build([None, None, self.config.n_embd])
# Build the transformer body
if getattr(self, "transformer", None) is not None:
with tf.name_scope(self.transformer.name):
self.transformer.build(None)
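A standalone numeric sketch of the last-non-padding-token selection performed in the call above (pad token id 0 assumed, illustrative only):
# --- illustrative sketch (not part of the source) ---
import tensorflow as tf

pad_token_id = 0
input_ids = tf.constant([[5, 6, 7, 0, 0],      # padded row: last real token at index 2
                         [5, 6, 7, 8, 9]])     # full row: no padding at all
logits = tf.random.normal((2, 5, 3))           # (batch, seq_len, num_labels)

# Index of the first pad token minus one gives the last real token; -1 means "no pad found"
sequence_lengths = tf.argmax(tf.cast(tf.math.equal(input_ids, pad_token_id), input_ids.dtype), axis=-1) - 1
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
pooled_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
print(sequence_lengths.numpy())   # [2 4]
print(pooled_logits.shape)        # (2, 3)
# --- end sketch ---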