Transformers Source Code Analysis (130)
.\models\yolos\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_yolos": ["YOLOS_PRETRAINED_CONFIG_ARCHIVE_MAP", "YolosConfig", "YolosOnnxConfig"]}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_yolos"] = ["YolosFeatureExtractor"]
_import_structure["image_processing_yolos"] = ["YolosImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_yolos"] = [
"YOLOS_PRETRAINED_MODEL_ARCHIVE_LIST",
"YolosForObjectDetection",
"YolosModel",
"YolosPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_yolos import YOLOS_PRETRAINED_CONFIG_ARCHIVE_MAP, YolosConfig, YolosOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_yolos import YolosFeatureExtractor
from .image_processing_yolos import YolosImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_yolos import (
YOLOS_PRETRAINED_MODEL_ARCHIVE_LIST,
YolosForObjectDetection,
YolosModel,
YolosPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
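The `_LazyModule` registration above defers all heavy imports: nothing under `modeling_yolos` or `image_processing_yolos` is loaded until one of the exported names is actually accessed. A minimal sketch of that behaviour, assuming an environment where both `torch` and the vision extras are installed:

```python
# Illustrative only: attribute access on the lazy package triggers the deferred import
# (assumes transformers is installed together with torch and the vision dependencies).
import transformers.models.yolos as yolos

config = yolos.YolosConfig()                    # imports configuration_yolos on first access
model = yolos.YolosForObjectDetection(config)   # imports modeling_yolos on first access
print(type(model).__name__)                     # YolosForObjectDetection
```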
.\models\yoso\configuration_yoso.py
""" YOSO model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"uw-madison/yoso-4096": "https://huggingface.co/uw-madison/yoso-4096/resolve/main/config.json",
}
class YosoConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`YosoModel`]. It is used to instantiate a YOSO
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the YOSO
[uw-madison/yoso-4096](https://huggingface.co/uw-madison/yoso-4096) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import YosoConfig, YosoModel
>>> # Initializing a YOSO uw-madison/yoso-4096 style configuration
>>> configuration = YosoConfig()
>>> # Initializing a model (with random weights) from the uw-madison/yoso-4096 style configuration
>>> model = YosoModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "yoso"
def __init__(
self,
vocab_size=50265,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=4096,
type_vocab_size=1,
initializer_range=0.02,
layer_norm_eps=1e-12,
position_embedding_type="absolute",
use_expectation=True,
hash_code_len=9,
num_hash=64,
conv_window=None,
use_fast_hash=True,
lsh_backward=True,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_expectation = use_expectation
self.hash_code_len = hash_code_len
self.num_hash = num_hash
self.conv_window = conv_window
self.use_fast_hash = use_fast_hash
self.lsh_backward = lsh_backward
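The LSH-specific fields (`hash_code_len`, `num_hash`, `use_fast_hash`, `lsh_backward`, `use_expectation`, `conv_window`) are what set this configuration apart from a plain BERT-style one. A small illustrative round-trip, overriding a few of them (the output directory name is arbitrary):

```python
# Illustrative only: override the LSH hyperparameters and round-trip the config to disk.
from transformers import YosoConfig

config = YosoConfig(num_hash=32, hash_code_len=8, conv_window=5)
config.save_pretrained("./yoso-custom")                  # writes ./yoso-custom/config.json
reloaded = YosoConfig.from_pretrained("./yoso-custom")
print(reloaded.num_hash, reloaded.hash_code_len, reloaded.conv_window)  # 32 8 5
```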
.\models\yoso\convert_yoso_pytorch_to_pytorch.py
import argparse
import torch
from transformers import YosoConfig, YosoForMaskedLM
def rename_key(orig_key):
if "model" in orig_key:
orig_key = orig_key.replace("model.", "")
if "norm1" in orig_key:
orig_key = orig_key.replace("norm1", "attention.output.LayerNorm")
if "norm2" in orig_key:
orig_key = orig_key.replace("norm2", "output.LayerNorm")
if "norm" in orig_key:
orig_key = orig_key.replace("norm", "LayerNorm")
if "transformer" in orig_key:
layer_num = orig_key.split(".")[0].split("_")[-1]
orig_key = orig_key.replace(f"transformer_{layer_num}", f"encoder.layer.{layer_num}")
if "mha.attn" in orig_key:
orig_key = orig_key.replace("mha.attn", "attention.self")
if "mha" in orig_key:
orig_key = orig_key.replace("mha", "attention")
if "W_q" in orig_key:
orig_key = orig_key.replace("W_q", "self.query")
if "W_k" in orig_key:
orig_key = orig_key.replace("W_k", "self.key")
if "W_v" in orig_key:
orig_key = orig_key.replace("W_v", "self.value")
if "ff1" in orig_key:
orig_key = orig_key.replace("ff1", "intermediate.dense")
if "ff2" in orig_key:
orig_key = orig_key.replace("ff2", "output.dense")
if "ff" in orig_key:
orig_key = orig_key.replace("ff", "output.dense")
if "mlm_class" in orig_key:
orig_key = orig_key.replace("mlm.mlm_class", "cls.predictions.decoder")
if "mlm" in orig_key:
orig_key = orig_key.replace("mlm", "cls.predictions.transform")
if "cls" not in orig_key:
orig_key = "yoso." + orig_key
return orig_key
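To make the renaming rules concrete, here is an illustrative trace of a few plausible checkpoint keys (the key names are invented for demonstration; only `rename_key` above is exercised):

```python
# Illustrative only: trace made-up checkpoint keys through rename_key (assumes the
# function defined above is in scope).
for key in [
    "model.embeddings.norm.weight",         # -> yoso.embeddings.LayerNorm.weight
    "model.transformer_0.mha.W_q.weight",   # -> yoso.encoder.layer.0.attention.self.query.weight
    "model.transformer_0.ff1.bias",         # -> yoso.encoder.layer.0.intermediate.dense.bias
]:
    print(key, "->", rename_key(key))
```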
def convert_checkpoint_helper(max_position_embeddings, orig_state_dict):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if ("pooler" in key) or ("sen_class" in key):
continue
else:
orig_state_dict[rename_key(key)] = val
orig_state_dict["cls.predictions.bias"] = orig_state_dict["cls.predictions.decoder.bias"]
orig_state_dict["yoso.embeddings.position_ids"] = torch.arange(max_position_embeddings).expand((1, -1)) + 2
return orig_state_dict
def convert_yoso_checkpoint(checkpoint_path, yoso_config_file, pytorch_dump_path):
orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]
config = YosoConfig.from_json_file(yoso_config_file)
model = YosoForMaskedLM(config)
new_state_dict = convert_checkpoint_helper(config.max_position_embeddings, orig_state_dict)
print(model.load_state_dict(new_state_dict))
model.eval()
model.save_pretrained(pytorch_dump_path)
print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_model_path", default=None, type=str, required=True, help="Path to YOSO pytorch checkpoint."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The json file for YOSO model config.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_yoso_checkpoint(args.pytorch_model_path, args.config_file, args.pytorch_dump_path)
.\models\yoso\modeling_yoso.py
""" PyTorch YOSO model."""
import math
from pathlib import Path
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_ninja_available,
is_torch_cuda_available,
logging,
)
from .configuration_yoso import YosoConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "uw-madison/yoso-4096"
_CONFIG_FOR_DOC = "YosoConfig"
YOSO_PRETRAINED_MODEL_ARCHIVE_LIST = [
"uw-madison/yoso-4096",
]
lsh_cumulation = None
def load_cuda_kernels():
"""
Load the CUDA kernels used for fast LSH cumulation.
"""
global lsh_cumulation
from torch.utils.cpp_extension import load
def append_root(files):
src_folder = Path(__file__).resolve().parent.parent.parent / "kernels" / "yoso"
return [src_folder / file for file in files]
src_files = append_root(["fast_lsh_cumulation_torch.cpp", "fast_lsh_cumulation.cu", "fast_lsh_cumulation_cuda.cu"])
load("fast_lsh_cumulation", src_files, verbose=True)
import fast_lsh_cumulation as lsh_cumulation
def to_contiguous(input_tensors):
"""
Make sure the input tensor(s) are contiguous.
"""
if isinstance(input_tensors, list):
out = []
for tensor in input_tensors:
if not tensor.is_contiguous():
tensor = tensor.contiguous()
out.append(tensor)
return out
else:
if not input_tensors.is_contiguous():
input_tensors = input_tensors.contiguous()
return input_tensors
def normalize(input_tensors):
"""
Apply L2 normalization to the input tensor(s).
"""
if isinstance(input_tensors, list):
out = []
for tensor in input_tensors:
out.append(nn.functional.normalize(tensor, p=2, dim=-1))
return out
else:
return nn.functional.normalize(input_tensors, p=2, dim=-1)
def hashing(query, key, num_hash, hash_len):
if len(query.size()) != 3:
raise ValueError("Query has incorrect size.")
if len(key.size()) != 3:
raise ValueError("Key has incorrect size.")
rmat = torch.randn(query.size(0), query.size(2), num_hash * hash_len, device=query.device)
raise_pow = 2 ** torch.arange(hash_len, device=query.device)
query_projection = torch.matmul(query, rmat).reshape(query.size(0), query.size(1), num_hash, hash_len)
key_projection = torch.matmul(key, rmat).reshape(key.size(0), key.size(1), num_hash, hash_len)
query_binary = (query_projection > 0).int()
key_binary = (key_projection > 0).int()
query_hash = torch.sum(query_binary * raise_pow, dim=-1)
key_hash = torch.sum(key_binary * raise_pow, dim=-1)
return query_hash.int(), key_hash.int()
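`hashing` implements a SimHash-style scheme: random projections are reduced to sign bits, and each group of `hash_len` bits is packed into one integer code per token and per hash round. An illustrative shape check:

```python
# Illustrative only: shape and range check for the SimHash-style hashing above
# (assumes torch and the hashing function from this file are in scope).
import torch

query = torch.nn.functional.normalize(torch.randn(1, 16, 64), dim=-1)
key = torch.nn.functional.normalize(torch.randn(1, 16, 64), dim=-1)
q_hash, k_hash = hashing(query, key, num_hash=8, hash_len=4)
print(q_hash.shape)                              # torch.Size([1, 16, 8]): one 4-bit code per token per round
print(q_hash.min() >= 0, q_hash.max() < 2 ** 4)  # codes lie in [0, 2**hash_len)
```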
class YosoCumulation(torch.autograd.Function):
@staticmethod
def forward(ctx, query_mask, key_mask, query, key, value, config):
hash_code_len = config["hash_code_len"]
expectation = (1 - torch.acos(torch.matmul(query, key.transpose(-1, -2))) / math.pi) ** hash_code_len
expectation = expectation * query_mask[:, :, None] * key_mask[:, None, :]
cumulation_value = torch.matmul(expectation, value)
ctx.save_for_backward(query_mask, key_mask, expectation, query, key, value)
ctx.config = config
return cumulation_value
@staticmethod
def backward(ctx, grad):
grad = to_contiguous(grad)
query_mask, key_mask, expectation, query, key, value = ctx.saved_tensors
config = ctx.config
hash_code_len = config["hash_code_len"]
weighted_exp = torch.matmul(grad, value.transpose(-1, -2)) * expectation
grad_query = torch.matmul(weighted_exp, (hash_code_len / 2) * key)
grad_key = torch.matmul(weighted_exp.transpose(-1, -2), (hash_code_len / 2) * query)
grad_value = torch.matmul(expectation.transpose(-1, -2), grad)
return None, None, grad_query, grad_key, grad_value, None
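The `expectation` computed in `YosoCumulation.forward` is the standard SimHash collision probability: for unit-norm query and key vectors, a random hyperplane assigns them the same sign bit with probability `1 - arccos(q·k)/π`, so with `hash_code_len` independent bits the full codes collide with probability

```latex
\[
\mathbb{E}\bigl[\mathbf{1}\{h(q) = h(k)\}\bigr]
  = \left(1 - \frac{\arccos(q^{\top} k)}{\pi}\right)^{\texttt{hash\_code\_len}}
\]
```

which is exactly the weight that multiplies `value` above. The `YosoLSHCumulation` variant below estimates the same quantity by counting actual hash collisions instead of evaluating the closed form.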
class YosoLSHCumulation(torch.autograd.Function):
@staticmethod
def forward(ctx, query_mask, key_mask, query, key, value, config):
if query_mask.size(0) != key_mask.size(0):
raise ValueError("Query mask and Key mask differ in sizes in dimension 0")
if query_mask.size(0) != query.size(0):
raise ValueError("Query mask and Query differ in sizes in dimension 0")
if query_mask.size(0) != key.size(0):
raise ValueError("Query mask and Key differ in sizes in dimension 0")
if query_mask.size(0) != value.size(0):
raise ValueError("Query mask and Value mask differ in sizes in dimension 0")
if key.size(1) != value.size(1):
raise ValueError("Key and Value differ in sizes in dimension 1")
if query.size(2) != key.size(2):
raise ValueError("Query and Key differ in sizes in dimension 2")
query_mask, key_mask, query, key, value = to_contiguous([query_mask, key_mask, query, key, value])
use_cuda = query_mask.is_cuda
num_hash = config["num_hash"]
hash_code_len = config["hash_code_len"]
hashtable_capacity = int(2**hash_code_len)
if config["use_fast_hash"]:
query_hash_code, key_hash_code = lsh_cumulation.fast_hash(
query_mask, query, key_mask, key, num_hash, hash_code_len, use_cuda, 1
)
else:
query_hash_code, key_hash_code = hashing(query, key, num_hash, hash_code_len)
cumulation_value = lsh_cumulation.lsh_cumulation(
query_mask, query_hash_code, key_mask, key_hash_code, value, hashtable_capacity, use_cuda, 1
)
ctx.save_for_backward(query_mask, key_mask, query_hash_code, key_hash_code, query, key, value)
ctx.config = config
return cumulation_value
@staticmethod
def backward(ctx, grad):
grad = to_contiguous(grad)
query_mask, key_mask, query_hash_code, key_hash_code, query, key, value = ctx.saved_tensors
config = ctx.config
use_cuda = grad.is_cuda
hash_code_len = config["hash_code_len"]
hashtable_capacity = int(2**hash_code_len)
if config["lsh_backward"]:
grad_value = lsh_cumulation.lsh_cumulation(
key_mask, key_hash_code, query_mask, query_hash_code, grad, hashtable_capacity, use_cuda, 1
)
grad_query = lsh_cumulation.lsh_weighted_cumulation(
query_mask,
query_hash_code,
grad,
key_mask,
key_hash_code,
value,
(hash_code_len / 2) * key,
hashtable_capacity,
use_cuda,
4,
)
grad_key = lsh_cumulation.lsh_weighted_cumulation(
key_mask,
key_hash_code,
value,
query_mask,
query_hash_code,
grad,
(hash_code_len / 2) * query,
hashtable_capacity,
use_cuda,
4,
)
else:
expectation = (1 - torch.acos(torch.matmul(query, key.transpose(-1, -2))) / math.pi) ** hash_code_len
expectation = expectation * query_mask[:, :, None] * key_mask[:, None, :]
weighted_exp = torch.matmul(grad, value.transpose(-1, -2)) * expectation
grad_query = torch.matmul(weighted_exp, (hash_code_len / 2) * key)
grad_key = torch.matmul(weighted_exp.transpose(-1, -2), (hash_code_len / 2) * query)
grad_value = torch.matmul(expectation.transpose(-1, -2), grad)
return None, None, grad_query, grad_key, grad_value, None
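The exact (non-LSH) path can be exercised on CPU without the compiled kernels; a minimal sketch, assuming the `normalize` helper and `YosoCumulation` defined earlier in this file are in scope:

```python
# Illustrative only: run the exact expectation-based cumulation on CPU.
import torch

batch, seq, dim = 1, 8, 16
query = normalize(torch.randn(batch, seq, dim))
key = normalize(torch.randn(batch, seq, dim))
value = torch.randn(batch, seq, dim)
mask = torch.ones(batch, seq)

out = YosoCumulation.apply(mask, mask, query, key, value, {"hash_code_len": 9})
print(out.shape)  # torch.Size([1, 8, 16])
```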
class YosoEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings + 2, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False
)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
persistent=False,
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
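Note the `+ 2` offset on the registered `position_ids` and the `config.max_position_embeddings + 2` rows in the position embedding table: as in RoBERTa-style models (here `pad_token_id = 1`), the first two position slots are reserved, which also explains the `+ 2` applied to `yoso.embeddings.position_ids` in the conversion script earlier. An illustrative check:

```python
# Illustrative only: position ids start at 2, so the embedding table needs
# config.max_position_embeddings + 2 rows.
import torch
from transformers import YosoConfig

config = YosoConfig()
position_ids = torch.arange(config.max_position_embeddings) + 2
print(position_ids[:4])               # tensor([2, 3, 4, 5])
print(position_ids.max().item() + 1)  # 4098 == config.max_position_embeddings + 2
```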
class YosoSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
kernel_loaded = lsh_cumulation is not None
if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
try:
load_cuda_kernels()
except Exception as e:
logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = (
position_embedding_type if position_embedding_type is not None else config.position_embedding_type
)
self.use_expectation = config.use_expectation
self.hash_code_len = config.hash_code_len
self.use_conv = config.conv_window is not None
self.use_fast_hash = config.use_fast_hash
self.num_hash = config.num_hash
self.lsh_backward = config.lsh_backward
self.lsh_config = {
"hash_code_len": self.hash_code_len,
"use_fast_hash": self.use_fast_hash,
"num_hash": self.num_hash,
"lsh_backward": self.lsh_backward,
}
if config.conv_window is not None:
self.conv = nn.Conv2d(
in_channels=config.num_attention_heads,
out_channels=config.num_attention_heads,
kernel_size=(config.conv_window, 1),
padding=(config.conv_window // 2, 0),
bias=False,
groups=config.num_attention_heads,
)
def transpose_for_scores(self, layer):
new_layer_shape = layer.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
layer = layer.view(*new_layer_shape)
return layer.permute(0, 2, 1, 3)
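`transpose_for_scores` turns the flat projection into a per-head layout so the LSH attention can operate head by head. An illustrative shape check with the default 12 heads of size 64:

```python
# Illustrative only: (batch, seq_len, all_head_size) -> (batch, num_heads, seq_len, head_size)
import torch

layer = torch.randn(2, 128, 768)                            # all_head_size = 12 * 64
reshaped = layer.view(2, 128, 12, 64).permute(0, 2, 1, 3)
print(reshaped.shape)                                       # torch.Size([2, 12, 128, 64])
```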
class YosoSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class YosoAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = YosoSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = YosoSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
self_outputs = self.self(hidden_states, attention_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class YosoIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class YosoOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class YosoLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = YosoAttention(config)
self.add_cross_attention = config.add_cross_attention
self.intermediate = YosoIntermediate(config)
self.output = YosoOutput(config)
def forward(self, hidden_states, attention_mask=None, output_attentions=False):
self_attention_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class YosoEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([YosoLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
output_attentions,
)
else:
layer_outputs = layer_module(hidden_states, attention_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutputWithCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class YosoPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class YosoLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = YosoPredictionHeadTransform(config)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class YosoOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = YosoLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class YosoPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = YosoConfig
base_model_prefix = "yoso"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
YOSO_START_DOCSTRING = r"""
这个模型是一个PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)的子类。
可以像普通的PyTorch模块一样使用,并且可以参考PyTorch文档了解一切与一般使用和行为相关的问题。
参数:
config ([`YosoConfig`]): 包含模型所有参数的模型配置类。
使用配置文件初始化不会加载模型关联的权重,只会加载配置。
可以查看[`~PreTrainedModel.from_pretrained`]方法来加载模型权重。
"""
YOSO_INPUTS_DOCSTRING = r"""
输入:
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
# 输入序列中每个token的索引,用于在词汇表中查找对应的token表示
Indices of input sequence tokens in the vocabulary.
# 可以使用AutoTokenizer获取这些索引。参见PreTrainedTokenizer.encode和PreTrainedTokenizer.__call__获取更多细节。
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
# 注意力掩码,用于避免对填充的token进行注意力计算。掩码值选择在[0, 1]之间:
# - 1表示该token是**未被掩码**的,
# - 0表示该token是**被掩码**的。
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 段落token索引,用于指示输入的第一部分和第二部分。索引选择在[0, 1]之间:
# - 0对应*句子A*的token,
# - 1对应*句子B*的token。
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
# 每个输入序列token在位置嵌入中的位置索引。选取范围为[0, config.max_position_embeddings - 1]。
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
# 用于将自注意力模块的特定头部置为零的掩码。掩码值选择在[0, 1]之间:
# - 1表示该头部**未被掩码**,
# - 0表示该头部**被掩码**。
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
# 可选项,可以直接传递嵌入表示而不是传递input_ids。如果需要更多控制如何将input_ids索引转换为关联向量,则这是有用的,比模型内部的嵌入查找矩阵更灵活。
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
# 是否返回所有注意力层的注意力张量。详细信息请参见返回的张量中的`attentions`。
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
# 是否返回所有层的隐藏状态。详细信息请参见返回的张量中的`hidden_states`。
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail.
return_dict (`bool`, *optional*):
# 是否返回`~utils.ModelOutput`而不是普通的元组。
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare YOSO Model transformer outputting raw hidden-states without any specific head on top.",
YOSO_START_DOCSTRING,
)
class YosoModel(YosoPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.embeddings = YosoEmbeddings(config)  # embedding layer for the inputs
self.encoder = YosoEncoder(config)  # stack of YOSO encoder layers
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings  # the word embedding module acts as the input embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)  # prune the selected attention heads in this layer
@add_start_docstrings("""YOSO Model with a `language modeling` head on top.""", YOSO_START_DOCSTRING)
class YosoForMaskedLM(YosoPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
self.yoso = YosoModel(config)  # base YOSO encoder model
self.cls = YosoOnlyMLMHead(config)  # masked language modeling head
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder  # the decoder acts as the output embeddings
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,  # input token IDs, may be None
attention_mask: Optional[torch.Tensor] = None,  # mask indicating which tokens should be attended to
token_type_ids: Optional[torch.Tensor] = None,  # segment IDs, e.g. to distinguish segment A from segment B
position_ids: Optional[torch.Tensor] = None,  # position indices of the tokens
head_mask: Optional[torch.Tensor] = None,  # mask selecting which attention heads to disable
inputs_embeds: Optional[torch.Tensor] = None,  # precomputed input embeddings
labels: Optional[torch.Tensor] = None,  # labels for the MLM loss, shape (batch_size, sequence_length)
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return all hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput; defaults to self.config.use_return_dict
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # fall back to the config default
outputs = self.yoso(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)  # run the base YOSO model
sequence_output = outputs[0]  # sequence of hidden states
prediction_scores = self.cls(sequence_output)  # vocabulary logits from the MLM head
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()  # cross-entropy over the vocabulary for the MLM loss
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[1:]  # tuple output when return_dict is False
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class YosoClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
# dense projection keeping the hidden size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# dropout for regularization
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# output projection from hidden size to the number of labels
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
self.config = config
def forward(self, features, **kwargs):
x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.dense(x)
# apply the activation function configured in config.hidden_act
x = ACT2FN[self.config.hidden_act](x)
x = self.dropout(x)
x = self.out_proj(x)
return x
return x
@add_start_docstrings(
"""YOSO Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks.""",
YOSO_START_DOCSTRING,
)
class YosoForSequenceClassification(YosoPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
# base YOSO model
self.yoso = YosoModel(config)
# classification head
self.classifier = YosoClassificationHead(config)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# fall back to the config default when return_dict is not given
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run the base YOSO model
outputs = self.yoso(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# take the sequence of hidden states from the model output
sequence_output = outputs[0]
# run the classification head to get the logits
logits = self.classifier(sequence_output)
loss = None
# compute the loss when labels are provided
if labels is not None:
# infer the problem type if it was not set in the config
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# pick the loss function matching the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# tuple output when return_dict is False
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# otherwise wrap everything in a SequenceClassifierOutput
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""YOSO Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
YOSO_START_DOCSTRING,
)
class YosoForMultipleChoice(YosoPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.yoso = YosoModel(config)
# pre-classifier keeping the hidden size
self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
# classifier mapping the hidden size to a single score per choice
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# number of choices is the size of the second input dimension
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# flatten the choice dimension into the batch dimension so the model sees 2D inputs
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# run the base YOSO model on the flattened inputs
outputs = self.yoso(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_state = outputs[0]  # (bs * num_choices, seq_len, dim)
# pool by taking the hidden state of the first token
pooled_output = hidden_state[:, 0]  # (bs * num_choices, dim)
pooled_output = self.pre_classifier(pooled_output)  # (bs * num_choices, dim)
pooled_output = nn.ReLU()(pooled_output)  # (bs * num_choices, dim)
logits = self.classifier(pooled_output)
# reshape so that each row holds the scores for one example's choices
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# tuple output when return_dict is False
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# otherwise wrap everything in a MultipleChoiceModelOutput
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""YOSO Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
YOSO_START_DOCSTRING,
)
class YosoForTokenClassification(YosoPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.yoso = YosoModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# linear classifier mapping hidden states to the per-token label logits
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
# forward pass, used both for inference and for training
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run the base YOSO model
outputs = self.yoso(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
# only keep the loss for positions that are not padding
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)
active_labels = torch.where(
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
# tuple output when return_dict is False
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
# otherwise wrap everything in a TokenClassifierOutput
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""YOSO Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
YOSO_START_DOCSTRING,
)
class YosoForQuestionAnswering(YosoPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# question answering uses exactly two labels: start and end position logits
config.num_labels = 2
self.num_labels = config.num_labels
self.yoso = YosoModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# run the base YOSO model
outputs = self.yoso(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
# project hidden states to start and end logits
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
total_loss = None
if start_positions is not None and end_positions is not None:
# squeeze the extra dimension that can appear in multi-GPU setups
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# positions outside the sequence are clamped and then ignored by the loss
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
# tuple output when return_dict is False
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
# otherwise wrap everything in a QuestionAnsweringModelOutput
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
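Putting the pieces together, a hedged end-to-end check of `YosoForMaskedLM` with the public checkpoint (downloads `uw-madison/yoso-4096`; the predicted word is indicative only):

```python
# Illustrative only: fill-mask sanity check with the pretrained YOSO checkpoint.
import torch
from transformers import AutoTokenizer, YosoForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("uw-madison/yoso-4096")
model = YosoForMaskedLM.from_pretrained("uw-madison/yoso-4096")

text = f"Paris is the {tokenizer.mask_token} of France."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))
```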
.\models\yoso\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {"configuration_yoso": ["YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP", "YosoConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_yoso"] = [
"YOSO_PRETRAINED_MODEL_ARCHIVE_LIST",
"YosoForMaskedLM",
"YosoForMultipleChoice",
"YosoForQuestionAnswering",
"YosoForSequenceClassification",
"YosoForTokenClassification",
"YosoLayer",
"YosoModel",
"YosoPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_yoso import YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP, YosoConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_yoso import (
YOSO_PRETRAINED_MODEL_ARCHIVE_LIST,
YosoForMaskedLM,
YosoForMultipleChoice,
YosoForQuestionAnswering,
YosoForSequenceClassification,
YosoForTokenClassification,
YosoLayer,
YosoModel,
YosoPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\__init__.py
from . import (
albert,
align,
altclip,
audio_spectrogram_transformer,
auto,
autoformer,
bark,
bart,
barthez,
bartpho,
beit,
bert,
bert_generation,
bert_japanese,
bertweet,
big_bird,
bigbird_pegasus,
biogpt,
bit,
blenderbot,
blenderbot_small,
blip,
blip_2,
bloom,
bridgetower,
bros,
byt5,
camembert,
canine,
chinese_clip,
clap,
clip,
clipseg,
clvp,
code_llama,
codegen,
cohere,
conditional_detr,
convbert,
convnext,
convnextv2,
cpm,
cpmant,
ctrl,
cvt,
data2vec,
deberta,
deberta_v2,
decision_transformer,
deformable_detr,
deit,
deprecated,
depth_anything,
deta,
detr,
dialogpt,
dinat,
dinov2,
distilbert,
dit,
donut,
dpr,
dpt,
efficientformer,
efficientnet,
electra,
encodec,
encoder_decoder,
ernie,
ernie_m,
esm,
falcon,
fastspeech2_conformer,
flaubert,
flava,
fnet,
focalnet,
fsmt,
funnel,
fuyu,
gemma,
git,
glpn,
gpt2,
gpt_bigcode,
gpt_neo,
gpt_neox,
gpt_neox_japanese,
gpt_sw3,
gptj,
gptsan_japanese,
graphormer,
groupvit,
herbert,
hubert,
ibert,
idefics,
imagegpt,
informer,
instructblip,
jukebox,
kosmos2,
layoutlm,
layoutlmv2,
layoutlmv3,
layoutxlm,
led,
levit,
lilt,
llama,
llava,
llava_next,
longformer,
longt5,
luke,
lxmert,
m2m_100,
mamba,
marian,
markuplm,
mask2former,
maskformer,
mbart,
mbart50,
mega,
megatron_bert,
megatron_gpt2,
rembert,
resnet,
roberta,
roberta_prelayernorm,
roc_bert,
roformer,
rwkv,
sam,
seamless_m4t,
seamless_m4t_v2,
segformer,
seggpt,
sew,
sew_d,
siglip,
speech_encoder_decoder,
speech_to_text,
speech_to_text_2,
speecht5,
splinter,
squeezebert,
stablelm,
starcoder2,
superpoint,
swiftformer,
swin,
swin2sr,
swinv2,
switch_transformers,
t5,
table_transformer,
tapas,
time_series_transformer,
timesformer,
timm_backbone,
trocr,
tvlt,
tvp,
udop,
umt5,
unispeech,
unispeech_sat,
univnet,
upernet,
videomae,
vilt,
vipllava,
vision_encoder_decoder,
vision_text_dual_encoder,
visual_bert,
vit,
vit_hybrid,
vit_mae,
vit_msn,
vitdet,
vitmatte,
vits,
vivit,
wav2vec2,
wav2vec2_bert,
wav2vec2_conformer,
wav2vec2_phoneme,
wav2vec2_with_lm,
wavlm,
whisper,
x_clip,
xglm,
xlm,
xlm_prophetnet,
xlm_roberta,
xlm_roberta_xl,
xlnet,
xmod,
yolos,
yoso,
)
.\onnx\config.py
import copy
import dataclasses
import warnings
from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Mapping,
Optional,
Tuple,
Union,
)
import numpy as np
from packaging import version
from ..utils import TensorType, is_torch_available, is_vision_available, logging
from .utils import (
ParameterFormat,
compute_effective_axis_dimension,
compute_serialized_parameters_size,
)
if TYPE_CHECKING:
from ..configuration_utils import PretrainedConfig
from ..feature_extraction_utils import FeatureExtractionMixin
from ..image_processing_utils import ImageProcessingMixin
from ..tokenization_utils_base import PreTrainedTokenizerBase
if is_vision_available():
from PIL import Image
logger = logging.get_logger(__name__)
DEFAULT_ONNX_OPSET = 11
EXTERNAL_DATA_FORMAT_SIZE_LIMIT = 2 * 1024 * 1024 * 1024
@dataclasses.dataclass
class PatchingSpec:
"""
Dataclass that holds a patching specification.
Args:
o: Module / object where the op to patch is located
name: Name of the op to patch
custom_op: Custom op that patches the original op
orig_op: Original op that is being patched
op_wrapper: Wrapper (optional) that wraps both the original and custom ops.
It is useful for ops that are class or static methods for instance.
"""
o: Any
name: str
custom_op: Callable
orig_op: Optional[Callable] = None
op_wrapper: Optional[Callable] = None
class OnnxConfig(ABC):
"""
Base class for ONNX exportable models, describing metadata on how to export the model through the ONNX format.
"""
default_fixed_batch = 2
default_fixed_sequence = 8
default_fixed_num_choices = 4
torch_onnx_minimum_version = version.parse("1.8")
_tasks_to_common_outputs = {
"causal-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
"default": OrderedDict({"last_hidden_state": {0: "batch", 1: "sequence"}}),
"image-classification": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
"image-segmentation": OrderedDict(
{
"logits": {0: "batch", 1: "sequence"},
"pred_boxes": {0: "batch", 1: "sequence"},
"pred_masks": {0: "batch", 1: "sequence"},
}
),
"masked-im": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
"masked-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
"multiple-choice": OrderedDict({"logits": {0: "batch"}}),
"object-detection": OrderedDict(
{
"logits": {0: "batch", 1: "sequence"},
"pred_boxes": {0: "batch", 1: "sequence"},
}
),
"question-answering": OrderedDict(
{
"start_logits": {0: "batch", 1: "sequence"},
"end_logits": {0: "batch", 1: "sequence"},
}
),
"semantic-segmentation": OrderedDict({"logits": {0: "batch", 1: "num_labels", 2: "height", 3: "width"}}),
"seq2seq-lm": OrderedDict({"logits": {0: "batch", 1: "decoder_sequence"}}),
"sequence-classification": OrderedDict({"logits": {0: "batch"}}),
"token-classification": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
"vision2seq-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
"speech2seq-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
}
def __init__(self, config: "PretrainedConfig", task: str = "default", patching_specs: List[PatchingSpec] = None):
self._config = config
if task not in self._tasks_to_common_outputs:
raise ValueError(
f"{task} is not a supported task, supported tasks: {self._tasks_to_common_outputs.keys()}"
)
self.task = task
self._patching_specs = []
for spec in patching_specs if patching_specs is not None else []:
final_spec = spec
if spec.orig_op is None:
final_spec = dataclasses.replace(spec, orig_op=getattr(spec.o, spec.name))
self._patching_specs.append(final_spec)
@classmethod
def from_model_config(cls, config: "PretrainedConfig", task: str = "default") -> "OnnxConfig":
"""
Instantiate an OnnxConfig for a specific model
Args:
config: The model's configuration to use when exporting to ONNX
Returns:
OnnxConfig for this model
"""
return cls(config, task=task)
@property
@abstractmethod
def inputs(self) -> Mapping[str, Mapping[int, str]]:
"""
Mapping containing the axis definition of the input tensors to provide to the model
Returns:
For each input: its name associated to the axes symbolic name and the axis position within the tensor
"""
raise NotImplementedError()
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
"""
Mapping containing the axis definition of the output tensors to provide to the model
Returns:
For each output: its name associated to the axes symbolic name and the axis position within the tensor
"""
common_outputs = self._tasks_to_common_outputs[self.task]
return copy.deepcopy(common_outputs)
@property
def values_override(self) -> Optional[Mapping[str, Any]]:
"""
Dictionary of keys to override in the model's config before exporting
Returns:
Dictionary with the keys (and their corresponding values) to override
"""
if hasattr(self._config, "use_cache"):
return {"use_cache": False}
return None
@property
def default_batch_size(self) -> int:
"""
The default batch size to use if no other indication
Returns:
Integer > 0
"""
return OnnxConfig.default_fixed_batch
@property
def default_sequence_length(self) -> int:
"""
The default sequence length to use if no other indication
Returns:
Integer > 0
"""
return OnnxConfig.default_fixed_sequence
@property
def default_num_choices(self) -> int:
"""
The default number of choices to use if no other indication
Returns:
Integer > 0
"""
return OnnxConfig.default_fixed_num_choices
@property
def default_onnx_opset(self) -> int:
"""
Which onnx opset to use when exporting the model
Returns:
Integer ONNX Opset version
"""
return DEFAULT_ONNX_OPSET
@property
def atol_for_validation(self) -> float:
"""
What absolute tolerance value to use during model conversion validation.
Returns:
Float absolute tolerance value.
"""
return 1e-5
@property
def is_torch_support_available(self) -> bool:
"""
The minimum PyTorch version required to export the model.
Returns:
`bool`: Whether the installed version of PyTorch is compatible with the model.
"""
if is_torch_available():
from transformers.utils import get_torch_version
return version.parse(get_torch_version()) >= self.torch_onnx_minimum_version
else:
return False
@staticmethod
def use_external_data_format(num_parameters: int) -> bool:
"""
Flag indicating if the model requires using external data format
Args:
num_parameters: Number of parameters in the model
Returns:
True if the serialized parameter size in float32 >= 2Gb, False otherwise
"""
return (
compute_serialized_parameters_size(num_parameters, ParameterFormat.Float)
>= EXTERNAL_DATA_FORMAT_SIZE_LIMIT
)
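As a quick illustration of the threshold above (a minimal sketch, not library code): a float32 parameter takes 4 bytes, so anything beyond roughly 500M parameters crosses the 2 GB protobuf limit and needs ONNX's external data format. The constant value below is an assumption for the example.
# Illustrative only: EXTERNAL_DATA_FORMAT_SIZE_LIMIT is assumed to be 2 GiB here.
EXTERNAL_DATA_FORMAT_SIZE_LIMIT = 2 * 1024**3
num_parameters = 600_000_000          # e.g. a ~600M-parameter model
serialized_size = num_parameters * 4  # float32 -> 4 bytes per parameter
print(serialized_size >= EXTERNAL_DATA_FORMAT_SIZE_LIMIT)  # True -> external data format needed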
def _generate_dummy_images(
self, batch_size: int = 2, num_channels: int = 3, image_height: int = 40, image_width: int = 40
):
"""
Generate dummy images as a list of PIL Image objects.
Args:
batch_size: Number of images to generate
num_channels: Number of color channels per image
image_height: Height of each image
image_width: Width of each image
Returns:
List of PIL Image objects
"""
images = []
for _ in range(batch_size):
data = np.random.rand(image_height, image_width, num_channels) * 255
images.append(Image.fromarray(data.astype("uint8")).convert("RGB"))
return images
def _generate_dummy_audio(
self, batch_size: int = 2, sampling_rate: int = 22050, time_duration: float = 5.0, frequency: int = 220
):
"""
Generate dummy audio data as a list of numpy arrays representing audio samples.
Args:
batch_size: Number of audio samples to generate
sampling_rate: Sampling rate of the audio samples
time_duration: Duration of each audio sample in seconds
frequency: Frequency of the sine wave to generate
Returns:
List of numpy arrays representing audio samples
"""
audio_data = []
for _ in range(batch_size):
t = np.linspace(0, time_duration, int(time_duration * sampling_rate), endpoint=False)
audio_data.append(0.5 * np.sin(2 * np.pi * frequency * t))
return audio_data
def generate_dummy_inputs(
self,
preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin", "ImageProcessingMixin"],
batch_size: int = -1,
seq_length: int = -1,
num_choices: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
num_channels: int = 3,
image_width: int = 40,
image_height: int = 40,
sampling_rate: int = 22050,
time_duration: float = 5.0,
frequency: int = 220,
tokenizer: "PreTrainedTokenizerBase" = None,
):
"""
Generate dummy inputs for the model, such as images, audio, or text tokens.
Args:
preprocessor: Preprocessor object for handling different input types
batch_size: Number of inputs to generate
seq_length: Length of sequence inputs
num_choices: Number of choices (for multiple choice scenarios)
is_pair: Whether the input is a pair
framework: Framework type for input handling
num_channels: Number of channels for image inputs
image_width: Width of image inputs
image_height: Height of image inputs
sampling_rate: Sampling rate for audio inputs
time_duration: Duration of audio inputs
frequency: Frequency of audio inputs
tokenizer: Tokenizer object for token-based inputs
Returns:
Dummy inputs suitable for the model
"""
def generate_dummy_inputs_onnxruntime(self, reference_model_inputs: Mapping[str, Any]) -> Mapping[str, Any]:
"""
Generate inputs for ONNX Runtime using the reference model inputs.
Args:
reference_model_inputs: Mapping of inputs for the model
Returns:
Mapping of inputs suitable for the model's forward function in ONNX Runtime
"""
return reference_model_inputs
def patch_ops(self):
"""
Patch operations on the model instance using predefined specifications.
"""
for spec in self._patching_specs:
custom_op = spec.custom_op if spec.op_wrapper is None else spec.op_wrapper(spec.custom_op)
setattr(spec.o, spec.name, custom_op)
def restore_ops(self):
for spec in self._patching_specs:
orig_op = spec.orig_op if spec.op_wrapper is None else spec.op_wrapper(spec.orig_op)
setattr(spec.o, spec.name, orig_op)
@classmethod
def flatten_output_collection_property(cls, name: str, field: Iterable[Any]) -> Dict[str, Any]:
"""
Flatten any potential nested structure expanding the name of the field with the index of the element within the
structure.
Args:
name: The name of the nested structure
field: The structure to, potentially, be flattened
Returns:
(Dict[str, Any]): Outputs with flattened structure and key mapping this new structure.
"""
from itertools import chain
return {f"{name}.{idx}": item for idx, item in enumerate(chain.from_iterable(field))}
class OnnxConfigWithPast(OnnxConfig, ABC):
def __init__(
self,
config: "PretrainedConfig",
task: str = "default",
patching_specs: List[PatchingSpec] = None,
use_past: bool = False,
):
super().__init__(config, task=task, patching_specs=patching_specs)
self.use_past = use_past
@classmethod
def with_past(cls, config: "PretrainedConfig", task: str = "default") -> "OnnxConfigWithPast":
"""
Instantiate an OnnxConfig with the `use_past` attribute set to True.
Args:
config: The underlying model's configuration to use when exporting to ONNX
Returns:
An OnnxConfig object with `.use_past = True`
"""
return cls(config, task=task, use_past=True)
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
common_outputs = super().outputs
if self.use_past:
self.fill_with_past_key_values_(common_outputs, direction="outputs")
return common_outputs
@property
def values_override(self) -> Optional[Mapping[str, Any]]:
if hasattr(self._config, "use_cache"):
return {"use_cache": self.use_past}
return None
@property
def num_layers(self) -> int:
"""
从模型配置中获取层数属性。对于不称为 `num_layers` 的模型配置,请覆盖此方法。
"""
if not hasattr(self._config, "num_layers"):
raise AttributeError(
"could not find the number of layers attribute in the model configuration, override the num_layers"
" property of the model OnnxConfig to solve this"
)
return self._config.num_layers
@property
def num_attention_heads(self) -> int:
"""
从模型配置中获取注意力头数属性。对于不称为 `num_attention_heads` 的模型配置,请覆盖此方法。
"""
if not hasattr(self._config, "num_attention_heads"):
raise AttributeError(
"could not find the number of attention heads attribute in the model configuration, override the"
" num_attention_heads property of the model OnnxConfig to solve this"
)
return self._config.num_attention_heads
def generate_dummy_inputs(
self,
tokenizer: "PreTrainedTokenizerBase",
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
common_inputs = super().generate_dummy_inputs(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen + 2
shape = (
batch,
self.num_attention_heads,
past_key_values_length,
self._config.hidden_size // self.num_attention_heads,
)
if "attention_mask" in common_inputs:
mask_dtype = common_inputs["attention_mask"].dtype
common_inputs["attention_mask"] = torch.cat(
[common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)],
dim=1,
)
common_inputs["past_key_values"] = []
for _ in range(self.num_layers):
common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
return common_inputs
def fill_with_past_key_values_(
self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str, inverted_values_shape: bool = False
):
"""
Fill the input_or_outputs mapping with past_key_values dynamic axes considering.
Args:
inputs_or_outputs: The mapping to fill.
direction: either "inputs" or "outputs", it specifies whether input_or_outputs is the input mapping or the
output mapping, this is important for axes naming.
inverted_values_shape:
If `True`, store values on dynamic axis 1, else on axis 2.
"""
if direction not in ["inputs", "outputs"]:
raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given')
name = "past_key_values" if direction == "inputs" else "present"
for i in range(self.num_layers):
inputs_or_outputs[f"{name}.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
if inverted_values_shape:
inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch", 1: "past_sequence + sequence"}
else:
inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
def _flatten_past_key_values_(self, flattened_output, name, idx, t):
flattened_output[f"{name}.{idx}.key"] = t[0]
flattened_output[f"{name}.{idx}.value"] = t[1]
def flatten_output_collection_property(self, name: str, field: Iterable[Any]) -> Dict[str, Any]:
flattened_output = {}
if name in ["present", "past_key_values"]:
for idx, t in enumerate(field):
self._flatten_past_key_values_(flattened_output, name, idx, t)
else:
flattened_output = super().flatten_output_collection_property(name, field)
return flattened_output
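A sketch of what this override produces for a 2-layer `present` output (the strings stand in for tensors; only the naming matters and it matches `fill_with_past_key_values_` above).
present = [("layer0_key", "layer0_value"), ("layer1_key", "layer1_value")]
flattened = {}
for idx, t in enumerate(present):
    flattened[f"present.{idx}.key"] = t[0]
    flattened[f"present.{idx}.value"] = t[1]
print(flattened)
# {'present.0.key': 'layer0_key', 'present.0.value': 'layer0_value',
#  'present.1.key': 'layer1_key', 'present.1.value': 'layer1_value'}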
class OnnxSeq2SeqConfigWithPast(OnnxConfigWithPast):
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
common_outputs = super(OnnxConfigWithPast, self).outputs
for name, axes_names in common_outputs.items():
sequence_name = "encoder_sequence" if "encoder" in name else "decoder_sequence"
for axis_idx, name in axes_names.items():
if "sequence" in name:
axes_names[axis_idx] = sequence_name
else:
axes_names[axis_idx] = name
if self.use_past:
self.fill_with_past_key_values_(common_outputs, direction="outputs")
return common_outputs
@property
def num_layers(self) -> Tuple[int]:
try:
num_layers = super().num_layers
num_layers = (num_layers, num_layers)
except AttributeError:
if hasattr(self._config, "encoder_layers") and hasattr(self._config, "decoder_layers"):
num_layers = (self._config.encoder_layers, self._config.decoder_layers)
else:
raise AttributeError(
"could not find the number of encoder and decoder layers attributes in the model configuration,"
" override the num_layers property of the model OnnxConfig to solve this"
)
return num_layers
@property
def num_attention_heads(self) -> Tuple[int]:
try:
num_attention_heads = super().num_attention_heads
num_attention_heads = (num_attention_heads, num_attention_heads)
except AttributeError:
if hasattr(self._config, "encoder_attention_heads") and hasattr(self._config, "decoder_attention_heads"):
num_attention_heads = (self._config.encoder_attention_heads, self._config.decoder_attention_heads)
else:
raise AttributeError(
"could not find the number of attention heads for the encoder and the decoder attributes in the"
" model configuration, override the num_attention_heads property of the model OnnxConfig to solve"
" this"
)
return num_attention_heads
def generate_dummy_inputs(
self,
tokenizer: "PreTrainedTokenizerBase",
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
...
def fill_with_past_key_values_(self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str):
if direction not in ["inputs", "outputs"]:
raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given')
name = "past_key_values" if direction == "inputs" else "present"
num_encoder_layers, num_decoder_layers = self.num_layers
min_num_layers = min(num_encoder_layers, num_decoder_layers)
max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
encoder_sequence = "past_encoder_sequence"
decoder_sequence = "past_decoder_sequence" if direction == "inputs" else "past_decoder_sequence + sequence"
for i in range(min_num_layers):
inputs_or_outputs[f"{name}.{i}.decoder.key"] = {0: "batch", 2: decoder_sequence}
inputs_or_outputs[f"{name}.{i}.decoder.value"] = {0: "batch", 2: decoder_sequence}
inputs_or_outputs[f"{name}.{i}.encoder.key"] = {0: "batch", 2: encoder_sequence}
inputs_or_outputs[f"{name}.{i}.encoder.value"] = {0: "batch", 2: encoder_sequence}
for i in range(min_num_layers, max_num_layers + min_num_layers):
if remaining_side_name == "encoder":
axes_info = {0: "batch", 2: encoder_sequence}
else:
axes_info = {0: "batch", 2: decoder_sequence}
inputs_or_outputs[f"{name}.{i}.{remaining_side_name}.key"] = axes_info
inputs_or_outputs[f"{name}.{i}.{remaining_side_name}.value"] = axes_info
def _flatten_past_key_values_(self, flattened_output, name, idx, t):
flattened_output[f"{name}.{idx}.decoder.key"] = t[0]
flattened_output[f"{name}.{idx}.decoder.value"] = t[1]
flattened_output[f"{name}.{idx}.encoder.key"] = t[2]
flattened_output[f"{name}.{idx}.encoder.value"] = t[3]
.\onnx\convert.py
import warnings
from inspect import signature
from itertools import chain
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Tuple, Union
import numpy as np
from packaging.version import Version, parse
from ..tokenization_utils_base import PreTrainedTokenizerBase
from ..utils import (
TensorType,
is_tf_available,
is_torch_available,
logging,
)
from .config import OnnxConfig
if is_torch_available():
from ..modeling_utils import PreTrainedModel
if is_tf_available():
from ..modeling_tf_utils import TFPreTrainedModel
if TYPE_CHECKING:
from ..feature_extraction_utils import FeatureExtractionMixin
from ..processing_utils import ProcessorMixin
from ..tokenization_utils import PreTrainedTokenizer
logger = logging.get_logger(__name__)
ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0")
def check_onnxruntime_requirements(minimum_version: Version):
"""
检查是否安装了ONNX Runtime,并且安装的版本是否足够新
Raises:
ImportError: 如果未安装ONNX Runtime或版本太旧
"""
try:
import onnxruntime
ort_version = parse(onnxruntime.__version__)
if ort_version < ORT_QUANTIZE_MINIMUM_VERSION:
raise ImportError(
f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n"
"Please update onnxruntime by running `pip install --upgrade onnxruntime`"
)
except ImportError:
raise ImportError(
"onnxruntime doesn't seem to be currently installed. "
"Please install the onnxruntime by running `pip install onnxruntime`"
" and relaunch the conversion."
)
def export_pytorch(
preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin", "ProcessorMixin"],
model: "PreTrainedModel",
config: OnnxConfig,
opset: int,
output: Path,
tokenizer: "PreTrainedTokenizer" = None,
device: str = "cpu",
) -> Tuple[List[str], List[str]]:
"""
导出PyTorch模型至ONNX格式
Args:
preprocessor (Union[PreTrainedTokenizer, FeatureExtractionMixin, ProcessorMixin]):
预处理器对象,可能是PreTrainedTokenizer、FeatureExtractionMixin或ProcessorMixin的子类实例
model (PreTrainedModel): 预训练模型对象,是PreTrainedModel的子类实例
config (OnnxConfig): ONNX导出配置对象,是OnnxConfig类的实例
opset (int): ONNX操作集版本号
output (Path): 导出的ONNX模型路径
tokenizer (PreTrainedTokenizer, optional):
如果模型需要tokenizer,此处提供其对象,可能是PreTrainedTokenizer的子类实例. Defaults to None.
device (str, optional): 设备类型,例如'cpu'或'cuda'. Defaults to "cpu".
Returns:
Tuple[List[str], List[str]]: 返回两个字符串列表,分别表示成功和失败的导出步骤
"""
if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None:
raise ValueError("You cannot provide both a tokenizer and a preprocessor to export the model.")
if tokenizer is not None:
warnings.warn(
"The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use"
" `preprocessor` instead.",
FutureWarning,
)
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
preprocessor = tokenizer
if issubclass(type(model), PreTrainedModel):
import torch
from torch.onnx import export as onnx_export
logger.info(f"Using framework PyTorch: {torch.__version__}")
with torch.no_grad():
model.config.return_dict = True
model.eval()
if config.values_override is not None:
logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
for override_config_key, override_config_value in config.values_override.items():
logger.info(f"\t- {override_config_key} -> {override_config_value}")
setattr(model.config, override_config_key, override_config_value)
model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.PYTORCH)
device = torch.device(device)
if device.type == "cuda" and torch.cuda.is_available():
model.to(device)
model_inputs_device = {}
for k, v in model_inputs.items():
if isinstance(v, Tuple):
model_inputs_device[k] = tuple(
x.to(device) if isinstance(x, torch.Tensor) else None for x in v
)
elif isinstance(v, List):
model_inputs_device[k] = [
tuple(x.to(device) if isinstance(x, torch.Tensor) else None for x in t) for t in v
]
else:
model_inputs_device[k] = v.to(device)
model_inputs = model_inputs_device
inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
onnx_outputs = list(config.outputs.keys())
if not inputs_match:
raise ValueError("Model and config inputs doesn't match")
config.patch_ops()
onnx_export(
model,
(model_inputs,),
f=output.as_posix(),
input_names=list(config.inputs.keys()),
output_names=onnx_outputs,
dynamic_axes=dict(chain(config.inputs.items(), config.outputs.items())),
do_constant_folding=True,
opset_version=opset,
)
config.restore_ops()
return matched_inputs, onnx_outputs
def export_tensorflow(
preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin"],
model: "TFPreTrainedModel",
config: OnnxConfig,
opset: int,
output: Path,
tokenizer: "PreTrainedTokenizer" = None,
) -> Tuple[List[str], List[str]]:
"""
Args:
preprocessor: ([`PreTrainedTokenizer`] or [`FeatureExtractionMixin`]):
用于对数据进行编码的预处理器。
model ([`TFPreTrainedModel`]):
要导出的模型。
config ([`~onnx.config.OnnxConfig`]):
导出模型相关的 ONNX 配置。
opset (`int`):
要使用的 ONNX 操作集的版本。
output (`Path`):
存储导出的 ONNX 模型的目录。
Returns:
`Tuple[List[str], List[str]]`: 包含模型输入顺序列表和来自 ONNX 配置的命名输入的元组。
"""
import onnx
import tensorflow as tf
import tf2onnx
if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None:
raise ValueError("You cannot provide both a tokenizer and preprocessor to export the model.")
if tokenizer is not None:
warnings.warn(
"The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use"
" `preprocessor` instead.",
FutureWarning,
)
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
preprocessor = tokenizer
model.config.return_dict = True
if config.values_override is not None:
logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
for override_config_key, override_config_value in config.values_override.items():
logger.info(f"\t- {override_config_key} -> {override_config_value}")
setattr(model.config, override_config_key, override_config_value)
model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.TENSORFLOW)
inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
onnx_outputs = list(config.outputs.keys())
input_signature = [
tf.TensorSpec([None] * tensor.ndim, dtype=tensor.dtype, name=key) for key, tensor in model_inputs.items()
]
onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature, opset=opset)
onnx.save(onnx_model, output.as_posix())
config.restore_ops()
return matched_inputs, onnx_outputs
device: str = "cpu",
def export_to_onnx(
preprocessor: Union['PreTrainedTokenizer', 'FeatureExtractionMixin', 'ProcessorMixin'],
model: Union['PreTrainedModel', 'TFPreTrainedModel'],
config: OnnxConfig,
opset: int,
output: Path,
device: str = 'cpu'
) -> Tuple[List[str], List[str]]:
"""
Export a Pytorch or TensorFlow model to an ONNX Intermediate Representation (IR)
Args:
preprocessor (Union['PreTrainedTokenizer', 'FeatureExtractionMixin', 'ProcessorMixin']):
The preprocessor used for encoding the data.
model (Union['PreTrainedModel', 'TFPreTrainedModel']):
The model to export.
config (OnnxConfig):
The ONNX configuration associated with the exported model.
opset (int):
The version of the ONNX operator set to use.
output (Path):
Directory to store the exported ONNX model.
device (str, optional, defaults to 'cpu'):
The device on which the ONNX model will be exported. Either 'cpu' or 'cuda'. Only PyTorch is supported for
export on CUDA devices.
Returns:
Tuple[List[str], List[str]]: A tuple with an ordered list of the model's inputs, and the named inputs from
the ONNX configuration.
"""
if not (is_torch_available() or is_tf_available()):
raise ImportError(
"Cannot convert because neither PyTorch nor TensorFlow are installed. "
"Please install torch or tensorflow first."
)
if is_tf_available() and isinstance(model, TFPreTrainedModel) and device == "cuda":
raise RuntimeError("`tf2onnx` does not support export on CUDA device.")
if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None:
raise ValueError("You cannot provide both a tokenizer and a preprocessor to export the model.")
if tokenizer is not None:
warnings.warn(
"The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use"
" `preprocessor` instead.",
FutureWarning,
)
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
preprocessor = tokenizer
if is_torch_available():
from ..utils import get_torch_version
if not config.is_torch_support_available:
logger.warning(
f"Unsupported PyTorch version for this model. Minimum required is {config.torch_onnx_minimum_version},"
f" got: {get_torch_version()}"
)
if is_torch_available() and issubclass(type(model), PreTrainedModel):
return export_pytorch(preprocessor, model, config, opset, output, tokenizer=tokenizer, device=device)
elif is_tf_available() and issubclass(type(model), TFPreTrainedModel):
return export_tensorflow(preprocessor, model, config, opset, output, tokenizer=tokenizer)
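A minimal usage sketch of `export`, following the public documentation; the DistilBERT checkpoint and `DistilBertOnnxConfig` are only examples, and running it assumes `transformers`, `torch` and `onnx` are installed.
from pathlib import Path
from transformers import AutoModel, AutoTokenizer
from transformers.models.distilbert import DistilBertOnnxConfig
from transformers.onnx import export

ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
base_model = AutoModel.from_pretrained(ckpt)
onnx_config = DistilBertOnnxConfig(base_model.config)
onnx_path = Path("model.onnx")
# Returns the matched input names and the named outputs declared by the config.
onnx_inputs, onnx_outputs = export(tokenizer, base_model, onnx_config, onnx_config.default_onnx_opset, onnx_path)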
def validate_model_outputs(
config: OnnxConfig,
preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin", "ProcessorMixin"],
reference_model: Union["PreTrainedModel", "TFPreTrainedModel"],
onnx_model: Path,
onnx_named_outputs: List[str],
atol: float,
tokenizer: "PreTrainedTokenizer" = None,
):
from onnxruntime import InferenceSession, SessionOptions
logger.info("Validating ONNX model...")
if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None:
raise ValueError("You cannot provide both a tokenizer and a preprocessor to validate the model outputs.")
if tokenizer is not None:
warnings.warn(
"The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use"
" `preprocessor` instead.",
FutureWarning,
)
logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
preprocessor = tokenizer
if is_torch_available() and issubclass(type(reference_model), PreTrainedModel):
reference_model_inputs = config.generate_dummy_inputs(
preprocessor,
batch_size=config.default_fixed_batch + 1,
seq_length=config.default_fixed_sequence + 1,
framework=TensorType.PYTORCH,
)
else:
reference_model_inputs = config.generate_dummy_inputs(
preprocessor,
batch_size=config.default_fixed_batch + 1,
seq_length=config.default_fixed_sequence + 1,
framework=TensorType.TENSORFLOW,
)
options = SessionOptions()
session = InferenceSession(onnx_model.as_posix(), options, providers=["CPUExecutionProvider"])
if is_torch_available() and issubclass(type(reference_model), PreTrainedModel):
reference_model.to("cpu")
ref_outputs = reference_model(**reference_model_inputs)
ref_outputs_dict = {}
for name, value in ref_outputs.items():
if name == "past_key_values":
name = "present"
if isinstance(value, (list, tuple)):
value = config.flatten_output_collection_property(name, value)
ref_outputs_dict.update(value)
else:
ref_outputs_dict[name] = value
reference_model_inputs_onnxruntime = config.generate_dummy_inputs_onnxruntime(reference_model_inputs)
onnx_inputs = {}
for name, value in reference_model_inputs_onnxruntime.items():
if isinstance(value, (list, tuple)):
value = config.flatten_output_collection_property(name, value)
onnx_inputs.update({tensor_name: pt_tensor.numpy() for tensor_name, pt_tensor in value.items()})
else:
onnx_inputs[name] = value.numpy()
onnx_outputs = session.run(onnx_named_outputs, onnx_inputs)
ref_outputs_set, onnx_outputs_set = set(ref_outputs_dict.keys()), set(onnx_named_outputs)
if not onnx_outputs_set.issubset(ref_outputs_set):
logger.info(
f"\t-[x] ONNX model output names {onnx_outputs_set} do not match reference model {ref_outputs_set}"
)
raise ValueError(
"Outputs don't match between reference model and ONNX exported model: "
f"{onnx_outputs_set.difference(ref_outputs_set)}"
)
else:
logger.info(f"\t-[✓] ONNX model output names match reference model ({onnx_outputs_set})")
for name, ort_value in zip(onnx_named_outputs, onnx_outputs):
if is_torch_available() and issubclass(type(reference_model), PreTrainedModel):
ref_value = ref_outputs_dict[name].detach().numpy()
else:
ref_value = ref_outputs_dict[name].numpy()
logger.info(f'\t- Validating ONNX Model output "{name}":')
if not ort_value.shape == ref_value.shape:
logger.info(f"\t\t-[x] shape {ort_value.shape} doesn't match {ref_value.shape}")
raise ValueError(
"Outputs shape doesn't match between reference model and ONNX exported model: "
f"Got {ref_value.shape} (reference) and {ort_value.shape} (ONNX)"
)
else:
logger.info(f"\t\t-[✓] {ort_value.shape} matches {ref_value.shape}")
if not np.allclose(ref_value, ort_value, atol=atol):
bad_indices = np.logical_not(np.isclose(ref_value, ort_value, atol=atol))
logger.info(f"\t\t-[x] values not close enough (atol: {atol})")
raise ValueError(
"Outputs values don't match between reference model and ONNX exported model: "
f"Got max absolute difference of: {np.amax(np.abs(ref_value - ort_value))} for "
f"{ref_value[bad_indices]} vs {ort_value[bad_indices]}"
)
else:
logger.info(f"\t\t-[✓] all values close (atol: {atol})")
def ensure_model_and_config_inputs_match(
model: Union["PreTrainedModel", "TFPreTrainedModel"], model_inputs: Iterable[str]
) -> Tuple[bool, List[str]]:
"""
确保模型输入和配置输入匹配的函数。
:param model: 预训练模型对象,可以是 `PreTrainedModel` 或 `TFPreTrainedModel` 的子类之一
:param model_inputs: 模型期望的输入参数的可迭代对象,通常是字符串列表
:return: 返回一个元组,包含一个布尔值和一个字符串列表。布尔值表示模型输入是否与配置输入匹配,字符串列表表示匹配的输入参数的有序列表。
"""
if is_torch_available() and issubclass(type(model), PreTrainedModel):
forward_parameters = signature(model.forward).parameters
else:
forward_parameters = signature(model.call).parameters
model_inputs_set = set(model_inputs)
forward_inputs_set = set(forward_parameters.keys())
is_ok = model_inputs_set.issubset(forward_inputs_set)
matching_inputs = forward_inputs_set.intersection(model_inputs_set)
ordered_inputs = [parameter for parameter in forward_parameters.keys() if parameter in matching_inputs]
return is_ok, ordered_inputs
.\onnx\features.py
import os
from functools import partial, reduce
from typing import TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type, Union
import transformers
from .. import PretrainedConfig, is_tf_available, is_torch_available
from ..utils import TF2_WEIGHTS_NAME, WEIGHTS_NAME, logging
from .config import OnnxConfig
if TYPE_CHECKING:
from transformers import PreTrainedModel, TFPreTrainedModel
logger = logging.get_logger(__name__)
if is_torch_available():
from transformers.models.auto import (
AutoModel,
AutoModelForCausalLM,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForMaskedImageModeling,
AutoModelForMaskedLM,
AutoModelForMultipleChoice,
AutoModelForObjectDetection,
AutoModelForQuestionAnswering,
AutoModelForSemanticSegmentation,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoModelForSpeechSeq2Seq,
AutoModelForTokenClassification,
AutoModelForVision2Seq,
)
if is_tf_available():
from transformers.models.auto import (
TFAutoModel,
TFAutoModelForCausalLM,
TFAutoModelForMaskedLM,
TFAutoModelForMultipleChoice,
TFAutoModelForQuestionAnswering,
TFAutoModelForSemanticSegmentation,
TFAutoModelForSeq2SeqLM,
TFAutoModelForSequenceClassification,
TFAutoModelForTokenClassification,
)
if not is_torch_available() and not is_tf_available():
logger.warning(
"The ONNX export features are only supported for PyTorch or TensorFlow. You will not be able to export models"
" without one of these libraries installed."
)
def supported_features_mapping(
*supported_features: str, onnx_config_cls: str = None
) -> Dict[str, Callable[[PretrainedConfig], OnnxConfig]]:
"""
Generate the mapping between supported the features and their corresponding OnnxConfig for a given model.
Args:
*supported_features: The names of the supported features.
onnx_config_cls: The OnnxConfig full name corresponding to the model.
Returns:
The dictionary mapping a feature to an OnnxConfig constructor.
"""
if onnx_config_cls is None:
raise ValueError("A OnnxConfig class must be provided")
config_cls = transformers
for attr_name in onnx_config_cls.split("."):
config_cls = getattr(config_cls, attr_name)
mapping = {}
for feature in supported_features:
if "-with-past" in feature:
task = feature.replace("-with-past", "")
mapping[feature] = partial(config_cls.with_past, task=task)
else:
mapping[feature] = partial(config_cls.from_model_config, task=feature)
return mapping
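As an illustration (a sketch assuming `transformers` is installed; GPT-2 is used only as an example of a model whose ONNX config supports past key values, and the feature list here is not authoritative), the helper binds each feature name to the right constructor and task:
from transformers import GPT2Config
from transformers.onnx.features import supported_features_mapping

mapping = supported_features_mapping(
    "default", "causal-lm", "causal-lm-with-past", onnx_config_cls="models.gpt2.GPT2OnnxConfig"
)
config = GPT2Config()
onnx_config = mapping["causal-lm"](config)                      # GPT2OnnxConfig.from_model_config(config, task="causal-lm")
onnx_config_with_past = mapping["causal-lm-with-past"](config)  # GPT2OnnxConfig.with_past(config, task="causal-lm")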
class FeaturesManager:
_TASKS_TO_AUTOMODELS = {}
_TASKS_TO_TF_AUTOMODELS = {}
if is_torch_available():
_TASKS_TO_AUTOMODELS = {
"default": AutoModel,
"masked-lm": AutoModelForMaskedLM,
"causal-lm": AutoModelForCausalLM,
"seq2seq-lm": AutoModelForSeq2SeqLM,
"sequence-classification": AutoModelForSequenceClassification,
"token-classification": AutoModelForTokenClassification,
"multiple-choice": AutoModelForMultipleChoice,
"object-detection": AutoModelForObjectDetection,
"question-answering": AutoModelForQuestionAnswering,
"image-classification": AutoModelForImageClassification,
"image-segmentation": AutoModelForImageSegmentation,
"masked-im": AutoModelForMaskedImageModeling,
"semantic-segmentation": AutoModelForSemanticSegmentation,
"vision2seq-lm": AutoModelForVision2Seq,
"speech2seq-lm": AutoModelForSpeechSeq2Seq,
}
if is_tf_available():
_TASKS_TO_TF_AUTOMODELS = {
"default": TFAutoModel,
"masked-lm": TFAutoModelForMaskedLM,
"causal-lm": TFAutoModelForCausalLM,
"seq2seq-lm": TFAutoModelForSeq2SeqLM,
"sequence-classification": TFAutoModelForSequenceClassification,
"token-classification": TFAutoModelForTokenClassification,
"multiple-choice": TFAutoModelForMultipleChoice,
"question-answering": TFAutoModelForQuestionAnswering,
"semantic-segmentation": TFAutoModelForSemanticSegmentation,
}
AVAILABLE_FEATURES = sorted(reduce(lambda s1, s2: s1 | s2, (v.keys() for v in _SUPPORTED_MODEL_TYPE.values())))
@staticmethod
def get_supported_features_for_model_type(
model_type: str, model_name: Optional[str] = None
) -> Dict[str, Callable[[PretrainedConfig], OnnxConfig]]:
"""
Tries to retrieve the feature -> OnnxConfig constructor map from the model type.
Args:
model_type (`str`):
The model type to retrieve the supported features for.
model_name (`str`, *optional*):
The name attribute of the model object, only used for the exception message.
Returns:
The dictionary mapping each feature to a corresponding OnnxConfig constructor.
"""
model_type = model_type.lower()
if model_type not in FeaturesManager._SUPPORTED_MODEL_TYPE:
model_type_and_model_name = f"{model_type} ({model_name})" if model_name else model_type
raise KeyError(
f"{model_type_and_model_name} is not supported yet. "
f"Only {list(FeaturesManager._SUPPORTED_MODEL_TYPE.keys())} are supported. "
f"If you want to support {model_type} please propose a PR or open up an issue."
)
return FeaturesManager._SUPPORTED_MODEL_TYPE[model_type]
@staticmethod
def feature_to_task(feature: str) -> str:
"""
Converts a feature string by removing the '-with-past' suffix.
Args:
feature (`str`):
The feature string to be converted.
Returns:
The feature string with '-with-past' suffix removed.
"""
return feature.replace("-with-past", "")
@staticmethod
def _validate_framework_choice(framework: str):
"""
Validates if the framework requested for the export is both correct and available, otherwise throws an
exception.
Args:
framework (`str`):
The framework requested for ONNX export.
Raises:
ValueError: If the provided framework is not 'pt' or 'tf'.
RuntimeError: If the requested framework is 'pt' but PyTorch is not available,
or if the requested framework is 'tf' but TensorFlow is not available.
"""
if framework not in ["pt", "tf"]:
raise ValueError(
f"Only two frameworks are supported for ONNX export: pt or tf, but {framework} was provided."
)
elif framework == "pt" and not is_torch_available():
raise RuntimeError("Cannot export model to ONNX using PyTorch because no PyTorch package was found.")
elif framework == "tf" and not is_tf_available():
raise RuntimeError("Cannot export model to ONNX using TensorFlow because no TensorFlow package was found.")
@staticmethod
def get_model_class_for_feature(feature: str, framework: str = "pt") -> Type:
"""
Attempts to retrieve an AutoModel class from a feature name.
Args:
feature (`str`):
The feature required.
framework (`str`, *optional*, defaults to `"pt"`):
The framework to use for the export.
Returns:
The AutoModel class corresponding to the feature.
"""
task = FeaturesManager.feature_to_task(feature)
FeaturesManager._validate_framework_choice(framework)
if framework == "pt":
task_to_automodel = FeaturesManager._TASKS_TO_AUTOMODELS
else:
task_to_automodel = FeaturesManager._TASKS_TO_TF_AUTOMODELS
if task not in task_to_automodel:
raise KeyError(
f"Unknown task: {feature}. Possible values are {list(FeaturesManager._TASKS_TO_AUTOMODELS.values())}"
)
return task_to_automodel[task]
@staticmethod
def determine_framework(model: str, framework: str = None) -> str:
"""
Determines the framework to use for the export.
The priority is in the following order:
1. User input via `framework`.
2. If local checkpoint is provided, use the same framework as the checkpoint.
3. Available framework in environment, with priority given to PyTorch
Args:
model (`str`):
The name of the model to export.
framework (`str`, *optional*, defaults to `None`):
The framework to use for the export. See above for priority if none provided.
Returns:
The framework to use for the export.
"""
if framework is not None:
return framework
framework_map = {"pt": "PyTorch", "tf": "TensorFlow"}
exporter_map = {"pt": "torch", "tf": "tf2onnx"}
if os.path.isdir(model):
if os.path.isfile(os.path.join(model, WEIGHTS_NAME)):
framework = "pt"
elif os.path.isfile(os.path.join(model, TF2_WEIGHTS_NAME)):
framework = "tf"
else:
raise FileNotFoundError(
"Cannot determine framework from given checkpoint location."
f" There should be a {WEIGHTS_NAME} for PyTorch"
f" or {TF2_WEIGHTS_NAME} for TensorFlow."
)
logger.info(f"Local {framework_map[framework]} model found.")
else:
if is_torch_available():
framework = "pt"
elif is_tf_available():
framework = "tf"
else:
raise EnvironmentError("Neither PyTorch nor TensorFlow found in environment. Cannot export to ONNX.")
logger.info(f"Framework not requested. Using {exporter_map[framework]} to export to ONNX.")
return framework
@staticmethod
def get_model_from_feature(
feature: str, model: str, framework: str = None, cache_dir: str = None
) -> Union["PreTrainedModel", "TFPreTrainedModel"]:
"""
Attempts to retrieve a model instance based on the given feature and model name.
Args:
feature (`str`):
The specific feature required by the model.
model (`str`):
The name of the model to retrieve.
framework (`str`, *optional*, defaults to `None`):
The framework to use for model instantiation. If not provided, determined by `FeaturesManager.determine_framework`.
Returns:
Union["PreTrainedModel", "TFPreTrainedModel"]: The instantiated model object.
"""
framework = FeaturesManager.determine_framework(model, framework)
model_class = FeaturesManager.get_model_class_for_feature(feature, framework)
try:
model = model_class.from_pretrained(model, cache_dir=cache_dir)
except OSError:
if framework == "pt":
logger.info("Loading TensorFlow model in PyTorch before exporting to ONNX.")
model = model_class.from_pretrained(model, from_tf=True, cache_dir=cache_dir)
else:
logger.info("Loading PyTorch model in TensorFlow before exporting to ONNX.")
model = model_class.from_pretrained(model, from_pt=True, cache_dir=cache_dir)
return model
@staticmethod
def check_supported_model_or_raise(
model: Union["PreTrainedModel", "TFPreTrainedModel"], feature: str = "default"
) -> Tuple[str, Callable]:
"""
Checks if a given model supports a specified feature.
Args:
model (Union["PreTrainedModel", "TFPreTrainedModel"]):
The model instance to check.
feature (`str`, *optional*, defaults to `"default"`):
The feature name to verify if supported.
Returns:
Tuple[str, Callable]:
- The type of the model (`str`).
- Callable function from `FeaturesManager._SUPPORTED_MODEL_TYPE` corresponding to the feature.
"""
model_type = model.config.model_type.replace("_", "-")
model_name = getattr(model, "name", "")
model_features = FeaturesManager.get_supported_features_for_model_type(model_type, model_name=model_name)
if feature not in model_features:
raise ValueError(
f"{model.config.model_type} doesn't support feature {feature}. Supported values are: {model_features}"
)
return model.config.model_type, FeaturesManager._SUPPORTED_MODEL_TYPE[model_type][feature]
@staticmethod
def get_config(model_type: str, feature: str) -> OnnxConfig:
"""
Retrieves the configuration for a specified model type and feature combination.
Args:
model_type (`str`):
The type of model to fetch the configuration for.
feature (`str`):
The feature to retrieve the configuration for.
Returns:
`OnnxConfig`: Configuration object for the specified model type and feature.
"""
return FeaturesManager._SUPPORTED_MODEL_TYPE[model_type][feature]
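Putting the pieces of FeaturesManager together, the typical flow (the same one used by `__main__.py` below) looks roughly like this; the checkpoint name is just an example.
from transformers.onnx import FeaturesManager

feature = "sequence-classification"
model = FeaturesManager.get_model_from_feature(feature, "distilbert-base-uncased")
model_kind, config_ctor = FeaturesManager.check_supported_model_or_raise(model, feature=feature)
onnx_config = config_ctor(model.config)
print(model_kind, onnx_config.default_onnx_opset)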
.\onnx\utils.py
from ctypes import c_float, sizeof
from enum import Enum
from typing import TYPE_CHECKING, Optional, Union
if TYPE_CHECKING:
from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
class ParameterFormat(Enum):
Float = c_float
@property
def size(self) -> int:
"""
返回该数据类型所需的字节数
Returns:
Integer > 0
"""
return sizeof(self.value)
def compute_effective_axis_dimension(dimension: int, fixed_dimension: int, num_token_to_add: int = 0) -> int:
"""
Args:
dimension: 当前轴的维度
fixed_dimension: 固定的轴维度
num_token_to_add: 需要添加的标记数量
Returns:
计算后的有效轴维度
"""
if dimension <= 0:
dimension = fixed_dimension
dimension -= num_token_to_add
return dimension
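Two quick illustrations of the fallback logic above: a dynamic axis (`-1`) falls back to the fixed default, and the tokens a tokenizer will add are subtracted so the final encoded length still matches the fixed dimension.
print(compute_effective_axis_dimension(-1, fixed_dimension=2))                      # 2 (dynamic batch axis)
print(compute_effective_axis_dimension(-1, fixed_dimension=8, num_token_to_add=2))  # 6 (leaves room for special tokens)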
def compute_serialized_parameters_size(num_parameters: int, dtype: ParameterFormat) -> int:
"""
计算在给定存储格式中序列化模型时所有参数占用的大小
Args:
num_parameters: 需要保存的参数数量
dtype: 每个参数保存的数据格式
Returns:
所有参数保存时占用的字节数
"""
return num_parameters * dtype.size
def get_preprocessor(model_name: str) -> Optional[Union["AutoTokenizer", "AutoFeatureExtractor", "AutoProcessor"]]:
"""
获取适用于 `model_name` 的预处理器(分词器、特征提取器或处理器)。
Args:
model_name (`str`): 模型名称,用于加载预处理器。
Returns:
`Optional[Union[AutoTokenizer, AutoFeatureExtractor, AutoProcessor]]`:
如果找到处理器,则返回处理器。如果存在分词器或特征提取器,则返回分词器或特征提取器。如果同时存在分词器和特征提取器,则会引发错误。如果找不到预处理器,则返回 `None`。
"""
from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
try:
return AutoProcessor.from_pretrained(model_name)
except (ValueError, OSError, KeyError):
tokenizer, feature_extractor = None, None
try:
tokenizer = AutoTokenizer.from_pretrained(model_name)
except (OSError, KeyError):
pass
try:
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
except (OSError, KeyError):
pass
if tokenizer is not None and feature_extractor is not None:
raise ValueError(
f"Couldn't auto-detect preprocessor for {model_name}. Found both a tokenizer and a feature extractor."
)
elif tokenizer is None and feature_extractor is None:
return None
elif tokenizer is not None:
return tokenizer
else:
return feature_extractor
.\onnx\__init__.py
from typing import TYPE_CHECKING
from ..utils import _LazyModule
_import_structure = {
"config": [
"EXTERNAL_DATA_FORMAT_SIZE_LIMIT",
"OnnxConfig",
"OnnxConfigWithPast",
"OnnxSeq2SeqConfigWithPast",
"PatchingSpec",
],
"convert": ["export", "validate_model_outputs"],
"features": ["FeaturesManager"],
"utils": ["ParameterFormat", "compute_serialized_parameters_size"],
}
if TYPE_CHECKING:
from .config import (
EXTERNAL_DATA_FORMAT_SIZE_LIMIT,
OnnxConfig,
OnnxConfigWithPast,
OnnxSeq2SeqConfigWithPast,
PatchingSpec,
)
from .convert import export, validate_model_outputs
from .features import FeaturesManager
from .utils import ParameterFormat, compute_serialized_parameters_size
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\onnx\__main__.py
import subprocess
import sys
import warnings
from argparse import ArgumentParser
from pathlib import Path
from packaging import version
from .. import AutoFeatureExtractor, AutoImageProcessor, AutoProcessor, AutoTokenizer
from ..utils import logging
from ..utils.import_utils import is_optimum_available
from .convert import export, validate_model_outputs
from .features import FeaturesManager
from .utils import get_preprocessor
MIN_OPTIMUM_VERSION = "1.5.0"
ENCODER_DECODER_MODELS = ["vision-encoder-decoder"]
def export_with_optimum(args):
if is_optimum_available():
from optimum.version import __version__ as optimum_version
parsed_optimum_version = version.parse(optimum_version)
if parsed_optimum_version < version.parse(MIN_OPTIMUM_VERSION):
raise RuntimeError(
f"transformers.onnx requires optimum >= {MIN_OPTIMUM_VERSION} but {optimum_version} is installed. You "
"can upgrade optimum by running: pip install -U optimum[exporters]"
)
else:
raise RuntimeError(
"transformers.onnx requires optimum to run, you can install the library by running: pip install "
"optimum[exporters]"
)
cmd_line = [
sys.executable,
"-m",
"optimum.exporters.onnx",
f"--model {args.model}",
f"--task {args.feature}",
f"--framework {args.framework}" if args.framework is not None else "",
f"{args.output}",
]
proc = subprocess.Popen(cmd_line, stdout=subprocess.PIPE)
proc.wait()
logger.info(
"The export was done by optimum.exporters.onnx. We recommend using to use this package directly in future, as "
"transformers.onnx is deprecated, and will be removed in v5. You can find more information here: "
"https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model."
)
def export_with_transformers(args):
args.output = args.output if args.output.is_file() else args.output.joinpath("model.onnx")
if not args.output.parent.exists():
args.output.parent.mkdir(parents=True)
model = FeaturesManager.get_model_from_feature(
args.feature, args.model, framework=args.framework, cache_dir=args.cache_dir
)
model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(model, feature=args.feature)
onnx_config = model_onnx_config(model.config)
if model_kind in ENCODER_DECODER_MODELS:
encoder_model = model.get_encoder()
decoder_model = model.get_decoder()
encoder_onnx_config = onnx_config.get_encoder_config(encoder_model.config)
decoder_onnx_config = onnx_config.get_decoder_config(
encoder_model.config, decoder_model.config, feature=args.feature
)
if args.opset is None:
args.opset = max(encoder_onnx_config.default_onnx_opset, decoder_onnx_config.default_onnx_opset)
if args.opset < min(encoder_onnx_config.default_onnx_opset, decoder_onnx_config.default_onnx_opset):
raise ValueError(
f"Opset {args.opset} is not sufficient to export {model_kind}. At least "
f"{min(encoder_onnx_config.default_onnx_opset, decoder_onnx_config.default_onnx_opset)} is required."
)
preprocessor = AutoFeatureExtractor.from_pretrained(args.model)
onnx_inputs, onnx_outputs = export(
preprocessor,
encoder_model,
encoder_onnx_config,
args.opset,
args.output.parent.joinpath("encoder_model.onnx"),
)
validate_model_outputs(
encoder_onnx_config,
preprocessor,
encoder_model,
args.output.parent.joinpath("encoder_model.onnx"),
onnx_outputs,
args.atol if args.atol else encoder_onnx_config.atol_for_validation,
)
preprocessor = AutoTokenizer.from_pretrained(args.model)
onnx_inputs, onnx_outputs = export(
preprocessor,
decoder_model,
decoder_onnx_config,
args.opset,
args.output.parent.joinpath("decoder_model.onnx"),
)
validate_model_outputs(
decoder_onnx_config,
preprocessor,
decoder_model,
args.output.parent.joinpath("decoder_model.onnx"),
onnx_outputs,
args.atol if args.atol else decoder_onnx_config.atol_for_validation,
)
logger.info(
f"All good, model saved at: {args.output.parent.joinpath('encoder_model.onnx').as_posix()},"
f" {args.output.parent.joinpath('decoder_model.onnx').as_posix()}"
)
else:
if args.preprocessor == "auto":
preprocessor = get_preprocessor(args.model)
elif args.preprocessor == "tokenizer":
preprocessor = AutoTokenizer.from_pretrained(args.model)
elif args.preprocessor == "image_processor":
preprocessor = AutoImageProcessor.from_pretrained(args.model)
elif args.preprocessor == "feature_extractor":
preprocessor = AutoFeatureExtractor.from_pretrained(args.model)
elif args.preprocessor == "processor":
preprocessor = AutoProcessor.from_pretrained(args.model)
else:
raise ValueError(f"Unknown preprocessor type '{args.preprocessor}'")
if args.opset is None:
args.opset = onnx_config.default_onnx_opset
if args.opset < onnx_config.default_onnx_opset:
raise ValueError(
f"Opset {args.opset} is not sufficient to export {model_kind}. "
f"At least {onnx_config.default_onnx_opset} is required."
)
onnx_inputs, onnx_outputs = export(
preprocessor,
model,
onnx_config,
args.opset,
args.output,
)
if args.atol is None:
args.atol = onnx_config.atol_for_validation
validate_model_outputs(onnx_config, preprocessor, model, args.output, onnx_outputs, args.atol)
logger.info(f"All good, model saved at: {args.output.as_posix()}")
warnings.warn(
"The export was done by transformers.onnx which is deprecated and will be removed in v5. We recommend"
" using optimum.exporters.onnx in future. You can find more information here:"
" https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model.",
FutureWarning,
)
def main():
parser = ArgumentParser("Hugging Face Transformers ONNX exporter")
parser.add_argument(
"-m", "--model", type=str, required=True, help="Model ID on huggingface.co or path on disk to load model from."
)
parser.add_argument(
"--feature",
default="default",
help="The type of features to export the model with.",
)
parser.add_argument("--opset", type=int, default=None, help="ONNX opset version to export the model with.")
parser.add_argument(
"--atol", type=float, default=None, help="Absolute difference tolerance when validating the model."
)
parser.add_argument(
"--framework",
type=str,
choices=["pt", "tf"],
default=None,
help=(
"The framework to use for the ONNX export."
" If not provided, will attempt to use the local checkpoint's original framework"
" or what is available in the environment."
),
)
parser.add_argument("output", type=Path, help="Path indicating where to store generated ONNX model.")
parser.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
parser.add_argument(
"--preprocessor",
type=str,
choices=["auto", "tokenizer", "feature_extractor", "image_processor", "processor"],
default="auto",
help="Which type of preprocessor to use. 'auto' tries to automatically detect it.",
)
parser.add_argument(
"--export_with_transformers",
action="store_true",
help=(
"Whether to use transformers.onnx instead of optimum.exporters.onnx to perform the ONNX export. It can be "
"useful when exporting a model supported in transformers but not in optimum, otherwise it is not "
"recommended."
),
)
args = parser.parse_args()
if args.export_with_transformers or not is_optimum_available():
export_with_transformers(args)
else:
export_with_optimum(args)
if __name__ == "__main__":
logger = logging.get_logger("transformers.onnx")
logger.setLevel(logging.INFO)
main()
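For reference, this module is normally invoked as a command-line tool; the sketch below simply spawns that entry point from Python, mirroring the subprocess call used by `export_with_optimum` above (the checkpoint name and output directory are examples).
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "transformers.onnx", "--model=distilbert-base-uncased", "--feature=default", "onnx_out/"],
    check=True,
)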
.\optimization.py
"""PyTorch optimization for BERT model."""
import math
import warnings
from functools import partial
from typing import Callable, Iterable, Optional, Tuple, Union
import torch
from torch import nn
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
from .trainer_pt_utils import LayerWiseDummyOptimizer, LayerWiseDummyScheduler
from .trainer_utils import SchedulerType
from .utils import logging
from .utils.versions import require_version
logger = logging.get_logger(__name__)
def _get_constant_lambda(_=None):
return 1
def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
"""
Create a schedule with a constant learning rate, using the learning rate set in optimizer.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
last_epoch (`int`, *optional*, defaults to -1):
The index of the last epoch when resuming training.
Return:
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
return LambdaLR(optimizer, _get_constant_lambda, last_epoch=last_epoch)
def get_reduce_on_plateau_schedule(optimizer: Optimizer, **kwargs):
"""
Create a schedule with a constant learning rate that decreases when a metric has stopped improving.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
kwargs (`dict`, *optional*):
Extra parameters to be passed to the scheduler. See `torch.optim.lr_scheduler.ReduceLROnPlateau`
for possible parameters.
Return:
`torch.optim.lr_scheduler.ReduceLROnPlateau` with the appropriate schedule.
"""
return ReduceLROnPlateau(optimizer, **kwargs)
def _get_constant_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1.0, num_warmup_steps))
return 1.0
def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1):
"""
Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
increases linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
num_warmup_steps (`int`):
The number of steps for the warmup phase.
last_epoch (`int`, *optional*, defaults to -1):
The index of the last epoch when resuming training.
Return:
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
lr_lambda = partial(_get_constant_schedule_with_warmup_lr_lambda, num_warmup_steps=num_warmup_steps)
return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
def _get_linear_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
"""
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
num_warmup_steps (`int`):
The number of steps for the warmup phase.
num_training_steps (`int`):
The total number of training steps.
last_epoch (`int`, *optional*, defaults to -1):
The index of the last epoch when resuming training.
Return:
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
lr_lambda = partial(
_get_linear_schedule_with_warmup_lr_lambda,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
)
return LambdaLR(optimizer, lr_lambda, last_epoch)
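A small usage sketch (assuming `torch` is installed): the returned LambdaLR multiplies the optimizer's base learning rate by the lambda above, so the rate ramps up over the warmup steps and then decays linearly to 0 at the last training step.
import torch
from transformers import get_linear_schedule_with_warmup  # same function as defined above

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=100)
for _ in range(5):
    optimizer.step()   # normally preceded by a backward pass
    scheduler.step()
print(scheduler.get_last_lr())  # [0.0005] after 5 of the 10 warmup steps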
def _get_cosine_schedule_with_warmup_lr_lambda(
current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float
):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
def get_cosine_schedule_with_warmup(
optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
):
"""
Create a schedule with a learning rate that decreases following the values of the cosine function between the
initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
initial lr set in the optimizer.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
num_warmup_steps (`int`):
The number of steps for the warmup phase.
num_training_steps (`int`):
The total number of training steps.
num_cycles (`float`, *optional*, defaults to 0.5):
The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
following a half-cosine).
last_epoch (`int`, *optional*, defaults to -1):
The index of the last epoch when resuming training.
"""
lr_lambda = partial(
_get_cosine_schedule_with_warmup_lr_lambda,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
num_cycles=num_cycles,
)
return LambdaLR(optimizer, lr_lambda, last_epoch)
def _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda(
current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: int
):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
if progress >= 1.0:
return 0.0
return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
def get_cosine_with_hard_restarts_schedule_with_warmup(
optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1
):
"""
Create a schedule with a learning rate that decreases following the values of the cosine function between the
initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
num_warmup_steps (`int`):
The number of steps for the warmup phase.
num_training_steps (`int`):
The total number of training steps.
num_cycles (`int`, *optional*, defaults to 1):
The number of hard restarts to use.
last_epoch (`int`, *optional*, defaults to -1):
The index of the last epoch when resuming training.
Return:
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
lr_lambda = partial(
_get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
num_cycles=num_cycles,
)
return LambdaLR(optimizer, lr_lambda, last_epoch)
def _get_polynomial_decay_schedule_with_warmup_lr_lambda(
current_step: int,
*,
num_warmup_steps: int,
num_training_steps: int,
lr_end: float,
power: float,
lr_init: int,
):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
elif current_step > num_training_steps:
return lr_end / lr_init
else:
lr_range = lr_init - lr_end
decay_steps = num_training_steps - num_warmup_steps
pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
decay = lr_range * pct_remaining**power + lr_end
return decay / lr_init
def get_polynomial_decay_schedule_with_warmup(
optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
):
"""
Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
optimizer to `lr_end` after `num_training_steps`, with a linear warmup over `num_warmup_steps` steps.
Args:
optimizer (`torch.optim.Optimizer`):
The optimizer for which to schedule the learning rate.
num_warmup_steps (`int`):
The number of steps for the warmup phase.
num_training_steps (`int`):
The total number of training steps.
lr_end (`float`, optional, defaults to 1e-7):
The final learning rate after the decay.
power (`float`, optional, defaults to 1.0):
Power factor for polynomial decay.
last_epoch (`int`, optional, defaults to -1):
The index of the last epoch when resuming training.
Return:
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
lr_init = optimizer.defaults["lr"]
if not (lr_init > lr_end):
raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})")
lr_lambda = partial(
_get_polynomial_decay_schedule_with_warmup_lr_lambda,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
lr_end=lr_end,
power=power,
lr_init=lr_init,
)
return LambdaLR(optimizer, lr_lambda, last_epoch)
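As a quick sanity check of the lambda above (illustrative, not part of the source): with `power=1.0` the schedule decays linearly from the optimizer's initial lr down to `lr_end`:
```
import torch
from transformers import get_polynomial_decay_schedule_with_warmup

optimizer = torch.optim.AdamW(torch.nn.Linear(4, 4).parameters(), lr=0.1)
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer, num_warmup_steps=10, num_training_steps=110, lr_end=1e-7, power=1.0
)
for _ in range(60):
    optimizer.step()
    scheduler.step()
# 50 of the 100 decay steps are done, so the lr is roughly half of 0.1
print(scheduler.get_last_lr())  # ~[0.05]
```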
def _get_inverse_sqrt_schedule_lr_lambda(current_step: int, *, num_warmup_steps: int, timescale: int = None):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
shift = timescale - num_warmup_steps
decay = 1.0 / math.sqrt((current_step + shift) / timescale)
return decay
def get_inverse_sqrt_schedule(
optimizer: Optimizer, num_warmup_steps: int, timescale: int = None, last_epoch: int = -1
):
"""
Create a schedule with an inverse square-root learning rate, from the initial lr set in the optimizer, after a
warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
num_warmup_steps (`int`):
The number of steps for the warmup phase.
timescale (`int`, *optional*, defaults to `num_warmup_steps`):
Time scale.
last_epoch (`int`, *optional*, defaults to -1):
The index of the last epoch when resuming training.
Return:
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
if timescale is None:
timescale = num_warmup_steps or 10_000
lr_lambda = partial(_get_inverse_sqrt_schedule_lr_lambda, num_warmup_steps=num_warmup_steps, timescale=timescale)
return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
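A minimal usage sketch (not part of the source); with the default `timescale == num_warmup_steps`, `shift` is 0 and the post-warmup multiplier is `sqrt(timescale / current_step)`:
```
import torch
from transformers.optimization import get_inverse_sqrt_schedule

optimizer = torch.optim.AdamW(torch.nn.Linear(4, 4).parameters(), lr=3e-4)
scheduler = get_inverse_sqrt_schedule(optimizer, num_warmup_steps=1000)
for _ in range(4000):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr())  # ~[3e-4 * sqrt(1000 / 4000)] == [1.5e-4]
```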
TYPE_TO_SCHEDULER_FUNCTION = {
SchedulerType.LINEAR: get_linear_schedule_with_warmup,
SchedulerType.COSINE: get_cosine_schedule_with_warmup,
SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
SchedulerType.CONSTANT: get_constant_schedule,
SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule,
SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule,
}
def get_scheduler(
name: Union[str, SchedulerType],
optimizer: Optimizer,
num_warmup_steps: Optional[int] = None,
num_training_steps: Optional[int] = None,
scheduler_specific_kwargs: Optional[dict] = None,
):
"""
统一的 API 通过名称获取任何调度器。
Args:
name (Union[str, SchedulerType]):
调度器的名称。
optimizer (Optimizer):
要调度学习率的优化器。
num_warmup_steps (Optional[int], 可选):
预热阶段的步数。
num_training_steps (Optional[int], 可选):
训练总步数。
scheduler_specific_kwargs (Optional[dict], 可选):
特定于调度器的其他参数。
"""
Args:
name (`str` or `SchedulerType`):
The name of the scheduler to use.
optimizer (`torch.optim.Optimizer`):
The optimizer that will be used during training.
num_warmup_steps (`int`, *optional*):
The number of warmup steps to do. This is not required by all schedulers (hence the argument being
optional), the function will raise an error if it's unset and the scheduler type requires it.
num_training_steps (`int`, *optional*):
The number of training steps to do. This is not required by all schedulers (hence the argument being
optional), the function will raise an error if it's unset and the scheduler type requires it.
scheduler_specific_kwargs (`dict`, *optional*):
Extra parameters for schedulers such as cosine with restarts. Mismatched scheduler types and scheduler
parameters will cause the scheduler function to raise a TypeError.
"""
# Convert `name` to SchedulerType enum
name = SchedulerType(name)
# Retrieve the scheduler function corresponding to `name`
schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
# If `optimizer` is a LayerWiseDummyOptimizer, recursively fetch schedulers for each parameter
if optimizer is not None and isinstance(optimizer, LayerWiseDummyOptimizer):
optimizer_dict = optimizer.optimizer_dict
scheduler_dict = {}
# Iterate over optimizer parameters and fetch corresponding schedulers
for param in optimizer_dict.keys():
scheduler_dict[param] = get_scheduler(
name,
optimizer=optimizer_dict[param],
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
)
# Define a scheduler hook for each parameter to apply the respective scheduler step
def scheduler_hook(param):
if param.grad is not None:
scheduler_dict[param].step()
# Register the scheduler hook for each parameter that requires gradients
for param in optimizer_dict.keys():
if param.requires_grad:
param.register_post_accumulate_grad_hook(scheduler_hook)
# Return a LayerWiseDummyScheduler instance
return LayerWiseDummyScheduler()
# For constant scheduler types, directly apply the scheduler function on `optimizer`
if name == SchedulerType.CONSTANT:
return schedule_func(optimizer)
# Handle REDUCE_ON_PLATEAU scheduler type with specific kwargs if provided
if scheduler_specific_kwargs is None:
scheduler_specific_kwargs = {}
if name == SchedulerType.REDUCE_ON_PLATEAU:
return schedule_func(optimizer, **scheduler_specific_kwargs)
# Raise an error if `num_warmup_steps` is not provided for required scheduler types
if num_warmup_steps is None:
raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
# Apply schedulers requiring `num_warmup_steps` with the provided value
if name == SchedulerType.CONSTANT_WITH_WARMUP:
return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
if name == SchedulerType.INVERSE_SQRT:
return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
# All other schedulers require `num_training_steps` to be provided
if num_training_steps is None:
raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
# Call `schedule_func` with the optimizer, the warmup/training step counts, and any scheduler-specific kwargs
return schedule_func(
optimizer,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
**scheduler_specific_kwargs,
)
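For reference, a typical call goes through the string name rather than the enum (illustrative values; `scheduler_specific_kwargs` is forwarded to the underlying schedule function):
```
import torch
from transformers import get_scheduler

optimizer = torch.optim.AdamW(torch.nn.Linear(4, 4).parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    "cosine_with_restarts",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=1000,
    scheduler_specific_kwargs={"num_cycles": 3},  # passed to get_cosine_with_hard_restarts_schedule_with_warmup
)
```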
# The AdamW optimizer class, inheriting from torch.optim.Optimizer
class AdamW(Optimizer):
"""
Implements Adam algorithm with weight decay fix as introduced in [Decoupled Weight Decay
Regularization](https://arxiv.org/abs/1711.05101).
Parameters:
params (`Iterable[nn.parameter.Parameter]`):
Iterable of parameters to optimize or dictionaries defining parameter groups.
lr (`float`, *optional*, defaults to 0.001):
The learning rate to use.
betas (`Tuple[float,float]`, *optional*, defaults to `(0.9, 0.999)`):
Adam's betas parameters (b1, b2).
eps (`float`, *optional*, defaults to 1e-06):
Adam's epsilon for numerical stability.
weight_decay (`float`, *optional*, defaults to 0.0):
Decoupled weight decay to apply.
correct_bias (`bool`, *optional*, defaults to `True`):
Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`).
no_deprecation_warning (`bool`, *optional*, defaults to `False`):
A flag used to disable the deprecation warning (set to `True` to disable the warning).
"""
def __init__(
self,
params: Iterable[nn.parameter.Parameter],
lr: float = 1e-3,
betas: Tuple[float, float] = (0.9, 0.999),
eps: float = 1e-6,
weight_decay: float = 0.0,
correct_bias: bool = True,
no_deprecation_warning: bool = False,
):
# Issue a FutureWarning unless the deprecation warning has been explicitly disabled
if not no_deprecation_warning:
warnings.warn(
"This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch"
" implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this"
" warning",
FutureWarning,
)
# Validate that the learning rate is non-negative
if lr < 0.0:
raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0")
# Validate that the beta parameters are in [0.0, 1.0)
if not 0.0 <= betas[0] < 1.0:
raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)")
if not 0.0 <= betas[1] < 1.0:
raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
# Validate that epsilon is non-negative
if not 0.0 <= eps:
raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
# Collect the default hyperparameters
defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}
# Initialize the parent Optimizer with the parameters and defaults
super().__init__(params, defaults)
@torch.no_grad()
# Perform a single optimization step
def step(self, closure: Callable = None):
"""
Performs a single optimization step.
Arguments:
closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
"""
loss = None
# Re-evaluate the model and compute the loss if a closure was provided
if closure is not None:
loss = closure()
# Iterate over parameter groups
for group in self.param_groups:
# Iterate over the parameters in the current group
for p in group["params"]:
# Skip parameters that have no gradient
if p.grad is None:
continue
grad = p.grad
# Adam does not support sparse gradients; SparseAdam should be used instead
if grad.is_sparse:
raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")
# Fetch (or lazily initialize) the state for this parameter
state = self.state[p]
# State initialization
if len(state) == 0:
state["step"] = 0
# Exponential moving average of gradient values
state["exp_avg"] = torch.zeros_like(p)
# Exponential moving average of squared gradient values
state["exp_avg_sq"] = torch.zeros_like(p)
# Retrieve exp_avg and exp_avg_sq for this parameter
exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
beta1, beta2 = group["betas"]
# Increment the step count
state["step"] += 1
# Decay the first and second moment running averages,
# updating both in place
exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
denom = exp_avg_sq.sqrt().add_(group["eps"])
step_size = group["lr"]
# Apply bias correction unless it was disabled (the original BERT TF repo uses correct_bias=False)
if group["correct_bias"]:
bias_correction1 = 1.0 - beta1 ** state["step"]
bias_correction2 = 1.0 - beta2 ** state["step"]
step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
# Update the parameter
p.addcdiv_(exp_avg, denom, value=-step_size)
# Decoupled weight decay: decay the weights directly, independent of the gradient moving averages
if group["weight_decay"] > 0.0:
p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))
# Return the loss (if computed by the closure)
return loss
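A short usage sketch of this deprecated AdamW (not part of the source); in new code `torch.optim.AdamW` is the recommended replacement:
```
import torch
from transformers.optimization import AdamW

model = torch.nn.Linear(10, 2)  # toy model, for illustration only
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01, no_deprecation_warning=True)
loss = model(torch.randn(8, 10)).sum()
loss.backward()
optimizer.step()       # Adam update followed by decoupled weight decay
optimizer.zero_grad()
```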
class Adafactor(Optimizer):
"""
AdaFactor pytorch implementation can be used as a drop in replacement for Adam original fairseq code:
https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py
Paper: *Adafactor: Adaptive Learning Rates with Sublinear Memory Cost* https://arxiv.org/abs/1804.04235 Note that
this optimizer internally adjusts the learning rate depending on the `scale_parameter`, `relative_step` and
`warmup_init` options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
`relative_step=False`.
Arguments:
params (`Iterable[nn.parameter.Parameter]`):
Iterable of parameters to optimize or dictionaries defining parameter groups.
lr (`float`, *optional*):
The external learning rate.
eps (`Tuple[float, float]`, *optional*, defaults to `(1e-30, 0.001)`):
Regularization constants for square gradient and parameter scale respectively
clip_threshold (`float`, *optional*, defaults to 1.0):
Threshold of root mean square of final gradient update
decay_rate (`float`, *optional*, defaults to -0.8):
Coefficient used to compute running averages of square
beta1 (`float`, *optional*):
Coefficient used for computing running averages of gradient
weight_decay (`float`, *optional*, defaults to 0.0):
Weight decay (L2 penalty)
scale_parameter (`bool`, *optional*, defaults to `True`):
If True, learning rate is scaled by root mean square
relative_step (`bool`, *optional*, defaults to `True`):
If True, time-dependent learning rate is computed instead of external learning rate
warmup_init (`bool`, *optional*, defaults to `False`):
Time-dependent learning rate computation depends on whether warm-up initialization is being used
This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested.
Recommended T5 finetuning settings (https://discuss.huggingface.co/t/t5-finetuning-tips/684/3):
- Training without LR warmup or clip_threshold is not recommended.
- use scheduled LR warm-up to fixed LR
- use clip_threshold=1.0 (https://arxiv.org/abs/1804.04235)
- Disable relative updates
- Use scale_parameter=False
- Additional optimizer operations like gradient clipping should not be used alongside Adafactor
Example:
```
Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3)
```
Others reported the following combination to work well:
```
Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
```
When using `lr=None` with [`Trainer`] you will most likely need to use [`~optimization.AdafactorSchedule`]
scheduler as following:
```
from transformers.optimization import Adafactor, AdafactorSchedule
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)
trainer = Trainer(..., optimizers=(optimizer, lr_scheduler))
```
Usage:
```
# replace AdamW with Adafactor
optimizer = Adafactor(
model.parameters(),
lr=1e-3,
eps=(1e-30, 1e-3),
clip_threshold=1.0,
decay_rate=-0.8,
beta1=None,
weight_decay=0.0,
relative_step=False,
scale_parameter=False,
warmup_init=False,
)
```
"""
def __init__(self, params, lr=None, eps=(1e-30, 0.001), clip_threshold=1.0, decay_rate=-0.8, beta1=None, weight_decay=0.0,
scale_parameter=True, relative_step=True, warmup_init=False):
"""
Initialize Adafactor optimizer
Args:
params (Iterable[nn.parameter.Parameter]): Iterable of parameters to optimize or dicts defining parameter groups
lr (float, optional): External learning rate (default: None)
eps (Tuple[float, float], optional): Regularization constants for square gradient and parameter scale (default: (1e-30, 0.001))
clip_threshold (float, optional): Threshold of root mean square of final gradient update (default: 1.0)
decay_rate (float, optional): Coefficient used to compute running averages of square (default: -0.8)
beta1 (float, optional): Coefficient used for computing running averages of gradient (default: None)
weight_decay (float, optional): Weight decay (L2 penalty) (default: 0.0)
scale_parameter (bool, optional): If True, learning rate is scaled by root mean square (default: True)
relative_step (bool, optional): If True, time-dependent learning rate is computed instead of external learning rate (default: True)
warmup_init (bool, optional): Time-dependent learning rate computation depends on whether warm-up initialization is being used (default: False)
"""
super(Adafactor, self).__init__(params, defaults=dict(lr=lr, eps=eps, clip_threshold=clip_threshold,
decay_rate=decay_rate, beta1=beta1,
weight_decay=weight_decay))
self.scale_parameter = scale_parameter
self.relative_step = relative_step
self.warmup_init = warmup_init
def step(self, closure=None):
"""
Performs a single optimization step.
Args:
closure (callable, optional): A closure that reevaluates the model and returns the loss.
Returns:
None
"""
lr = self.defaults['lr']
if lr is None:
raise ValueError('Learning rate is required for Adafactor optimizer')
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
state = self.state[p]
if len(state) == 0:
state['step'] = 0
state['exp_avg_sq'] = torch.zeros_like(p.data)
if self.scale_parameter:
state['exp_avg_sq_prm'] = torch.zeros_like(p.data)
exp_avg_sq = state['exp_avg_sq']
if self.scale_parameter:
exp_avg_sq_prm = state['exp_avg_sq_prm']
state['step'] += 1
bias_correction = 1 - group['decay_rate'] ** state['step']
if self.scale_parameter:
grad_sq = grad.pow(2).add_(group['eps'][0])
exp_avg_sq.mul_(group['decay_rate']).add_(grad_sq, alpha=1.0 - group['decay_rate'])
rms = exp_avg_sq_prm.mul(1 - bias_correction).sqrt().add_(group['eps'][1])
p.data.addcdiv_(grad, rms, value=-lr)
else:
grad_sq = grad.pow(2).add_(group['eps'][0])
exp_avg_sq.mul_(group['decay_rate']).add_(grad_sq, alpha=1.0 - group['decay_rate'])
rms = exp_avg_sq.sqrt().add_(group['eps'][1])
p.data.addcdiv_(grad, rms, value=-lr)
if group['clip_threshold'] > 0:
rms_clipped = rms.clamp(min=group['clip_threshold'])
p.data.div_(rms_clipped)
if group['weight_decay'] > 0:
p.data.add_(p.data, alpha=-group['weight_decay'])
return None
class AdafactorSchedule(LambdaLR):
"""
Since [`~optimization.Adafactor`] performs its own scheduling, if the training loop relies on a scheduler (e.g.
for logging), this class creates a proxy object that retrieves the current lr values from the optimizer.
It returns `initial_lr` during startup and the actual `lr` during stepping.
"""
def __init__(self, optimizer, initial_lr=0.0):
def lr_lambda(_):
return initial_lr
for group in optimizer.param_groups:
group["initial_lr"] = initial_lr
super().__init__(optimizer, lr_lambda)
for group in optimizer.param_groups:
del group["initial_lr"]
def get_lr(self):
opt = self.optimizer
lrs = [
opt._get_lr(group, opt.state[group["params"][0]])
for group in opt.param_groups
if group["params"][0].grad is not None
]
if len(lrs) == 0:
lrs = self.base_lrs
return lrs
def get_adafactor_schedule(optimizer, initial_lr=0.0):
"""
Get a proxy schedule for [`~optimization.Adafactor`]
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
initial_lr (`float`, *optional*, defaults to 0.0):
Initial lr
Return:
[`~optimization.Adafactor`] proxy schedule object.
"""
return AdafactorSchedule(optimizer, initial_lr)
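A small sketch of the proxy scheduler in action (not part of the source, and assuming the upstream `Adafactor` behaviour with `lr=None` and relative steps): before any backward pass the proxy falls back to `initial_lr`, afterwards it reads the lr Adafactor computed internally:
```
import torch
from transformers.optimization import Adafactor, get_adafactor_schedule

model = torch.nn.Linear(10, 2)  # toy model, for illustration only
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = get_adafactor_schedule(optimizer, initial_lr=1e-3)
print(lr_scheduler.get_lr())  # [0.001] - no gradients yet, so the proxy returns initial_lr
```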
.\optimization_tf.py
import re
from typing import Callable, List, Optional, Union
import tensorflow as tf
try:
from tf_keras.optimizers.legacy import Adam
except (ImportError, ModuleNotFoundError):
from tensorflow.keras.optimizers.legacy import Adam
from .modeling_tf_utils import keras
if hasattr(keras.optimizers.schedules, "learning_rate_schedule"):
schedules = keras.optimizers.schedules.learning_rate_schedule
else:
schedules = keras.optimizers.schedules
class WarmUp(schedules.LearningRateSchedule):
"""
Applies a warmup schedule on a given learning rate decay schedule.
Args:
initial_learning_rate (`float`):
The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
of the warmup).
decay_schedule_fn (`Callable`):
The schedule function to apply after the warmup for the rest of training.
warmup_steps (`int`):
The number of steps for the warmup part of training.
power (`float`, *optional*, defaults to 1.0):
The power to use for the polynomial warmup (defaults to a linear warmup).
name (`str`, *optional*):
Optional name prefix for the returned tensors during the schedule.
"""
def __init__(
self,
initial_learning_rate: float,
decay_schedule_fn: Callable,
warmup_steps: int,
power: float = 1.0,
name: str = None,
):
super().__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
self.power = power
self.decay_schedule_fn = decay_schedule_fn
self.name = name
def __call__(self, step):
with tf.name_scope(self.name or "WarmUp") as name:
global_step_float = tf.cast(step, tf.float32)
warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
warmup_percent_done = global_step_float / warmup_steps_float
warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
return tf.cond(
global_step_float < warmup_steps_float,
lambda: warmup_learning_rate,
lambda: self.decay_schedule_fn(step - self.warmup_steps),
name=name,
)
def get_config(self):
return {
"initial_learning_rate": self.initial_learning_rate,
"decay_schedule_fn": self.decay_schedule_fn,
"warmup_steps": self.warmup_steps,
"power": self.power,
"name": self.name,
}
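To see the two branches of `__call__` (illustrative values, not part of the source): below the warmup step count the warmup rate is returned, at or above it the wrapped decay schedule takes over, shifted by `warmup_steps`:
```
import tensorflow as tf
from transformers.optimization_tf import WarmUp

decay_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=3e-5, decay_steps=9_000, end_learning_rate=0.0
)
lr_schedule = WarmUp(initial_learning_rate=3e-5, decay_schedule_fn=decay_fn, warmup_steps=1_000)
print(float(lr_schedule(500)))    # 1.5e-05: halfway through the linear warmup
print(float(lr_schedule(1_000)))  # 3e-05: warmup finished, decay_fn is evaluated at step 0
```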
def create_optimizer(
init_lr: float,
num_train_steps: int,
num_warmup_steps: int,
min_lr_ratio: float = 0.0,
adam_beta1: float = 0.9,
adam_beta2: float = 0.999,
adam_epsilon: float = 1e-8,
adam_clipnorm: Optional[float] = None,
adam_global_clipnorm: Optional[float] = None,
weight_decay_rate: float = 0.0,
power: float = 1.0,
include_in_weight_decay: Optional[List[str]] = None,
):
"""
Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.
Args:
init_lr (`float`):
The desired learning rate at the end of the warmup phase.
num_train_steps (`int`):
The total number of training steps.
num_warmup_steps (`int`):
The number of warmup steps.
min_lr_ratio (`float`, *optional*, defaults to 0):
The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
adam_beta1 (`float`, *optional*, defaults to 0.9):
The beta1 to use in Adam.
adam_beta2 (`float`, *optional*, defaults to 0.999):
The beta2 to use in Adam.
adam_epsilon (`float`, *optional*, defaults to 1e-8):
The epsilon to use in Adam.
adam_clipnorm (`float`, *optional*, defaults to `None`):
If not `None`, clip the gradient norm for each weight tensor to this value.
adam_global_clipnorm (`float`, *optional*, defaults to `None`):
If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
weight tensors, as if they were concatenated into a single vector.
weight_decay_rate (`float`, *optional*, defaults to 0):
The weight decay to use.
power (`float`, *optional*, defaults to 1.0):
The power to use for PolynomialDecay.
include_in_weight_decay (`List[str]`, *optional*):
List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
applied to all parameters except bias and layer norm parameters.
"""
lr_schedule = schedules.PolynomialDecay(
initial_learning_rate=init_lr,
decay_steps=num_train_steps - num_warmup_steps,
end_learning_rate=init_lr * min_lr_ratio,
power=power,
)
if num_warmup_steps:
lr_schedule = WarmUp(
initial_learning_rate=init_lr,
decay_schedule_fn=lr_schedule,
warmup_steps=num_warmup_steps,
)
if weight_decay_rate > 0.0:
optimizer = AdamWeightDecay(
learning_rate=lr_schedule,
weight_decay_rate=weight_decay_rate,
beta_1=adam_beta1,
beta_2=adam_beta2,
epsilon=adam_epsilon,
clipnorm=adam_clipnorm,
global_clipnorm=adam_global_clipnorm,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
include_in_weight_decay=include_in_weight_decay,
)
else:
optimizer = keras.optimizers.Adam(
learning_rate=lr_schedule,
beta_1=adam_beta1,
beta_2=adam_beta2,
epsilon=adam_epsilon,
clipnorm=adam_clipnorm,
global_clipnorm=adam_global_clipnorm,
)
return optimizer, lr_schedule
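A typical call (illustrative hyperparameters) whose result can be passed straight to `model.compile`:
```
from transformers import create_optimizer

optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=10_000,
    num_warmup_steps=1_000,
    weight_decay_rate=0.01,  # > 0, so an AdamWeightDecay instance is returned
)
# model.compile(optimizer=optimizer)  # assuming a built Keras/TF model
```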
class AdamWeightDecay(Adam):
"""
Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
with the m and v parameters in strange ways as shown in [Decoupled Weight Decay
Regularization](https://arxiv.org/abs/1711.05101).
Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
to adding the square of the weights to the loss with plain (non-momentum) SGD.
Args:
learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001):
The learning rate to use or a schedule.
beta_1 (`float`, *optional*, defaults to 0.9):
The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
beta_2 (`float`, *optional*, defaults to 0.999):
The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
epsilon (`float`, *optional*, defaults to 1e-07):
The epsilon parameter in Adam, which is a small constant for numerical stability.
amsgrad (`bool`, *optional*, defaults to `False`):
Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
Beyond](https://arxiv.org/abs/1904.09237).
weight_decay_rate (`float`, *optional*, defaults to 0.0):
The weight decay to apply.
include_in_weight_decay (`List[str]`, *optional*):
List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
exclude_from_weight_decay (`List[str]`, *optional*):
List of the parameter names (or re patterns) to exclude from applying weight decay to. If a
`include_in_weight_decay` is passed, the names in it will supersede this list.
name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
Optional name for the operations created when applying gradients.
kwargs (`Dict[str, Any]`, *optional*):
Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by
norm; `clipvalue` is clip gradients by value, `decay` is included for backward compatibility to allow time
inverse decay of learning rate. `lr` is included for backward compatibility, recommended to use
`learning_rate` instead.
"""
def __init__(
self,
learning_rate: Union[float, schedules.LearningRateSchedule] = 0.001,
beta_1: float = 0.9,
beta_2: float = 0.999,
epsilon: float = 1e-7,
amsgrad: bool = False,
weight_decay_rate: float = 0.0,
include_in_weight_decay: Optional[List[str]] = None,
exclude_from_weight_decay: Optional[List[str]] = None,
name: str = "AdamWeightDecay",
**kwargs,
):
super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
@classmethod
def from_config(cls, config):
"""从配置中创建优化器,并添加WarmUp自定义对象。"""
custom_objects = {"WarmUp": WarmUp}
return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects)
def _prepare_local(self, var_device, var_dtype, apply_state):
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state)
apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant(
self.weight_decay_rate, name="adam_weight_decay_rate"
)
def _decay_weights_op(self, var, learning_rate, apply_state):
do_decay = self._do_use_weight_decay(var.name)
if do_decay:
return var.assign_sub(
learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"],
use_locking=self._use_locking,
)
return tf.no_op()
def apply_gradients(self, grads_and_vars, name=None, **kwargs):
grads, tvars = list(zip(*grads_and_vars))
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name, **kwargs)
def _get_lr(self, var_device, var_dtype, apply_state):
"""从状态中获取给定变量的学习率。"""
if apply_state is None:
return self._decayed_lr_t[var_dtype], {}
apply_state = apply_state or {}
coefficients = apply_state.get((var_device, var_dtype))
if coefficients is None:
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients["lr_t"], {"apply_state": apply_state}
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs)
def get_config(self):
config = super().get_config()
config.update({"weight_decay_rate": self.weight_decay_rate})
return config
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if self.weight_decay_rate == 0:
return False
if self._include_in_weight_decay:
for r in self._include_in_weight_decay:
if re.search(r, param_name) is not None:
return True
if self._exclude_from_weight_decay:
for r in self._exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
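The include/exclude patterns are plain `re.search` calls against the variable name; an illustrative check (not part of the source, variable names are made up):
```
from transformers.optimization_tf import AdamWeightDecay

opt = AdamWeightDecay(
    learning_rate=2e-5,
    weight_decay_rate=0.01,
    exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
)
print(opt._do_use_weight_decay("bert/encoder/layer_0/attention/self/query/kernel:0"))       # True
print(opt._do_use_weight_decay("bert/encoder/layer_0/attention/output/LayerNorm/gamma:0"))  # False
```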
class GradientAccumulator:
"""
Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
"""
def __init__(self):
"""初始化累加器。"""
self._gradients = []
self._accum_steps = None
@property
def step(self):
"""累积步数的属性。"""
if self._accum_steps is None:
self._accum_steps = tf.Variable(
tf.constant(0, dtype=tf.int64),
trainable=False,
synchronization=tf.VariableSynchronization.ON_READ,
aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
)
return self._accum_steps.value()
@property
def gradients(self):
"""当前副本上累积的梯度列表。"""
if not self._gradients:
raise ValueError("需要先调用累加器以初始化梯度")
return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]
def __call__(self, gradients):
"""在当前副本上累积 `gradients`。"""
if not self._gradients:
_ = self.step
self._gradients.extend(
[
tf.Variable(
tf.zeros_like(gradient),
trainable=False,
synchronization=tf.VariableSynchronization.ON_READ,
aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
)
if gradient is not None
else gradient
for gradient in gradients
]
)
if len(gradients) != len(self._gradients):
raise ValueError(f"期望 {len(self._gradients)} 个梯度,但实际得到 {len(gradients)} 个")
for accum_gradient, gradient in zip(self._gradients, gradients):
if accum_gradient is not None and gradient is not None:
accum_gradient.assign_add(gradient)
self._accum_steps.assign_add(1)
def reset(self):
"""重置当前副本上累积的梯度。"""
if not self._gradients:
return
self._accum_steps.assign(0)
for gradient in self._gradients:
if gradient is not None:
gradient.assign(tf.zeros_like(gradient))
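A minimal custom-loop sketch (not part of the source) showing the intended call pattern: accumulate per micro-batch, then scale, apply once, and reset; the toy model and batch sizes are placeholders:
```
import tensorflow as tf
from transformers.optimization_tf import GradientAccumulator

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(8,))])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
accumulator = GradientAccumulator()

for _ in range(4):  # accumulate over 4 micro-batches
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(model(tf.random.normal((2, 8))) ** 2)
    accumulator(tape.gradient(loss, model.trainable_variables))

# Average the accumulated gradients, apply them once, then reset for the next cycle
grads = [g / tf.cast(accumulator.step, g.dtype) for g in accumulator.gradients]
optimizer.apply_gradients(zip(grads, model.trainable_variables))
accumulator.reset()
```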