Transformers Source Code Analysis (Part 37)
.\models\deprecated\transfo_xl\convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
"""转换 Transformer XL 检查点和数据集。"""
import argparse
import os
import pickle
import sys
import torch
from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl
from transformers.models.deprecated.transfo_xl import tokenization_transfo_xl as data_utils
from transformers.models.deprecated.transfo_xl.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES
from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
logging.set_verbosity_info()
data_utils.Vocab = data_utils.TransfoXLTokenizer
data_utils.Corpus = data_utils.TransfoXLCorpus
sys.modules["data_utils"] = data_utils
sys.modules["vocabulary"] = data_utils
def convert_transfo_xl_checkpoint_to_pytorch(
tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file
):
if transfo_xl_dataset_file:
with open(transfo_xl_dataset_file, "rb") as fp:
corpus = pickle.load(fp, encoding="latin1")
pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"]
print(f"Save vocabulary to {pytorch_vocab_dump_path}")
corpus_vocab_dict = corpus.vocab.__dict__
torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
corpus_dict_no_vocab = corpus.__dict__
corpus_dict_no_vocab.pop("vocab", None)
pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME
print(f"Save dataset to {pytorch_dataset_dump_path}")
torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
if tf_checkpoint_path:
config_path = os.path.abspath(transfo_xl_config_file)
tf_path = os.path.abspath(tf_checkpoint_path)
print(f"Converting Transformer XL checkpoint from {tf_path} with config at {config_path}.")
if transfo_xl_config_file == "":
config = TransfoXLConfig()
else:
config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
print(f"Building PyTorch model from configuration: {config}")
model = TransfoXLLMHeadModel(config)
model = load_tf_weights_in_transfo_xl(model, config, tf_path)
pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}")
torch.save(model.state_dict(), pytorch_weights_dump_path)
print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}")
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
required=True,
help="Path to the folder to store the PyTorch model or dataset/vocab.",
)
parser.add_argument(
"--tf_checkpoint_path",
default="",
type=str,
help="An optional path to a TensorFlow checkpoint path to be converted.",
)
parser.add_argument(
"--transfo_xl_config_file",
default="",
type=str,
help=(
"An optional config json file corresponding to the pre-trained BERT model. \n"
"This specifies the model architecture."
),
)
parser.add_argument(
"--transfo_xl_dataset_file",
default="",
type=str,
help="An optional dataset file to be converted in a vocabulary.",
)
args = parser.parse_args()
convert_transfo_xl_checkpoint_to_pytorch(
args.tf_checkpoint_path,
args.transfo_xl_config_file,
args.pytorch_dump_folder_path,
args.transfo_xl_dataset_file,
)
.\models\deprecated\transfo_xl\modeling_tf_transfo_xl.py
"""
TF 2.0 Transformer XL model.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ....modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ....tf_utils import shape_list, stable_softmax
from ....utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_transfo_xl import TransfoXLConfig
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
_CONFIG_FOR_DOC = "TransfoXLConfig"
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl/transfo-xl-wt103",
]
class TFPositionalEmbedding(keras.layers.Layer):
def __init__(self, demb, **kwargs):
super().__init__(**kwargs)
self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
def call(self, pos_seq, bsz=None):
self.inv_freq = tf.cast(self.inv_freq, dtype=pos_seq.dtype)
sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq)
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
if bsz is not None:
return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
else:
return pos_emb[:, None, :]
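# A quick standalone check (not part of the library source) of the sinusoidal table built by
# TFPositionalEmbedding above, using toy sizes demb=8, klen=5, bsz=2:
import tensorflow as tf

demb, klen, bsz = 8, 5, 2
inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))               # [demb // 2]
pos_seq = tf.range(float(klen - 1), -1.0, -1.0)                         # descending positions, as the main layer passes them
sinusoid_inp = tf.einsum("i,j->ij", pos_seq, inv_freq)                  # [klen, demb // 2]
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)   # [klen, demb]
print(tf.tile(pos_emb[:, None, :], [1, bsz, 1]).shape)                  # (5, 2, 8) -> [klen, bsz, demb]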
class TFPositionwiseFF(keras.layers.Layer):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
super().__init__(**kwargs)
self.d_model = d_model
self.d_inner = d_inner
self.dropout = dropout
self.layer_1 = keras.layers.Dense(
d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0"
)
self.drop_1 = keras.layers.Dropout(dropout)
self.layer_2 = keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3")
self.drop_2 = keras.layers.Dropout(dropout)
self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
self.pre_lnorm = pre_lnorm
def call(self, inp, training=False):
if self.pre_lnorm:
core_out = self.layer_norm(inp)
core_out = self.layer_1(core_out)
core_out = self.drop_1(core_out, training=training)
core_out = self.layer_2(core_out)
core_out = self.drop_2(core_out, training=training)
output = core_out + inp
else:
core_out = self.layer_1(inp)
core_out = self.drop_1(core_out, training=training)
core_out = self.layer_2(core_out)
core_out = self.drop_2(core_out, training=training)
output = self.layer_norm(inp + core_out)
return output
class TFRelPartialLearnableMultiHeadAttn(keras.layers.Layer):
def __init__(
self,
n_head,
d_model,
d_head,
d_inner,
dropout,
dropatt=0.0,
pre_lnorm=False,
r_w_bias=None,
r_r_bias=None,
layer_norm_epsilon=1e-5,
init_std=0.02,
output_attentions=False,
**kwargs,
):
super().__init__(**kwargs)
self.n_head = n_head
self.d_model = d_model
self.d_head = d_head
self.dropout = dropout
self.output_attentions = output_attentions
self.qkv_net = keras.layers.Dense(
3 * n_head * d_head,
kernel_initializer=get_initializer(init_std),
use_bias=False,
name="qkv_net"
)
self.drop = keras.layers.Dropout(dropout)
self.dropatt = keras.layers.Dropout(dropatt)
self.o_net = keras.layers.Dense(
d_model,
kernel_initializer=get_initializer(init_std),
use_bias=False,
name="o_net"
)
self.layer_norm = keras.layers.LayerNormalization(
epsilon=layer_norm_epsilon,
name="layer_norm"
)
self.scale = 1 / (d_head**0.5)
self.pre_lnorm = pre_lnorm
if r_r_bias is not None and r_w_bias is not None:
self.r_r_bias = r_r_bias
self.r_w_bias = r_w_bias
else:
self.r_r_bias = None
self.r_w_bias = None
self.r_net = keras.layers.Dense(
self.n_head * self.d_head,
kernel_initializer=get_initializer(init_std),
use_bias=False,
name="r_net"
)
def build(self, input_shape):
if self.r_r_bias is None or self.r_w_bias is None:
self.r_r_bias = self.add_weight(
shape=(self.n_head, self.d_head),
initializer="zeros",
trainable=True,
name="r_r_bias"
)
self.r_w_bias = self.add_weight(
shape=(self.n_head, self.d_head),
initializer="zeros",
trainable=True,
name="r_w_bias"
)
super().build(input_shape)
def _rel_shift(self, x):
x_size = shape_list(x)
x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
x = tf.reshape(x, x_size)
return x
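# A toy numeric check (not library code) of the pad/reshape/slice trick implemented by `_rel_shift` above,
# using the [qlen, klen, bsz, n_head] layout of the TF attention-score tensor:
import tensorflow as tf

qlen, klen, bsz, n_head = 2, 3, 1, 1
x = tf.reshape(tf.range(qlen * klen, dtype=tf.float32), [qlen, klen, bsz, n_head])

x_size = [qlen, klen, bsz, n_head]
shifted = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])          # prepend a zero column on the key axis
shifted = tf.reshape(shifted, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
shifted = tf.slice(shifted, [1, 0, 0, 0], [-1, -1, -1, -1])    # drop the first row
shifted = tf.reshape(shifted, x_size)

print(tf.squeeze(x).numpy())        # [[0. 1. 2.] [3. 4. 5.]]
print(tf.squeeze(shifted).numpy())  # [[1. 2. 0.] [3. 4. 5.]] -> row i ends up shifted left by (qlen - 1 - i)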
class TFRelPartialLearnableDecoderLayer(keras.layers.Layer):
    def __init__(
        self,
        n_head,
        d_model,
        d_head,
        d_inner,
        dropout,
        dropatt=0.0,
        pre_lnorm=False,
        r_w_bias=None,
        r_r_bias=None,
        layer_norm_epsilon=1e-5,
        init_std=0.02,
        output_attentions=False,
        **kwargs,
    ):
super().__init__(**kwargs)
self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
n_head,
d_model,
d_head,
dropout,
dropatt=dropatt,
pre_lnorm=pre_lnorm,
r_w_bias=r_w_bias,
r_r_bias=r_r_bias,
init_std=init_std,
layer_norm_epsilon=layer_norm_epsilon,
output_attentions=output_attentions,
name="dec_attn",
)
self.pos_ff = TFPositionwiseFF(
d_model,
d_inner,
dropout,
pre_lnorm=pre_lnorm,
init_std=init_std,
layer_norm_epsilon=layer_norm_epsilon,
name="pos_ff",
)
def call(self, dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=False):
attn_outputs = self.dec_attn(dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=training)
ff_output = self.pos_ff(attn_outputs[0], training=training)
outputs = [ff_output] + attn_outputs[1:]
return outputs
class TFTransfoEmbeddings(keras.layers.Layer):
def __init__(self, vocab_size, emb_size, init_std, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.emb_size = emb_size
self.init_std = init_std
def build(self, input_shape):
self.weight = self.add_weight(
shape=(self.vocab_size, self.emb_size),
initializer=get_initializer(self.init_std),
name="embeddings",
)
super().build(input_shape)
def call(self, inputs):
return tf.gather(self.weight, inputs)
class TFAdaptiveEmbedding(keras.layers.Layer):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
super().__init__(**kwargs)
self.n_token = n_token
self.d_embed = d_embed
self.init_std = init_std
self.cutoffs = cutoffs + [n_token]
self.div_val = div_val
self.d_proj = d_proj
self.emb_scale = d_proj**0.5
self.cutoff_ends = [0] + self.cutoffs
self.emb_layers = []
self.emb_projs = []
if div_val == 1:
raise NotImplementedError
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
d_emb_i = d_embed // (div_val**i)
self.emb_layers.append(
TFTransfoEmbeddings(
r_idx - l_idx,
d_emb_i,
init_std,
name=f"emb_layers_._{i}",
)
)
def build(self, input_shape):
for i in range(len(self.cutoffs)):
d_emb_i = self.d_embed // (self.div_val**i)
self.emb_projs.append(
self.add_weight(
shape=(d_emb_i, self.d_proj),
initializer=get_initializer(self.init_std),
trainable=True,
name=f"emb_projs_._{i}",
)
)
super().build(input_shape)
def call(self, inp):
if self.div_val == 1:
raise NotImplementedError
else:
inp_flat = tf.reshape(inp, (-1,))
emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj])
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx
emb_i = self.emb_layers[i](inp_i)
emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i])
mask_idx = tf.where(mask_i)
scatter = tf.scatter_nd(mask_idx, emb_i, shape_list(emb_flat))
emb_flat = tf.cast(emb_flat, dtype=scatter.dtype)
emb_flat += scatter
embed_shape = shape_list(inp) + [self.d_proj]
embed = tf.reshape(emb_flat, embed_shape)
embed *= self.emb_scale
return embed
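# A small illustration (toy numbers) of how `cutoffs` and `div_val` above split the vocabulary into
# per-bucket embedding tables of shrinking width; `TransfoXLConfig` defaults to
# cutoffs=[20000, 40000, 200000] with div_val=4:
n_token, d_embed, div_val = 12, 16, 2
cutoffs = [4, 8]                                   # toy cutoffs
cutoff_ends = [0] + cutoffs + [n_token]            # [0, 4, 8, 12]
for i in range(len(cutoff_ends) - 1):
    l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
    d_emb_i = d_embed // (div_val**i)
    print(f"bucket {i}: token ids [{l_idx}, {r_idx}) -> embedding table of shape ({r_idx - l_idx}, {d_emb_i})")
# bucket 0: token ids [0, 4)  -> embedding table of shape (4, 16)
# bucket 1: token ids [4, 8)  -> embedding table of shape (4, 8)
# bucket 2: token ids [8, 12) -> embedding table of shape (4, 4)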
@keras_serializable
class TFTransfoXLMainLayer(keras.layers.Layer):
config_class = TransfoXLConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
self.n_token = config.vocab_size
self.d_embed = config.d_embed
self.d_model = config.d_model
self.n_head = config.n_head
self.d_head = config.d_head
self.untie_r = config.untie_r
self.word_emb = TFAdaptiveEmbedding(
config.vocab_size,
config.d_embed,
config.d_model,
config.cutoffs,
div_val=config.div_val,
init_std=config.init_std,
name="word_emb",
)
self.drop = keras.layers.Dropout(config.dropout)
self.n_layer = config.n_layer
self.mem_len = config.mem_len
self.attn_type = config.attn_type
self.layers = []
if config.attn_type == 0:
for i in range(config.n_layer):
self.layers.append(
TFRelPartialLearnableDecoderLayer(
config.n_head,
config.d_model,
config.d_head,
config.d_inner,
config.dropout,
dropatt=config.dropatt,
pre_lnorm=config.pre_lnorm,
r_w_bias=None if self.untie_r else self.r_w_bias,
r_r_bias=None if self.untie_r else self.r_r_bias,
layer_norm_epsilon=config.layer_norm_epsilon,
init_std=config.init_std,
output_attentions=self.output_attentions,
name=f"layers_._{i}",
)
)
else:
raise NotImplementedError
self.same_length = config.same_length
self.clamp_len = config.clamp_len
if self.attn_type == 0:
self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb")
else:
raise NotImplementedError
def build(self, input_shape):
if not self.untie_r:
self.r_w_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
)
self.r_r_bias = self.add_weight(
shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
)
super().build(input_shape)
def get_input_embeddings(self):
return self.word_emb
def set_input_embeddings(self, value):
raise NotImplementedError
def backward_compatible(self):
self.sample_softmax = -1
def reset_memory_length(self, mem_len):
self.mem_len = mem_len
def _prune_heads(self, heads):
raise NotImplementedError
def init_mems(self, bsz):
if self.mem_len > 0:
mems = []
for i in range(self.n_layer):
empty = tf.zeros([self.mem_len, bsz, self.d_model])
mems.append(empty)
return mems
else:
return None
def _update_mems(self, hids, mems, mlen, qlen):
if mems is None:
return None
assert len(hids) == len(mems), "len(hids) != len(mems)"
new_mems = []
end_idx = mlen + tf.math.maximum(0, qlen)
beg_idx = tf.math.maximum(0, end_idx - tf.convert_to_tensor(self.mem_len))
for i in range(len(hids)):
mems[i] = tf.cast(mems[i], dtype=hids[i].dtype)
cat = tf.concat([mems[i], hids[i]], axis=0)
tf.stop_gradient(cat)
new_mems.append(cat[beg_idx:end_idx])
return new_mems
def call(
self,
input_ids: TFModelInputType | None = None,
mems: List[tf.Tensor] | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
        training: bool = False,
    ):
        ...
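# A standalone sketch (toy sizes, not library code) of the sliding-window update performed by `init_mems`
# and `_update_mems` above: the current segment's hidden states are appended to the cached states and only
# the most recent `mem_len` positions are kept:
import tensorflow as tf

mem_len, qlen, bsz, d_model = 3, 2, 1, 4
mems = tf.zeros([mem_len, bsz, d_model])             # what init_mems returns for one layer
hids = tf.ones([qlen, bsz, d_model])                 # hidden states of the current segment

cat = tf.concat([mems, hids], axis=0)                # [mem_len + qlen, bsz, d_model]
end_idx = mem_len + tf.math.maximum(0, qlen)
beg_idx = tf.math.maximum(0, end_idx - tf.convert_to_tensor(mem_len))
new_mems = cat[beg_idx:end_idx]                      # keep only the most recent mem_len positions
print(new_mems.shape)                                # (3, 1, 4)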
class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = TransfoXLConfig
base_model_prefix = "transformer"
@dataclass
class TFTransfoXLModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFTransfoXLLMHeadModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
"""
"""
Args:
losses (`tf.Tensor` of shape *(batch_size, sequence_length-1)*, *optional*, returned when `labels` is provided):
Language modeling losses (not reduced).
语言建模的损失(未归并),形状为 *(batch_size, sequence_length-1)*,在提供 `labels` 时返回。
prediction_scores (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
语言建模头部的预测分数,形状为 `(batch_size, sequence_length, config.vocab_size)`,
表示每个词汇标记的预测分数(经过 SoftMax 后的分数)。
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
包含预先计算的隐藏状态(注意力块中的键和值)。长度为 `config.n_layers` 的列表,
可用于加速序列解码。已经计算过的 token id 不应该作为输入传递给模型。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
模型在每一层输出的隐藏状态,以及初始嵌入输出的元组。当设置 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均。当设置 `output_attentions=True` 或 `config.output_attentions=True` 时返回。
"""
prediction_scores: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
@dataclass
class TFTransfoXLSequenceClassifierOutputWithPast(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: tf.Tensor | None = None
logits: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
TRANSFO_XL_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:
    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
      `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
      `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!
    </Tip>
    Parameters:
        config ([`TransfoXLConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TRANSFO_XL_INPUTS_DOCSTRING = r"""
Args:
input_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
[`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
mems (`List[tf.Tensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
`mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
given to this model should not be passed as `input_ids` as they have already been computed.
head_mask (`tf.Tensor` or `Numpy array` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
config will be used instead.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@add_start_docstrings(
"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
"""
TFTransfoXLModel 类的定义,继承自 TFTransfoXLPreTrainedModel 类。
使用 @add_start_docstrings 装饰器添加了类的文档字符串,说明此类是一个不带顶层头的原始 TransfoXL 模型。
"""
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@unpack_inputs
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTransfoXLModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
mems: List[tf.Tensor] | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
training: bool = False,
) -> TFTransfoXLModelOutput | Tuple[tf.Tensor]:
outputs = self.transformer(
input_ids=input_ids,
mems=mems,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
"""
Transformer-XL 模型,在顶部有一个语言建模头部(自适应 softmax,其权重与自适应输入嵌入层相结合)。
"""
@add_start_docstrings(
"""
Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
input embeddings)
""",
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
self.sample_softmax = config.sample_softmax
assert self.sample_softmax <= 0, (
"Sampling from the softmax is not implemented yet. Please look at issue: #3310:"
" https://github.com/huggingface/transformers/issues/3310"
)
self.crit = TFAdaptiveSoftmaxMask(
config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit"
)
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError()
def get_output_embeddings(self):
"""Double-check if you are using adaptive softmax."""
if len(self.crit.out_layers) > 0:
return self.crit.out_layers[-1]
return None
def reset_memory_length(self, mem_len):
self.transformer.reset_memory_length(mem_len)
def init_mems(self, bsz):
return self.transformer.init_mems(bsz)
@unpack_inputs
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTransfoXLLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
mems: List[tf.Tensor] | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
labels: np.ndarray | tf.Tensor | None = None,
training: bool = False,
    ) -> TFTransfoXLLMHeadModelOutput | Tuple[tf.Tensor]:
if input_ids is not None:
bsz, tgt_len = shape_list(input_ids)[:2]
else:
bsz, tgt_len = shape_list(inputs_embeds)[:2]
transformer_outputs = self.transformer(
input_ids,
mems,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict,
training=training,
)
last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:]
softmax_output = self.crit(pred_hid, labels, training=training)
prediction_scores = softmax_output if labels is None else ()
if not return_dict:
return (prediction_scores,) + transformer_outputs[1:]
return TFTransfoXLLMHeadModelOutput(
prediction_scores=prediction_scores,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs):
inputs = {}
if past_key_values:
input_ids = tf.expand_dims(input_ids[:, -1], axis=-1)
else:
input_ids = input_ids
return inputs
def tf_to_pt_weight_rename(self, tf_weight):
if self.config.tie_word_embeddings and "crit.out_layers" in tf_weight:
return tf_weight, tf_weight.replace("crit.out_layers", "transformer.word_emb.emb_layers")
elif self.config.tie_projs and "crit.out_projs" in tf_weight:
for i, tie_proj in enumerate(self.config.tie_projs):
if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
return tf_weight, tf_weight.replace(f"crit.out_projs.{i}", "transformer.word_emb.emb_projs.0")
elif tie_proj and self.config.div_val != 1:
return tf_weight, tf_weight.replace("crit.out_projs", "transformer.word_emb.emb_projs")
else:
return (tf_weight,)
"""
The Transfo XL Model transformer with a sequence classification head on top (linear layer).
[`TFTransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-1,GPT-2) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
"""
@add_start_docstrings(
"""
Decorator to add docstrings to the model's constructor (__init__ method) for `TFTransfoXLForSequenceClassification`.
Args:
config (:class:`~transformers.TransfoXLConfig`):
The configuration class to instantiate the model with.
This initializes the sequence classifier by setting the number of labels and creating a Dense layer for scoring,
and instantiates the main Transformer layer (`TFTransfoXLMainLayer`).
This model supports sequence classification tasks based on the transformer's last token.
""",
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.score = keras.layers.Dense(
config.num_labels,
kernel_initializer=get_initializer(config.init_range),
name="score",
use_bias=False,
)
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
def get_output_embeddings(self):
"""
Retrieve the output embeddings from the transformer's word embeddings.
This method warns that sequence classification models do not have output embeddings and that
`.get_output_embeddings` will be removed in future versions of transformers.
"""
logger.warning(
"Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed "
"in transformers v4.32."
)
return self.transformer.word_emb
@unpack_inputs
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFTransfoXLSequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
mems: List[tf.Tensor] | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
    ) -> Union[Tuple, TFTransfoXLSequenceClassifierOutputWithPast]:
        r"""
        Perform the forward pass of the TFTransfoXLForSequenceClassification model: inputs are processed by the
        Transformer-XL backbone and the hidden state of the last (non-padding) token is scored for classification.

        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
            config.vocab_size - 1]`.
        """
transformer_outputs = self.transformer(
input_ids=input_ids,
mems=mems,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = transformer_outputs[0]
logits = self.score(hidden_states)
in_logits = None
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
if input_ids is not None:
sequence_lengths = (
tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
- 1
)
sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
else:
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
loss = None
if labels is not None:
if input_ids is not None:
batch_size, sequence_length = shape_list(input_ids)[:2]
else:
batch_size, sequence_length = shape_list(inputs_embeds)[:2]
assert (
self.config.pad_token_id is not None or batch_size == 1
), "Cannot handle batch sizes > 1 if no padding token is defined."
if not tf.is_tensor(sequence_lengths):
in_logits = logits[0:batch_size, sequence_lengths]
loss = self.hf_compute_loss(tf.reshape(labels, [-1, 1]), tf.reshape(in_logits, [-1, self.num_labels]))
pooled_logits = in_logits if in_logits is not None else logits
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTransfoXLSequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
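The `sequence_lengths` trick in the classification head above deserves a quick standalone check (toy tensors, assuming `pad_token_id=0`): `tf.argmax` over the "is this a padding token?" mask returns the first pad position in each row, and stepping back one lands on the last real token, which `tf.gather(..., batch_dims=1, axis=1)` then picks out of `logits`.

import tensorflow as tf

pad_token_id = 0
input_ids = tf.constant([[5, 6, 7, 0, 0],
                         [8, 9, 0, 0, 0]])
# first padding position per row, then step back one to land on the last real token
first_pad = tf.argmax(tf.cast(tf.math.equal(input_ids, pad_token_id), tf.int32), axis=-1)
last_token = first_pad - 1
print(last_token.numpy())   # [2 1]

A row with no padding at all makes the `argmax` return 0 and the index -1, which is why the model falls back to the last position via `tf.where` in that case.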
.\models\deprecated\transfo_xl\modeling_tf_transfo_xl_utilities.py
"""
A TF 2.0 Adaptive Softmax for Transformer XL model.
"""
import tensorflow as tf
from ....modeling_tf_utils import keras
from ....tf_utils import shape_list
class TFAdaptiveSoftmaxMask(keras.layers.Layer):
def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.d_embed = d_embed
self.d_proj = d_proj
self.cutoffs = cutoffs + [vocab_size]
self.cutoff_ends = [0] + self.cutoffs
self.div_val = div_val
self.shortlist_size = self.cutoffs[0]
self.n_clusters = len(self.cutoffs) - 1
self.head_size = self.shortlist_size + self.n_clusters
self.keep_order = keep_order
self.out_layers = []
self.out_projs = []
def build(self, input_shape):
if self.n_clusters > 0:
self.cluster_weight = self.add_weight(
shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight"
)
self.cluster_bias = self.add_weight(
shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias"
)
if self.div_val == 1:
for i in range(len(self.cutoffs)):
if self.d_proj != self.d_embed:
weight = self.add_weight(
shape=(self.d_embed, self.d_proj),
initializer="zeros",
trainable=True,
name=f"out_projs_._{i}",
)
self.out_projs.append(weight)
else:
self.out_projs.append(None)
weight = self.add_weight(
shape=(self.vocab_size, self.d_embed),
initializer="zeros",
trainable=True,
name=f"out_layers_._{i}_._weight",
)
bias = self.add_weight(
shape=(self.vocab_size,),
initializer="zeros",
trainable=True,
name=f"out_layers_._{i}_._bias",
)
self.out_layers.append((weight, bias))
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
d_emb_i = self.d_embed // (self.div_val**i)
weight = self.add_weight(
shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name=f"out_projs_._{i}"
)
self.out_projs.append(weight)
weight = self.add_weight(
shape=(r_idx - l_idx, d_emb_i),
initializer="zeros",
trainable=True,
name=f"out_layers_._{i}_._weight",
)
bias = self.add_weight(
shape=(r_idx - l_idx,),
initializer="zeros",
trainable=True,
name=f"out_layers_._{i}_._bias",
)
self.out_layers.append((weight, bias))
super().build(input_shape)
@staticmethod
def _logit(x, W, b, proj=None):
y = x
if proj is not None:
y = tf.einsum("ibd,ed->ibe", y, proj)
return tf.einsum("ibd,nd->ibn", y, W) + b
@staticmethod
def _gather_logprob(logprob, target):
lp_size = shape_list(logprob)
r = tf.range(lp_size[0], dtype=target.dtype)
idx = tf.stack([r, target], 1)
return tf.gather_nd(logprob, idx)
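As a quick illustration of `_gather_logprob` (toy values, not library code): it builds `[row, target]` index pairs and uses `tf.gather_nd` to pick exactly one log-probability per row.

import tensorflow as tf

logprob = tf.math.log(tf.constant([[0.7, 0.2, 0.1],
                                   [0.1, 0.6, 0.3]]))
target = tf.constant([0, 2])
r = tf.range(tf.shape(logprob)[0], dtype=target.dtype)
idx = tf.stack([r, target], 1)               # [[0, 0], [1, 2]]
print(tf.gather_nd(logprob, idx).numpy())    # [log(0.7), log(0.3)], one entry per row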
.\models\deprecated\transfo_xl\modeling_transfo_xl.py
"""
PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular
https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
"""
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ....modeling_utils import PreTrainedModel
from ....utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_transfo_xl import TransfoXLConfig
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
_CONFIG_FOR_DOC = "TransfoXLConfig"
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl/transfo-xl-wt103",
]
def build_tf_to_pytorch_map(model, config):
"""
A map of modules from TF to PyTorch. This time I use a map to keep the PyTorch model as identical to the original
PyTorch model as possible.
"""
tf_to_pt_map = {}
if hasattr(model, "transformer"):
tf_to_pt_map.update(
{
"transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
"transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias,
}
)
for i, (out_l, proj_l, tie_proj) in enumerate(
zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs)
):
layer_str = f"transformer/adaptive_softmax/cutoff_{i}/"
if config.tie_word_embeddings:
tf_to_pt_map.update({layer_str + "b": out_l.bias})
else:
raise NotImplementedError
tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias})
if not tie_proj:
tf_to_pt_map.update({layer_str + "proj": proj_l})
model = model.transformer
for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
layer_str = f"transformer/adaptive_embed/cutoff_{i}/"
tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l})
for i, b in enumerate(model.layers):
layer_str = f"transformer/layer_{i}/"
tf_to_pt_map.update(
{
layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
}
)
if config.untie_r:
r_r_list = []
r_w_list = []
for b in model.layers:
r_r_list.append(b.dec_attn.r_r_bias)
r_w_list.append(b.dec_attn.r_w_bias)
else:
r_r_list = [model.r_r_bias]
r_w_list = [model.r_w_bias]
tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list})
return tf_to_pt_map
def load_tf_weights_in_transfo_xl(model, config, tf_path):
"""Load tf checkpoints in a pytorch model"""
try:
import numpy as np
import tensorflow as tf
except ImportError:
logger.error(
"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_to_pt_map = build_tf_to_pytorch_map(model, config)
init_vars = tf.train.list_variables(tf_path)
tf_weights = {}
for name, shape in init_vars:
logger.info(f"Loading TF weight {name} with shape {shape}")
array = tf.train.load_variable(tf_path, name)
tf_weights[name] = array
for name, pointer in tf_to_pt_map.items():
assert name in tf_weights
array = tf_weights[name]
if "kernel" in name or "proj" in name:
array = np.transpose(array)
if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1:
assert len(pointer) == array.shape[0]
for i, p_i in enumerate(pointer):
arr_i = array[i, ...]
try:
assert p_i.shape == arr_i.shape
except AssertionError as e:
e.args += (p_i.shape, arr_i.shape)
raise
logger.info(f"Initialize PyTorch weight {name} for layer {i}")
p_i.data = torch.from_numpy(arr_i)
else:
try:
assert pointer.shape == array.shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
tf_weights.pop(name, None)
tf_weights.pop(name + "/Adam", None)
tf_weights.pop(name + "/Adam_1", None)
logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
return model
class PositionalEmbedding(nn.Module):
def __init__(self, demb):
super().__init__()
self.demb = demb
inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
self.register_buffer("inv_freq", inv_freq)
def forward(self, pos_seq, bsz=None):
sinusoid_inp = torch.outer(pos_seq, self.inv_freq)
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
if bsz is not None:
return pos_emb[:, None, :].expand(-1, bsz, -1)
else:
return pos_emb[:, None, :]
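# A quick standalone check (toy sizes, not library code) of the output shape produced above:
import torch

demb, klen, bsz = 8, 5, 2
inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))       # [demb // 2]
pos_seq = torch.arange(klen - 1, -1.0, -1.0)                          # descending positions
sinusoid_inp = torch.outer(pos_seq, inv_freq)                         # [klen, demb // 2]
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
print(pos_emb[:, None, :].expand(-1, bsz, -1).shape)                  # torch.Size([5, 2, 8])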
class RelPartialLearnableMultiHeadAttn(nn.Module):
def __init__(
self,
n_head,
d_model,
d_head,
dropout,
dropatt=0,
pre_lnorm=False,
r_r_bias=None,
r_w_bias=None,
layer_norm_epsilon=1e-5,
):
super().__init__()
self.n_head = n_head
self.d_model = d_model
self.d_head = d_head
self.dropout = dropout
self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False)
self.drop = nn.Dropout(dropout)
self.dropatt = nn.Dropout(dropatt)
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
self.scale = 1 / (d_head**0.5)
self.pre_lnorm = pre_lnorm
if r_r_bias is None or r_w_bias is None:
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
else:
self.r_r_bias = r_r_bias
self.r_w_bias = r_w_bias
self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
def _rel_shift(self, x):
zero_pad_shape = (x.size(0), 1) + x.size()[2:]
zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype)
x_padded = torch.cat([zero_pad, x], dim=1)
x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:]
x_padded = x_padded.view(*x_padded_shape)
x = x_padded[1:].view_as(x)
return x
class RelPartialLearnableDecoderLayer(nn.Module):
    def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
super().__init__()
self.dec_attn = RelPartialLearnableMultiHeadAttn(
n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
)
self.pos_ff = PositionwiseFF(
d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon
)
def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False):
attn_outputs = self.dec_attn(
dec_inp,
r,
attn_mask=dec_attn_mask,
mems=mems,
head_mask=head_mask,
output_attentions=output_attentions,
)
ff_output = self.pos_ff(attn_outputs[0])
outputs = [ff_output] + attn_outputs[1:]
return outputs
class AdaptiveEmbedding(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
super().__init__()
self.n_token = n_token
self.d_embed = d_embed
self.cutoffs = cutoffs + [n_token]
self.div_val = div_val
self.d_proj = d_proj
self.emb_scale = d_proj**0.5
self.cutoff_ends = [0] + self.cutoffs
self.emb_layers = nn.ModuleList()
self.emb_projs = nn.ParameterList()
if div_val == 1:
self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0))
if d_proj != d_embed:
self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
d_emb_i = d_embed // (div_val**i)
self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i))
self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
def forward(self, inp):
if self.div_val == 1:
embed = self.emb_layers[0](inp)
if self.d_proj != self.d_embed:
embed = nn.functional.linear(embed, self.emb_projs[0])
else:
param = next(self.parameters())
inp_flat = inp.view(-1)
emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device)
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
indices_i = mask_i.nonzero().squeeze()
if indices_i.numel() == 0:
continue
inp_i = inp_flat.index_select(0, indices_i) - l_idx
emb_i = self.emb_layers[i](inp_i)
emb_i = nn.functional.linear(emb_i, self.emb_projs[i])
emb_flat.index_copy_(0, indices_i, emb_i)
embed_shape = inp.size() + (self.d_proj,)
embed = emb_flat.view(embed_shape)
embed.mul_(self.emb_scale)
return embed
class TransfoXLPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = TransfoXLConfig
load_tf_weights = load_tf_weights_in_transfo_xl
base_model_prefix = "transformer"
def _init_weight(self, weight):
if self.config.init == "uniform":
nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
elif self.config.init == "normal":
nn.init.normal_(weight, 0.0, self.config.init_std)
def _init_bias(self, bias):
nn.init.constant_(bias, 0.0)
def _init_weights(self, m):
classname = m.__class__.__name__
if classname.find("Linear") != -1:
if hasattr(m, "weight") and m.weight is not None:
self._init_weight(m.weight)
if hasattr(m, "bias") and m.bias is not None:
self._init_bias(m.bias)
elif classname.find("AdaptiveEmbedding") != -1:
if hasattr(m, "emb_projs"):
for i in range(len(m.emb_projs)):
if m.emb_projs[i] is not None:
nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)
elif classname.find("Embedding") != -1:
if hasattr(m, "weight"):
self._init_weight(m.weight)
elif classname.find("ProjectedAdaptiveLogSoftmax") != -1:
if hasattr(m, "cluster_weight") and m.cluster_weight is not None:
self._init_weight(m.cluster_weight)
if hasattr(m, "cluster_bias") and m.cluster_bias is not None:
self._init_bias(m.cluster_bias)
if hasattr(m, "out_projs"):
for i in range(len(m.out_projs)):
if m.out_projs[i] is not None:
nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std)
elif classname.find("LayerNorm") != -1:
if hasattr(m, "weight"):
nn.init.normal_(m.weight, 1.0, self.config.init_std)
if hasattr(m, "bias") and m.bias is not None:
self._init_bias(m.bias)
else:
if hasattr(m, "r_emb"):
self._init_weight(m.r_emb)
if hasattr(m, "r_w_bias"):
self._init_weight(m.r_w_bias)
if hasattr(m, "r_r_bias"):
self._init_weight(m.r_r_bias)
if hasattr(m, "r_bias"):
self._init_bias(m.r_bias)
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, layer: Optional[int] = -1):
"""
Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Take care of tying
weights embeddings afterwards if the model class has a *tie_weights()* method.
Arguments:
new_num_tokens: (*optional*) int:
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at
the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and
just returns a pointer to the input tokens `torch.nn.Embeddings` Module of the model.
layer: (*optional*) int:
Layer of the *AdaptiveEmbedding* where the resizing should be done. Per default the last layer will be
resized. Be aware that when resizing other than the last layer, you have to ensure that the new
token(s) in the tokenizer are at the corresponding position.
Return: `torch.nn.Embeddings` Pointer to the input tokens Embeddings Module of the model
"""
base_model = getattr(self, self.base_model_prefix, self)
if new_num_tokens is None:
return self.get_input_embeddings()
new_num_tokens_layer, layer = self._get_new_num_tokens_layer(new_num_tokens, layer)
assert new_num_tokens_layer > 0, "The size of the new embedding layer cannot be 0 or less"
model_embeds = base_model._resize_token_embeddings(new_num_tokens_layer, layer)
self.config.vocab_size = new_num_tokens
base_model.vocab_size = new_num_tokens
base_model.n_token = new_num_tokens
new_embedding_shapes = self._get_embedding_shapes()
self._resize_cutoffs(new_num_tokens, new_num_tokens_layer, new_embedding_shapes, layer)
self.tie_weights()
return model_embeds
def _get_new_num_tokens_layer(self, new_num_tokens, layer):
embeddings = self.get_input_embeddings()
if layer == -1:
layer = len(embeddings.emb_layers) - 1
assert 0 <= layer <= len(embeddings.emb_layers) - 1
new_num_tokens_layer = (
new_num_tokens
- sum([emb.weight.shape[0] for emb in embeddings.emb_layers[:layer]])
- sum([emb.weight.shape[0] for emb in embeddings.emb_layers[layer + 1:]])
)
return new_num_tokens_layer, layer
def _get_embedding_shapes(self):
embeddings = self.get_input_embeddings()
return [emb.weight.shape[0] for emb in embeddings.emb_layers]
def _resize_token_embeddings(self, new_num_tokens, layer=-1):
embeddings = self.get_input_embeddings()
if new_num_tokens is None:
return embeddings
new_embeddings_layer = self._get_resized_embeddings(embeddings.emb_layers[layer], new_num_tokens)
embeddings.emb_layers[layer] = new_embeddings_layer
self.set_input_embeddings(embeddings)
return self.get_input_embeddings()
def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer):
embeddings = self.get_input_embeddings()
for i in range(layer, len(embeddings.cutoffs)):
embeddings.cutoffs[i] = sum(new_embedding_shapes[: i + 1])
embeddings.cutoff_ends = [0] + embeddings.cutoffs
embeddings.n_token = new_num_tokens
self.config.cutoffs = embeddings.cutoffs[:-1]
return embeddings.cutoffs
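# A worked example (toy numbers, not library code) of the arithmetic in `_get_new_num_tokens_layer` and
# `_resize_cutoffs` above: growing the vocabulary by resizing the last adaptive-embedding layer only changes
# that layer's row count, and the cutoffs are recomputed from the per-layer shapes:
emb_layer_sizes = [4, 4, 4]          # rows per adaptive embedding layer (cutoffs [4, 8], n_token 12)
new_num_tokens, layer = 14, -1       # grow the total vocabulary to 14 by resizing the last layer

layer = len(emb_layer_sizes) - 1 if layer == -1 else layer
new_num_tokens_layer = new_num_tokens - sum(emb_layer_sizes[:layer]) - sum(emb_layer_sizes[layer + 1:])
emb_layer_sizes[layer] = new_num_tokens_layer
cutoffs = [sum(emb_layer_sizes[: i + 1]) for i in range(len(emb_layer_sizes))]
print(new_num_tokens_layer, cutoffs)   # 6 [4, 8, 14] -> config.cutoffs becomes [4, 8]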
@dataclass
class TransfoXLModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor
mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class TransfoXLSequenceClassifierOutputWithPast(ModelOutput):
"""
Base class for outputs of sentence classification models.
"""
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
分类(或回归,如果config.num_labels==1)的损失值。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类(或回归,如果config.num_labels==1)的分数(SoftMax之前)。
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
包含预先计算的隐藏状态(注意力模块中的键和值)。可以用来加速顺序解码。
给定给模型的过去记忆的令牌ID不应作为输入ID传递,因为它们已经被计算过。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
`torch.FloatTensor`的元组(一个用于嵌入层的输出 + 每层的输出),形状为`(batch_size, sequence_length, hidden_size)`。
模型每层的隐藏状态加上初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
`torch.FloatTensor`的元组(每层一个)形状为`(batch_size, num_heads, sequence_length, sequence_length)`。
注意力softmax后的注意力权重,用于计算自注意力头中的加权平均值。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class TransfoXLLMHeadModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
losses (`torch.FloatTensor` of shape *(batch_size, sequence_length-1)*, *optional*, returned when `labels` is provided):
Language modeling losses (not reduced).
prediction_scores (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
input) to speed up sequential decoding. The token ids which have their past given to this model should not
be passed as input ids as they have already been computed.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
loss (`torch.FloatTensor` of shape `()`, *optional*, returned when `labels` is provided):
Reduced language modeling loss.
"""
losses: Optional[torch.FloatTensor] = None
prediction_scores: torch.FloatTensor = None
mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
loss: Optional[torch.FloatTensor] = None
@property
def logits(self):
return self.prediction_scores
TRANSFO_XL_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
"""
"""
TRANSFO_XL_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
mems (`List[torch.FloatTensor]` of length `config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
`mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
given to this model should not be passed as `input_ids` as they have already been computed.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLModel(TransfoXLPreTrainedModel):
"""
TransfoXLModel class inherits from TransfoXLPreTrainedModel and represents the main model for TransfoXL.
This class provides the core Transformer-XL model for processing sequences, without any task-specific head.
Args:
config (:class:`~transformers.TransfoXLConfig`):
The model configuration class that defines the model architecture and its parameters.
Inherits from:
:class:`~transformers.TransfoXLPreTrainedModel`
"""
# Initialization: set up all parameters and layers of the Transformer-XL model
def __init__(self, config):
# Call the parent class initializer
super().__init__(config)
# Vocabulary size
self.n_token = config.vocab_size
# Embedding dimension and model (hidden) dimension
self.d_embed = config.d_embed
self.d_model = config.d_model
# Number of attention heads and dimension of each head
self.n_head = config.n_head
self.d_head = config.d_head
# Adaptive word embedding built from the vocab size, embedding dim, model dim, cutoffs and div_val
self.word_emb = AdaptiveEmbedding(
config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
)
# Dropout layer with the configured dropout rate
self.drop = nn.Dropout(config.dropout)
# Number of layers and memory length
self.n_layer = config.n_layer
self.mem_len = config.mem_len
# Attention type
self.attn_type = config.attn_type
# If the relative-attention biases are shared across layers (untie_r is False),
# create r_w_bias and r_r_bias as learnable parameters here
if not config.untie_r:
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
# Build the stack of Transformer-XL decoder layers
self.layers = nn.ModuleList()
if config.attn_type == 0:  # the default attention type
# Create one RelPartialLearnableDecoderLayer per layer and append it to self.layers
for i in range(config.n_layer):
self.layers.append(
RelPartialLearnableDecoderLayer(
config.n_head,
config.d_model,
config.d_head,
config.d_inner,
config.dropout,
dropatt=config.dropatt,
pre_lnorm=config.pre_lnorm,
r_w_bias=None if config.untie_r else self.r_w_bias,
r_r_bias=None if config.untie_r else self.r_r_bias,
layer_norm_epsilon=config.layer_norm_epsilon,
)
)
else:
# Any other attention type is not implemented
raise NotImplementedError  # removed to avoid maintaining dead code
# same_length and clamp_len settings
self.same_length = config.same_length
self.clamp_len = config.clamp_len
# Positional embeddings depend on the attention type
if self.attn_type == 0:  # default attention type
self.pos_emb = PositionalEmbedding(self.d_model)
else:
# Any other attention type is not implemented
raise NotImplementedError  # removed to avoid maintaining dead code
# Initialize weights and apply final processing
self.post_init()
# Return the word embedding layer
def get_input_embeddings(self):
return self.word_emb
# Set a new input word embedding layer
def set_input_embeddings(self, new_embeddings):
self.word_emb = new_embeddings
# Backward-compatibility helper
def backward_compatible(self):
self.sample_softmax = -1
# Reset the memory length
def reset_memory_length(self, mem_len):
self.mem_len = mem_len
# Prune attention heads (not supported for this model)
def _prune_heads(self, heads):
logger.info("Head pruning is not implemented for Transformer-XL model")
pass  # intentionally left empty
def init_mems(self, bsz):
# Only allocate memories if a positive memory length is configured
if self.mem_len > 0:
# List that will hold one memory tensor per layer
mems = []
# Grab any model parameter to infer the dtype and device
param = next(self.parameters())
# For each layer, create an all-zero memory tensor and append it to mems
for i in range(self.n_layer):
empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device)
mems.append(empty)
# Return the list of memory tensors
return mems
else:
# No memory is kept when mem_len is not positive
return None
def _update_mems(self, hids, mems, mlen, qlen):
# Nothing to update when no memories are kept
if mems is None:
return None
# hids and mems must have one entry per layer
assert len(hids) == len(mems), "len(hids) != len(mems)"
# There are mlen + max(0, qlen) steps that can be cached into mems
with torch.no_grad():
# Build the new list of memories
new_mems = []
# Compute the start and end indices of the window to keep
end_idx = mlen + max(0, qlen)
beg_idx = max(0, end_idx - self.mem_len)
# Process the hidden states of each layer
for i in range(len(hids)):
# Concatenate the old memory with the current hidden states, keep only the last
# mem_len positions and detach them from the computation graph
cat = torch.cat([mems[i], hids[i]], dim=0)
new_mems.append(cat[beg_idx:end_idx].detach())
# Return the updated list of memories
return new_mems
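To make the indexing in `_update_mems` concrete, here is a small standalone illustration of the sliding window it keeps; plain tensors only, no model involved, and all sizes are made up.
```
import torch

# With mem_len = 4, a cached memory of length mlen = 4 and a new segment of length qlen = 3,
# the concatenated sequence has length 7 and the kept window is [end_idx - mem_len, end_idx).
mem_len, mlen, qlen, d_model = 4, 4, 3, 2
mems = torch.zeros(mlen, 1, d_model)  # old cached hidden states
hids = torch.ones(qlen, 1, d_model)   # hidden states of the current segment

end_idx = mlen + max(0, qlen)         # 7
beg_idx = max(0, end_idx - mem_len)   # 3
new_mem = torch.cat([mems, hids], dim=0)[beg_idx:end_idx].detach()

print(new_mem.shape)       # torch.Size([4, 1, 2]) -> always the last mem_len positions
print(new_mem[:, 0, 0])    # tensor([0., 1., 1., 1.]) -> 1 old step followed by the 3 new steps
```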
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TransfoXLModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
mems: Optional[List[torch.FloatTensor]] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
input embeddings)
"""
@add_start_docstrings(
"""
The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
input embeddings)
""",
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
_tied_weights_keys = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"]
def __init__(self, config):
super().__init__(config)
self.transformer = TransfoXLModel(config)
self.sample_softmax = config.sample_softmax
self.trainer_compatible = getattr(config, "trainer_compatible", False)
if not self.trainer_compatible:
warnings.warn(
"The output of TransfoXL will be updated in v5 to support a single loss as first argument. In order "
"to use that updated output, please specify `trainer_compatible=True` as your configuration"
" attribute.",
DeprecationWarning,
)
assert self.sample_softmax <= 0, (
"Sampling from the softmax is not implemented yet. Please look at issue: #3310:"
" https://github.com/huggingface/transformers/issues/3310"
)
self.crit = ProjectedAdaptiveLogSoftmax(
config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
)
# Initialize weights and apply final processing
self.post_init()
def tie_weights(self):
"""
Run this to be sure output and input (adaptive) softmax weights are tied
"""
if self.config.tie_word_embeddings:
for i in range(len(self.crit.out_layers)):
# Tie or clone weights between output and input (adaptive) softmax layers
self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i])
if self.config.tie_projs:
for i, tie_proj in enumerate(self.config.tie_projs):
if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
if self.config.torchscript:
# Clone weights if using torchscript
self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
else:
# Assign weights directly
self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
elif tie_proj and self.config.div_val != 1:
if self.config.torchscript:
# Clone weights if using torchscript
self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
else:
# Assign weights directly
self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
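A quick way to see the effect of `tie_weights` is to check that the adaptive softmax output layers share their parameters with the adaptive input embeddings. A sketch with a tiny random configuration (all sizes here are arbitrary and chosen only for speed):
```
from transformers import TransfoXLConfig, TransfoXLLMHeadModel

config = TransfoXLConfig(
    vocab_size=100, cutoffs=[20, 50], d_model=32, d_embed=32,
    n_head=2, d_head=16, d_inner=64, n_layer=2, mem_len=8,
)
model = TransfoXLLMHeadModel(config)  # post_init() calls tie_weights()

# The output layer of the first cluster points to the very same weight tensor as the embedding layer
print(model.crit.out_layers[0].weight is model.transformer.word_emb.emb_layers[0].weight)  # True
```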
def reset_memory_length(self, mem_len):
# Reset the memory length for the transformer model
self.transformer.reset_memory_length(mem_len)
def init_mems(self, bsz):
# Initialize memories for the transformer model with batch size bsz
return self.transformer.init_mems(bsz)
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TransfoXLLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# Forward pass of the model, accepting the following arguments:
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,  # input token id sequence
mems: Optional[List[torch.FloatTensor]] = None,  # optional list of cached memories, one float tensor per layer
head_mask: Optional[torch.FloatTensor] = None,  # optional attention head mask
inputs_embeds: Optional[torch.FloatTensor] = None,  # optional pre-computed input embeddings
labels: Optional[torch.LongTensor] = None,  # optional language-modeling labels
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a plain tuple
) -> Union[Tuple, TransfoXLLMHeadModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
"""
# Determine whether to use the return dictionary from the function argument or the model's configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1)
elif inputs_embeds is not None:
bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1)
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
transformer_outputs = self.transformer(
input_ids,
mems=mems,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:]
if labels is not None:
miss_valid_label = labels[0, 1:].sum() == (labels.size(1) - 1) * -100
if miss_valid_label:
labels[0, 1] = self.config.eos_token_id
softmax_output = self.crit(pred_hid, labels)
prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else ()
if labels is not None:
losses = softmax_output.view(bsz, tgt_len - 1)
loss = losses[losses != 0].mean()
else:
losses, loss = None, None
if not return_dict:
if self.trainer_compatible:
output = (prediction_scores, losses) if losses is not None else (prediction_scores,)
output += transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
else:
output = (prediction_scores, *transformer_outputs[1:])
output = ((losses,) + output) if losses is not None else output
return (output + (loss,)) if loss is not None else output
return TransfoXLLMHeadModelOutput(
loss=loss,
prediction_scores=prediction_scores,
losses=losses,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
def get_output_embeddings(self):
"""Double-check if you are using adaptive softmax."""
if self.sample_softmax > 0:
return self.out_layer
else:
return self.crit.out_layers[-1]
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs):
inputs = {}
if past_key_values:
inputs["mems"] = past_key_values
inputs["input_ids"] = input_ids[:, -1].unsqueeze(-1)
else:
inputs["input_ids"] = input_ids
return inputs
def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer):
new_cutoffs = super()._resize_cutoffs(new_num_tokens, new_emb_size, new_embedding_shapes, layer)
self.crit.cutoffs = new_cutoffs
self.crit.cutoff_ends = [0] + new_cutoffs
self.crit.n_token = new_num_tokens
@staticmethod
def _reorder_cache(mems: List[torch.Tensor], beam_idx: torch.Tensor) -> List[torch.Tensor]:
"""
This function is used to re-order the `mems` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `mems` with the correct beam_idx at every
generation step.
"""
return [layer_past.index_select(1, beam_idx.to(layer_past.device)) for layer_past in mems]
@add_start_docstrings(
"""
The Transformer-XL Model transformer with a sequence classification head on top (linear layer).
[`TransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal
models (e.g. GPT-1) do.
Since it does classification on the last token, it requires to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
TRANSFO_XL_START_DOCSTRING,
)
class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
"""
Transformer-XL模型的序列分类器,顶部带有线性层。
[`TransfoXLForSequenceClassification`] 使用最后一个token进行分类,类似于其他因果模型(例如GPT-1)。
由于它在最后一个token上进行分类,因此需要知道最后一个token的位置。如果配置中定义了`pad_token_id`,则在每一行中找到不是填充token的最后一个token。如果未定义`pad_token_id`,则简单地取批次中每一行的最后一个值。当传递`inputs_embeds`而不是`input_ids`时,无法猜测填充token,因此执行相同操作(取批次中每一行的最后一个值)。
"""
def __init__(self, config):
"""
初始化方法,设置模型配置。
Args:
config (:class:`~transformers.TransfoXLConfig`):
模型的配置对象,包含了模型的各种参数和超参数。
"""
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = TransfoXLModel(config)
self.score = nn.Linear(config.d_embed, self.num_labels, bias=False)
self.post_init()
@add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TransfoXLSequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
mems: Optional[List[torch.FloatTensor]] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
前向传播方法。
Args:
input_ids (:obj:`torch.LongTensor`, 可选):
输入token的ID张量。
mems (:obj:`List[torch.FloatTensor]`, 可选):
Transformer-XL的记忆(memory)部分,用于长序列训练。
head_mask (:obj:`torch.FloatTensor`, 可选):
头部的掩码张量,用于控制层的注意力权重。
inputs_embeds (:obj:`torch.FloatTensor`, 可选):
输入的嵌入张量,替代input_ids。
labels (:obj:`torch.LongTensor`, 可选):
分类任务的标签张量。
output_attentions (:obj:`bool`, 可选):
是否输出注意力权重。
output_hidden_states (:obj:`bool`, 可选):
是否输出隐藏状态。
return_dict (:obj:`bool`, 可选):
是否返回字典格式的输出。
Returns:
:class:`~transformers.modeling_transfo_xl.TransfoXLSequenceClassifierOutputWithPast`:
输出对象,包含分类器的结果和可能的附加信息。
"""
pass
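For completeness, a small usage sketch of the sequence classifier with a tiny random configuration. All sizes are hypothetical, and `pad_token_id` is set explicitly so batches larger than one can be handled, as described in the class docstring above.
```
import torch
from transformers import TransfoXLConfig, TransfoXLForSequenceClassification

config = TransfoXLConfig(
    vocab_size=100, cutoffs=[20, 50], d_model=32, d_embed=32,
    n_head=2, d_head=16, d_inner=64, n_layer=2, mem_len=8,
    num_labels=3, pad_token_id=0,
)
model = TransfoXLForSequenceClassification(config).eval()

input_ids = torch.randint(1, 100, (2, 10))  # avoid the padding id inside the sequences
with torch.no_grad():
    outputs = model(input_ids)

print(outputs.logits.shape)  # torch.Size([2, 3]) -> scores taken at the last token of each sequence
```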
.\models\deprecated\transfo_xl\modeling_transfo_xl_utilities.py
import torch
from torch import nn
class ProjectedAdaptiveLogSoftmax(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
super().__init__()
self.n_token = n_token
self.d_embed = d_embed
self.d_proj = d_proj
self.cutoffs = cutoffs + [n_token]
self.cutoff_ends = [0] + self.cutoffs
self.div_val = div_val
self.shortlist_size = self.cutoffs[0]
self.n_clusters = len(self.cutoffs) - 1
self.head_size = self.shortlist_size + self.n_clusters
if self.n_clusters > 0:
self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
self.out_layers = nn.ModuleList()
self.out_projs = nn.ParameterList()
if div_val == 1:
for i in range(len(self.cutoffs)):
if d_proj != d_embed:
self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
else:
self.out_projs.append(None)
self.out_layers.append(nn.Linear(d_embed, n_token))
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
d_emb_i = d_embed // (div_val**i)
self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx))
self.keep_order = keep_order
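The constructor above splits the vocabulary into a head ("shortlist") plus tail clusters, with progressively narrower embeddings for the rarer clusters when `div_val > 1`. A small sketch with made-up sizes, just to inspect the resulting layer shapes:
```
from transformers.models.deprecated.transfo_xl.modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax

# Hypothetical sizes: a 100-token vocabulary split into a 20-token head and two tail
# clusters of 30 and 50 tokens, with the embedding width halved for each tail cluster.
crit = ProjectedAdaptiveLogSoftmax(n_token=100, d_embed=32, d_proj=32, cutoffs=[20, 50], div_val=2)

print(crit.shortlist_size, crit.n_clusters, crit.head_size)  # 20 2 22
for layer, proj in zip(crit.out_layers, crit.out_projs):
    print(tuple(layer.weight.shape), tuple(proj.shape))
# (20, 32) (32, 32)  <- head tokens, full width
# (30, 16) (32, 16)  <- first tail cluster, d_embed // 2
# (50, 8) (32, 8)    <- second tail cluster, d_embed // 4
```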
def _compute_logit(self, hidden, weight, bias, proj):
if proj is None:
logit = nn.functional.linear(hidden, weight, bias=bias)
else:
proj_hid = nn.functional.linear(hidden, proj.t().contiguous())
logit = nn.functional.linear(proj_hid, weight, bias=bias)
return logit
def log_prob(self, hidden):
r"""
Computes log probabilities for all \\(n\_classes\\). From:
https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
Args:
hidden (Tensor): a minibatch of examples
Returns:
log-probabilities for each class \\(c\\) in range \\(0 <= c <= n\_classes\\), where \\(n\_classes\\) is
a parameter passed to the `AdaptiveLogSoftmaxWithLoss` constructor. Shape:
- Input: \\((N, in\_features)\\)
- Output: \\((N, n\_classes)\\)
"""
if self.n_clusters == 0:
logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
return nn.functional.log_softmax(logit, dim=-1)
else:
weights, biases = [], []
for i in range(len(self.cutoffs)):
if self.div_val == 1:
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
weight_i = self.out_layers[0].weight[l_idx:r_idx]
bias_i = self.out_layers[0].bias[l_idx:r_idx]
else:
weight_i = self.out_layers[i].weight
bias_i = self.out_layers[i].bias
if i == 0:
weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
weights.append(weight_i)
biases.append(bias_i)
head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
out = hidden.new_empty((head_logit.size(0), self.n_token))
head_logprob = nn.functional.log_softmax(head_logit, dim=1)
cutoff_values = [0] + self.cutoffs
for i in range(len(cutoff_values) - 1):
start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
if i == 0:
out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
else:
weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1)
logprob_i = head_logprob[:, -i] + tail_logprob_i
out[:, start_idx: stop_idx] = logprob_i
return out
.\models\deprecated\transfo_xl\tokenization_transfo_xl.py
def detokenize_numbers(text):
# Inverse of `tokenize_numbers`: merge numbers that were split around commas and dots back together
for reg, sub in DETOKENIZE_NUMBERS:
text = re.sub(reg, sub, text)
return text
class TransfoXLTokenizer(PreTrainedTokenizer):
"""
Construct a Transformer-XL tokenizer adapted from Vocab class in [the original
code](https://github.com/kimiyoung/transformer-xl). The Transformer-XL tokenizer is a word-level tokenizer (no
sub-word tokenization).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
special (`List[str]`, *optional*):
A list of special tokens (to be treated by the original implementation of this tokenizer).
min_freq (`int`, *optional*, defaults to 0):
The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it
will be mapped to `unk_token`).
max_size (`int`, *optional*):
The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found
after excluding the tokens according to the `min_freq` rule.
lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
delimiter (`str`, *optional*):
The delimiter used between tokens.
vocab_file (`str`, *optional*):
File containing the vocabulary (from the original implementation).
pretrained_vocab_file (`str`, *optional*):
File containing the vocabulary as saved with the `save_pretrained()` method.
never_split (`List[str]`, *optional*):
List of tokens that should never be split. If no list is specified, will simply use the existing special
tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
eos_token (`str`, *optional*, defaults to `"<eos>"`):
The end of sequence token.
additional_special_tokens (`List[str]`, *optional*, defaults to `['<formula>']`):
A list of additional special tokens (for the HuggingFace functionality).
language (`str`, *optional*, defaults to `"en"`):
The language of this tokenizer (used for Moses preprocessing).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids"]
def __init__(
self,
special=None,
min_freq=0,
max_size=None,
lower_case=False,
delimiter=None,
vocab_file=None,
pretrained_vocab_file: str = None,
never_split=None,
unk_token="<unk>",
eos_token="<eos>",
additional_special_tokens=["<formula>"],
language="en",
**kwargs,
):
super().__init__(
unk_token=unk_token,
eos_token=eos_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
@property
def do_lower_case(self):
return self.lower_case
def _compile_space_around_punctuation_pattern(self):
look_ahead_for_special_token = f"(?=[{self.punctuation_symbols}])"
look_ahead_to_match_all_except_space = r"(?=[^\s])"
return re.compile(r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space)
def count_file(self, path, verbose=False, add_eos=False):
if verbose:
logger.info(f"counting file {path} ...")
assert os.path.exists(path), f"Input file {path} not found"
sents = []
with open(path, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(f" line {idx}")
symbols = self.tokenize(line, add_eos=add_eos)
self.counter.update(symbols)
sents.append(symbols)
return sents
def count_sents(self, sents, verbose=False):
"""
sents : a list of sentences, each a list of tokenized symbols
"""
if verbose:
logger.info(f"counting {len(sents)} sents ...")
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(f" line {idx}")
self.counter.update(symbols)
def _build_from_file(self, vocab_file):
self.idx2sym = []
self.sym2idx = OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as f:
for line in f:
symb = line.strip().split()[0]
self.add_symbol(symb)
if "<UNK>" in self.sym2idx:
self.unk_idx = self.sym2idx["<UNK>"]
elif "<unk>" in self.sym2idx:
self.unk_idx = self.sym2idx["<unk>"]
else:
raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement.")
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["pretrained_vocab_file"],
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "wb") as f:
pickle.dump(self.__dict__, f)
return (vocab_file,)
def build_vocab(self):
if self.vocab_file:
logger.info(f"building vocab from {self.vocab_file}")
self._build_from_file(self.vocab_file)
logger.info(f"Final vocab size {len(self.sym2idx)}")
else:
logger.info(f"building vocab with min_freq={self.min_freq}, max_size={self.max_size}")
self.idx2sym = []
self.sym2idx = OrderedDict()
for sym in self.special:
self.add_special(sym)
for sym, cnt in self.counter.most_common(self.max_size):
if cnt < self.min_freq:
break
self.add_symbol(sym)
logger.info(f"Final vocab size {len(self.sym2idx)} from {len(self.counter)} unique tokens")
@torch_only_method
def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
if verbose:
logger.info(f"encoding file {path} ...")
assert os.path.exists(path), f"Output file {path} not found"
encoded = []
with open(path, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(f" line {idx}")
symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
encoded.append(self.convert_to_tensor(symbols))
if ordered:
encoded = torch.cat(encoded)
return encoded
@torch_only_method
def encode_sents(self, sents, ordered=False, verbose=False):
if verbose:
logger.info(f"encoding {len(sents)} sents ...")
encoded = []
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(f" line {idx}")
encoded.append(self.convert_to_tensor(symbols))
if ordered:
encoded = torch.cat(encoded)
return encoded
def add_special(self, sym):
if sym not in self.sym2idx:
self.idx2sym.append(sym)
self.sym2idx[sym] = len(self.idx2sym) - 1
setattr(self, f"{sym.strip('<>')}_idx", self.sym2idx[sym])
def add_symbol(self, sym):
if sym not in self.sym2idx:
self.idx2sym.append(sym)
self.sym2idx[sym] = len(self.idx2sym) - 1
def move_added_token(self, token: str, target_idx: int):
"""
Moves an added token to a specific position in the vocab. This method should be used when resizing an embedding
layer other than the last one in the `AdaptiveEmbedding` in order to move the token in the tokenizer from the
default position (at the very end) to the desired one.
Args:
token: The token to move to a specific position in the vocab.
target_idx: The position where the token should be moved to.
"""
assert token in self.added_tokens_encoder, "Token which should be moved has to be an added token"
assert token not in self.idx2sym, "Token which should be moved is already in vocab"
self.idx2sym.insert(target_idx, token)
self.sym2idx[token] = target_idx
for idx in range(target_idx + 1, len(self.idx2sym)):
current_sym = self.idx2sym[idx]
self.sym2idx[current_sym] = idx
old_index = self._added_tokens_encoder.pop(token)
self._added_tokens_decoder.pop(old_index)
def moses_punct_norm(self, text):
"""
Normalize punctuation in the text using MosesPunctNormalizer.
Args:
text: Input text to be normalized.
Returns:
Normalized text with standardized punctuation.
"""
return self.moses_punct_normalizer.normalize(text)
def moses_tokenize(self, text):
"""
Tokenizes text using MosesTokenizer.
Args:
text: Input text to be tokenized.
Returns:
List of tokens extracted from the input text.
"""
return self.moses_tokenizer.tokenize(
text, aggressive_dash_splits=True, return_str=False, escape=False, protected_patterns=self.never_split
)
def moses_pipeline(self, text: str) -> List[str]:
"""
Performs a pipeline of basic text preprocessing tasks using MosesPunctNormalizer and MosesTokenizer. Also handles
splitting of large comma-separated numbers and floating point values.
Args:
text: Text to be tokenized and preprocessed.
Returns:
A list of tokenized strings.
"""
text = self.moses_punct_norm(text)
text = self.moses_tokenize(text)
text = tokenize_numbers(text)
return text
def _convert_id_to_token(self, idx):
"""
Converts an index to a token using the vocabulary.
Args:
idx: Index to be converted into a token.
Returns:
Corresponding token based on the index.
"""
assert 0 <= idx < len(self), f"Index {idx} out of vocabulary range"
return self.idx2sym[idx]
def _convert_token_to_id(self, sym):
"""Converts a token (str) into an id using the vocabulary."""
if sym in self.sym2idx:
return self.sym2idx[sym]
else:
if hasattr(self, "unk_idx"):
return self.sym2idx.get(sym, self.unk_idx)
elif "<unk>" in self.sym2idx:
return self.sym2idx["<unk>"]
elif "<UNK>" in self.sym2idx:
return self.sym2idx["<UNK>"]
else:
raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement.")
def convert_tokens_to_string(self, tokens):
"""
Converts a sequence of tokens (string) into a single string.
Additionally, converts split numbers back to their original form.
"""
out_string = self.moses_detokenizer.detokenize(tokens)
return detokenize_numbers(out_string).strip()
@torch_only_method
def convert_to_tensor(self, symbols):
"""Converts a list of symbols into a PyTorch tensor of Long type."""
return torch.LongTensor(self.convert_tokens_to_ids(symbols))
@property
def vocab_size(self):
"""Returns the size of the vocabulary."""
return len(self.idx2sym)
def get_vocab(self):
"""Returns a dictionary containing the entire vocabulary."""
vocab = self.sym2idx.copy()
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, line, add_eos=False, add_double_eos=False):
"""
Tokenizes a line of text with optional end-of-sequence tokens.
"""
line = line.strip()
if self.lower_case:
line = line.lower()
if self.delimiter == "":
symbols = line
else:
symbols = self.moses_pipeline(line)
if add_double_eos:
return ["<S>"] + symbols + ["<S>"]
elif add_eos:
return symbols + ["<eos>"]
else:
return symbols
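Putting the pieces above together, a hedged usage sketch. The hub id below is the commonly used WikiText-103 checkpoint for this tokenizer and is an assumption here, and because the saved vocabulary is loaded with `pickle`, recent library versions require the `TRUST_REMOTE_CODE` environment variable (the same guard shown in `get_lm_corpus` further down).
```
import os

os.environ["TRUST_REMOTE_CODE"] = "True"  # the pretrained vocabulary is unpickled, so the guard must be lifted

from transformers import TransfoXLTokenizer

# Hub id assumed: the WikiText-103 checkpoint usually paired with this tokenizer
tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")

tokens = tokenizer.tokenize("Hello , this costs $ 1,500 .")
print(tokens)                                      # word-level tokens; "1,500" is split into "1 @,@ 500"
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids[:5])                                     # integer ids from sym2idx (unknown words map to <unk>)
print(tokenizer.convert_tokens_to_string(tokens))  # detokenize_numbers merges the split number back together
```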
class LMOrderedIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
"""
data -- LongTensor -- the LongTensor is strictly ordered
"""
self.bsz = bsz
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.device = device
self.n_step = data.size(0) // bsz
data = data.narrow(0, 0, self.n_step * bsz)
self.data = data.view(bsz, -1).t().contiguous().to(device)
self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
def get_batch(self, i, bptt=None):
if bptt is None:
bptt = self.bptt
seq_len = min(bptt, self.data.size(0) - 1 - i)
end_idx = i + seq_len
beg_idx = max(0, i - self.ext_len)
data = self.data[beg_idx:end_idx]
target = self.data[i + 1 : i + 1 + seq_len]
data_out = data.transpose(0, 1).contiguous().to(self.device)
target_out = target.transpose(0, 1).contiguous().to(self.device)
return data_out, target_out, seq_len
def get_fixlen_iter(self, start=0):
for i in range(start, self.data.size(0) - 1, self.bptt):
yield self.get_batch(i)
def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
max_len = self.bptt + max_deviation * std
i = start
while True:
bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0
bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
data, target, seq_len = self.get_batch(i, bptt)
i += seq_len
yield data, target, seq_len
if i >= self.data.size(0) - 2:
break
def __iter__(self):
return self.get_fixlen_iter()
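A toy run of the `LMOrderedIterator` defined above: 50 consecutive token ids are folded into 4 parallel streams and served in `bptt`-sized chunks, with targets shifted by one position.
```
import torch

data = torch.arange(50)                          # a toy "corpus" of 50 token ids
it = LMOrderedIterator(data, bsz=4, bptt=6)

for inp, target, seq_len in it:
    print(inp.shape, target.shape, seq_len)      # (4, 6) batches; the last chunk may be shorter
    print((inp[:, 1:] == target[:, :-1]).all())  # targets are the inputs shifted left by one
```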
class LMShuffledIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
"""
data -- list[LongTensor] -- there is no order among the LongTensors
"""
self.data = data
self.bsz = bsz
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.device = device
self.shuffle = shuffle
def get_sent_stream(self):
epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data)))
for idx in epoch_indices:
yield self.data[idx]
@torch_only_method
def stream_iterator(self, sent_stream):
streams = [None] * self.bsz
data = torch.LongTensor(self.bptt, self.bsz)
target = torch.LongTensor(self.bptt, self.bsz)
n_retain = 0
while True:
data[n_retain:].fill_(-1)
target.fill_(-1)
valid_batch = True
for i in range(self.bsz):
n_filled = 0
try:
while n_filled < self.bptt:
if streams[i] is None or len(streams[i]) <= 1:
streams[i] = next(sent_stream)
n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new]
target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1]
streams[i] = streams[i][n_new:]
n_filled += n_new
except StopIteration:
valid_batch = False
break
if not valid_batch:
return
data_out = data.transpose(0, 1).contiguous().to(self.device)
target_out = target.transpose(0, 1).contiguous().to(self.device)
yield data_out, target_out, self.bptt
n_retain = min(data.size(0), self.ext_len)
if n_retain > 0:
data[:n_retain] = data[-n_retain:]
data.resize_(n_retain + self.bptt, data.size(1))
def __iter__(self):
sent_stream = self.get_sent_stream()
for batch in self.stream_iterator(sent_stream):
yield batch
class LMMultiFileIterator(LMShuffledIterator):
def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
self.paths = paths
self.vocab = vocab
self.bsz = bsz
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.device = device
self.shuffle = shuffle
def get_sent_stream(self, path):
sents = self.vocab.encode_file(path, add_double_eos=True)
if self.shuffle:
np.random.shuffle(sents)
sent_stream = iter(sents)
return sent_stream
def __iter__(self):
if self.shuffle:
np.random.shuffle(self.paths)
for path in self.paths:
sent_stream = self.get_sent_stream(path)
for batch in self.stream_iterator(sent_stream):
yield batch
class TransfoXLCorpus(object):
@classmethod
@torch_only_method
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a pre-processed corpus.
"""
vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
is_local = os.path.isdir(pretrained_model_name_or_path)
try:
resolved_corpus_file = cached_file(pretrained_model_name_or_path, CORPUS_NAME, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
f"Corpus '{pretrained_model_name_or_path}' was not found in corpus list"
f" ({', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys())}. We assumed '{pretrained_model_name_or_path}'"
f" was a path or url but couldn't find files {CORPUS_NAME} at this path or url."
)
return None
if is_local:
logger.info(f"loading corpus file {resolved_corpus_file}")
else:
logger.info(f"loading corpus file {CORPUS_NAME} from cache at {resolved_corpus_file}")
corpus = cls(*inputs, **kwargs)
corpus_dict = torch.load(resolved_corpus_file)
for key, value in corpus_dict.items():
corpus.__dict__[key] = value
corpus.vocab = vocab
if corpus.train is not None:
corpus.train = torch.tensor(corpus.train, dtype=torch.long)
if corpus.valid is not None:
corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)
if corpus.test is not None:
corpus.test = torch.tensor(corpus.test, dtype=torch.long)
return corpus
def __init__(self, *args, **kwargs):
self.vocab = TransfoXLTokenizer(*args, **kwargs)
self.dataset = None
self.train = None
self.valid = None
self.test = None
def build_corpus(self, path, dataset):
self.dataset = dataset
if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
self.vocab.count_file(os.path.join(path, "train.txt"))
self.vocab.count_file(os.path.join(path, "valid.txt"))
self.vocab.count_file(os.path.join(path, "test.txt"))
elif self.dataset == "wt103":
self.vocab.count_file(os.path.join(path, "train.txt"))
elif self.dataset == "lm1b":
train_path_pattern = os.path.join(
path,
"1-billion-word-language-modeling-benchmark-r13output",
"training-monolingual.tokenized.shuffled",
"news.en-*",
)
train_paths = glob.glob(train_path_pattern)
self.vocab.build_vocab()
if self.dataset in ["ptb", "wt2", "wt103"]:
self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
elif self.dataset in ["enwik8", "text8"]:
self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
elif self.dataset == "lm1b":
self.train = train_paths
self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True)
self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True)
def get_iterator(self, split, *args, **kwargs):
if split == "train":
if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
data_iter = LMOrderedIterator(self.train, *args, **kwargs)
elif self.dataset == "lm1b":
kwargs["shuffle"] = True
data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
elif split in ["valid", "test"]:
data = self.valid if split == "valid" else self.test
if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
data_iter = LMOrderedIterator(data, *args, **kwargs)
elif self.dataset == "lm1b":
data_iter = LMShuffledIterator(data, *args, **kwargs)
else:
data_iter = None
raise ValueError(f"Split not recognized: {split}")
return data_iter
@torch_only_method
def get_lm_corpus(datadir, dataset):
fn = os.path.join(datadir, "cache.pt")
fn_pickle = os.path.join(datadir, "cache.pkl")
if os.path.exists(fn):
logger.info("Loading cached dataset...")
corpus = torch.load(fn_pickle)
elif os.path.exists(fn):
logger.info("Loading cached dataset from pickle...")
if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
raise ValueError(
"This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
"malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
"that could have been tampered with. If you already verified the pickle data and decided to use it, "
"you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
)
with open(fn, "rb") as fp:
corpus = pickle.load(fp)
else:
logger.info(f"Producing dataset {dataset}...")
kwargs = {}
if dataset in ["wt103", "wt2"]:
kwargs["special"] = ["<eos>"]
kwargs["lower_case"] = False
elif dataset == "ptb":
kwargs["special"] = ["<eos>"]
kwargs["lower_case"] = True
elif dataset == "lm1b":
kwargs["special"] = []
kwargs["lower_case"] = False
kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt")
elif dataset in ["enwik8", "text8"]:
pass
corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
torch.save(corpus, fn)
return corpus
.\models\deprecated\transfo_xl\__init__.py
from typing import TYPE_CHECKING
from ....utils import OptionalDependencyNotAvailable, _LazyModule
from ....utils import is_tf_available, is_torch_available
_import_structure = {
"configuration_transfo_xl": ["TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP", "TransfoXLConfig"],
"tokenization_transfo_xl": ["TransfoXLCorpus", "TransfoXLTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_transfo_xl"] = [
"TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
"AdaptiveEmbedding",
"TransfoXLForSequenceClassification",
"TransfoXLLMHeadModel",
"TransfoXLModel",
"TransfoXLPreTrainedModel",
"load_tf_weights_in_transfo_xl",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_transfo_xl"] = [
"TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFAdaptiveEmbedding",
"TFTransfoXLForSequenceClassification",
"TFTransfoXLLMHeadModel",
"TFTransfoXLMainLayer",
"TFTransfoXLModel",
"TFTransfoXLPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_transfo_xl import (
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
AdaptiveEmbedding,
TransfoXLForSequenceClassification,
TransfoXLLMHeadModel,
TransfoXLModel,
TransfoXLPreTrainedModel,
load_tf_weights_in_transfo_xl,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_transfo_xl import (
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
TFAdaptiveEmbedding,
TFTransfoXLForSequenceClassification,
TFTransfoXLLMHeadModel,
TFTransfoXLMainLayer,
TFTransfoXLModel,
TFTransfoXLPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\van\configuration_van.py
""" VAN 模型配置"""
from ....configuration_utils import PretrainedConfig
from ....utils import logging
logger = logging.get_logger(__name__)
VAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"Visual-Attention-Network/van-base": (
"https://huggingface.co/Visual-Attention-Network/van-base/blob/main/config.json"
),
}
class VanConfig(PretrainedConfig):
r"""
这是配置类,用于存储 [`VanModel`] 的配置。根据指定的参数实例化 VAN 模型,
定义模型架构。使用默认值实例化配置将生成与 VAN
[Visual-Attention-Network/van-base](https://huggingface.co/Visual-Attention-Network/van-base)
架构类似的配置。
配置对象继承自 [`PretrainedConfig`],可用于控制模型输出。阅读来自 [`PretrainedConfig`] 的文档获取更多信息。
"""
Args:
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
patch_sizes (`List[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
Patch size used by each stage's embedding layer.
strides (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
Stride used by each stage's embedding layer to downsample the input.
hidden_sizes (`List[int]`, *optional*, defaults to `[64, 128, 320, 512]`):
Dimensionality (hidden size) of each stage.
depths (`List[int]`, *optional*, defaults to `[3, 3, 12, 3]`):
Number of layers in each stage.
mlp_ratios (`List[int]`, *optional*, defaults to `[8, 8, 4, 4]`):
Expansion ratio of the MLP layer in each stage.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in each layer. Supported strings are "gelu", "relu",
"selu" and "gelu_new".
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated normal initializer for all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
layer_scale_init_value (`float`, *optional*, defaults to 0.01):
The initial value for layer scaling.
drop_path_rate (`float`, *optional*, defaults to 0.0):
The drop probability for stochastic depth.
dropout_rate (`float`, *optional*, defaults to 0.0):
The dropout probability.
Example:
```
>>> from transformers import VanModel, VanConfig
>>>
>>> configuration = VanConfig()
>>>
>>> model = VanModel(configuration)
>>>
>>> configuration = model.config
```
"""
def __init__(
self,
image_size=224,
num_channels=3,
patch_sizes=[7, 3, 3, 3],
strides=[4, 2, 2, 2],
hidden_sizes=[64, 128, 320, 512],
depths=[3, 3, 12, 3],
mlp_ratios=[8, 8, 4, 4],
hidden_act="gelu",
initializer_range=0.02,
layer_norm_eps=1e-6,
layer_scale_init_value=1e-2,
drop_path_rate=0.0,
dropout_rate=0.0,
**kwargs,
):
super().__init__(**kwargs)
self.image_size = image_size
self.num_channels = num_channels
self.patch_sizes = patch_sizes
self.strides = strides
self.hidden_sizes = hidden_sizes
self.depths = depths
self.mlp_ratios = mlp_ratios
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.layer_scale_init_value = layer_scale_init_value
self.drop_path_rate = drop_path_rate
self.dropout_rate = dropout_rate
.\models\deprecated\van\convert_van_to_pytorch.py
"""Convert VAN checkpoints from the original repository.
将原始仓库的 VAN 检查点转换为特定格式。
URL: https://github.com/Visual-Attention-Network/VAN-Classification"""
import argparse
import json
import sys
from dataclasses import dataclass, field
from functools import partial
from pathlib import Path
from typing import List
import torch
import torch.nn as nn
from huggingface_hub import cached_download, hf_hub_download
from torch import Tensor
from transformers import AutoImageProcessor, VanConfig, VanForImageClassification
from transformers.models.deprecated.van.modeling_van import VanLayerScaling
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
@dataclass
class Tracker:
module: nn.Module
traced: List[nn.Module] = field(default_factory=list)
handles: list = field(default_factory=list)
def _forward_hook(self, m, inputs: Tensor, outputs: Tensor):
has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d)
if has_not_submodules:
if not isinstance(m, VanLayerScaling):
self.traced.append(m)
def __call__(self, x: Tensor):
for m in self.module.modules():
self.handles.append(m.register_forward_hook(self._forward_hook))
self.module(x)
[x.remove() for x in self.handles]
return self
@property
def parametrized(self):
return list(filter(lambda x: len(list(x.state_dict().keys())) > 0, self.traced))
@dataclass
class ModuleTransfer:
src: nn.Module
dest: nn.Module
verbose: int = 0
src_skip: List = field(default_factory=list)
dest_skip: List = field(default_factory=list)
def __call__(self, x: Tensor):
"""
Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the
hood we tracked all the operations in both modules.
"""
dest_traced = Tracker(self.dest)(x).parametrized
src_traced = Tracker(self.src)(x).parametrized
src_traced = list(filter(lambda x: type(x) not in self.src_skip, src_traced))
dest_traced = list(filter(lambda x: type(x) not in self.dest_skip, dest_traced))
if len(dest_traced) != len(src_traced):
raise Exception(
f"Numbers of operations are different. Source module has {len(src_traced)} operations while"
f" destination module has {len(dest_traced)}."
)
for dest_m, src_m in zip(dest_traced, src_traced):
dest_m.load_state_dict(src_m.state_dict())
if self.verbose == 1:
print(f"Transfered from={src_m} to={dest_m}")
def copy_parameters(from_model: nn.Module, our_model: nn.Module) -> nn.Module:
from_state_dict = from_model.state_dict()
our_state_dict = our_model.state_dict()
config = our_model.config
all_keys = []
for stage_idx in range(len(config.hidden_sizes)):
for block_id in range(config.depths[stage_idx]):
from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_1"
to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.attention_scaling.weight"
all_keys.append((from_key, to_key))
from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_2"
to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.mlp_scaling.weight"
all_keys.append((from_key, to_key))
for from_key, to_key in all_keys:
our_state_dict[to_key] = from_state_dict.pop(from_key)
our_model.load_state_dict(our_state_dict)
return our_model
def convert_weight_and_push(
name: str,
config: VanConfig,
checkpoint: str,
from_model: nn.Module,
save_directory: Path,
push_to_hub: bool = True,
):
print(f"Downloading weights for {name}...")
checkpoint_path = cached_download(checkpoint)
print(f"Converting {name}...")
from_state_dict = torch.load(checkpoint_path)["state_dict"]
from_model.load_state_dict(from_state_dict)
from_model.eval()
with torch.no_grad():
our_model = VanForImageClassification(config).eval()
module_transfer = ModuleTransfer(src=from_model, dest=our_model)
x = torch.randn((1, 3, 224, 224))
module_transfer(x)
our_model = copy_parameters(from_model, our_model)
if not torch.allclose(from_model(x), our_model(x).logits):
raise ValueError("The model logits don't match the original one.")
checkpoint_name = name
print(checkpoint_name)
if push_to_hub:
our_model.push_to_hub(
repo_path_or_name=save_directory / checkpoint_name,
commit_message="Add model",
use_temp_dir=True,
)
image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k")
image_processor.push_to_hub(
repo_path_or_name=save_directory / checkpoint_name,
commit_message="Add image processor",
use_temp_dir=True,
)
print(f"Pushed {checkpoint_name}")
def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
filename = "imagenet-1k-id2label.json"
num_labels = 1000
repo_id = "huggingface/label-files"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
id2label = id2label
label2id = {v: k for k, v in id2label.items()}
ImageNetPreTrainedConfig = partial(VanConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)
names_to_config = {
"van-tiny": ImageNetPreTrainedConfig(
hidden_sizes=[32, 64, 160, 256],
depths=[3, 3, 5, 2],
mlp_ratios=[8, 8, 4, 4],
),
"van-small": ImageNetPreTrainedConfig(
hidden_sizes=[64, 128, 320, 512],
depths=[2, 2, 4, 2],
mlp_ratios=[8, 8, 4, 4],
),
"van-base": ImageNetPreTrainedConfig(
hidden_sizes=[64, 128, 320, 512],
depths=[3, 3, 12, 3],
mlp_ratios=[8, 8, 4, 4],
),
"van-large": ImageNetPreTrainedConfig(
hidden_sizes=[64, 128, 320, 512],
depths=[3, 5, 27, 3],
mlp_ratios=[8, 8, 4, 4],
),
}
names_to_original_models = {
"van-tiny": van_tiny,
"van-small": van_small,
"van-base": van_base,
"van-large": van_large,
}
names_to_original_checkpoints = {
"van-tiny": (
"https://huggingface.co/Visual-Attention-Network/VAN-Tiny-original/resolve/main/van_tiny_754.pth.tar"
),
"van-small": (
"https://huggingface.co/Visual-Attention-Network/VAN-Small-original/resolve/main/van_small_811.pth.tar"
),
"van-base": (
"https://huggingface.co/Visual-Attention-Network/VAN-Base-original/resolve/main/van_base_828.pth.tar"
),
"van-large": (
"https://huggingface.co/Visual-Attention-Network/VAN-Large-original/resolve/main/van_large_839.pth.tar"
),
}
if model_name:
convert_weight_and_push(
model_name,
names_to_config[model_name],
checkpoint=names_to_original_checkpoints[model_name],
from_model=names_to_original_models[model_name](),
save_directory=save_directory,
push_to_hub=push_to_hub,
)
else:
for model_name, config in names_to_config.items():
convert_weight_and_push(
model_name,
config,
checkpoint=names_to_original_checkpoints[model_name],
from_model=names_to_original_models[model_name](),
save_directory=save_directory,
push_to_hub=push_to_hub,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model-name",
default=None,
type=str,
help=(
"The name of the model you wish to convert, it must be one of the supported resnet* architecture,"
" currently: van-tiny/small/base/large. If `None`, all of them will the converted."
),
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=Path,
required=True,
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--van_dir",
required=True,
type=Path,
help=(
"A path to VAN's original implementation directory. You can download from here:"
" https://github.com/Visual-Attention-Network/VAN-Classification"
),
)
parser.add_argument(
"--push_to_hub",
default=True,
type=bool,
required=False,
help="If True, push model and image processor to the hub.",
)
args = parser.parse_args()
pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path
pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True)
van_dir = args.van_dir
sys.path.append(str(van_dir.parent))
from van.models.van import van_base, van_large, van_small, van_tiny
convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub)
.\models\deprecated\van\modeling_van.py
"""
PyTorch Visual Attention Network (VAN) model.
"""
import math
from collections import OrderedDict
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ....activations import ACT2FN
from ....modeling_outputs import (
BaseModelOutputWithNoAttention,
BaseModelOutputWithPoolingAndNoAttention,
ImageClassifierOutputWithNoAttention,
)
from ....modeling_utils import PreTrainedModel
from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_van import VanConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "VanConfig"
_CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
_EXPECTED_OUTPUT_SHAPE = [1, 512, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "Visual-Attention-Network/van-base"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
VAN_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Visual-Attention-Network/van-base",
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
按样本(在残差块的主路径中应用)丢弃路径(随机深度)。
Comment by Ross Wightman: 这与我为 EfficientNet 等网络创建的 DropConnect 实现相同,
然而,原始名称具有误导性,因为“Drop Connect”是另一篇论文中的不同形式的 dropout…
参见讨论: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 …
我选择更改层和参数名称为 'drop path',而不是将 DropConnect 作为层名称,并使用 'survival rate' 作为参数。
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class VanDropPath(nn.Module):
"""每个样本使用丢弃路径(Stochastic Depth)(当应用于残差块的主路径时)。"""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class VanOverlappingPatchEmbedder(nn.Module):
"""
使用 patchify 操作对输入进行下采样,默认使用步幅为 4 的窗口使相邻窗口重叠一半区域。
来自 [PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797)。
"""
def __init__(self, in_channels: int, hidden_size: int, patch_size: int = 7, stride: int = 4):
super().__init__()
self.convolution = nn.Conv2d(
in_channels, hidden_size, kernel_size=patch_size, stride=stride, padding=patch_size // 2
)
self.normalization = nn.BatchNorm2d(hidden_size)
def forward(self, input: torch.Tensor) -> torch.Tensor:
hidden_state = self.convolution(input)
hidden_state = self.normalization(hidden_state)
return hidden_state
class VanMlpLayer(nn.Module):
"""
带有深度卷积的 MLP,来自 [PVTv2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797)。
"""
def __init__(
self,
in_channels: int,
hidden_size: int,
out_channels: int,
hidden_act: str = "gelu",
dropout_rate: float = 0.5,
):
super().__init__()
self.in_dense = nn.Conv2d(in_channels, hidden_size, kernel_size=1)
self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=3, padding=1, groups=hidden_size)
self.activation = ACT2FN[hidden_act]
self.dropout1 = nn.Dropout(dropout_rate)
self.out_dense = nn.Conv2d(hidden_size, out_channels, kernel_size=1)
self.dropout2 = nn.Dropout(dropout_rate)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.in_dense(hidden_state)
hidden_state = self.depth_wise(hidden_state)
hidden_state = self.activation(hidden_state)
hidden_state = self.dropout1(hidden_state)
hidden_state = self.out_dense(hidden_state)
hidden_state = self.dropout2(hidden_state)
return hidden_state
class VanLargeKernelAttention(nn.Module):
"""
基础的大核注意力(LKA)。
"""
def __init__(self, hidden_size: int):
super().__init__()
self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=5, padding=2, groups=hidden_size)
self.depth_wise_dilated = nn.Conv2d(
hidden_size, hidden_size, kernel_size=7, dilation=3, padding=9, groups=hidden_size
)
self.point_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=1)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.depth_wise(hidden_state)
hidden_state = self.depth_wise_dilated(hidden_state)
hidden_state = self.point_wise(hidden_state)
return hidden_state
class VanLargeKernelAttentionLayer(nn.Module):
"""
Computes attention using Large Kernel Attention (LKA) and attends the input.
"""
def __init__(self, hidden_size: int):
super().__init__()
self.attention = VanLargeKernelAttention(hidden_size)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
attention = self.attention(hidden_state)
attended = hidden_state * attention
return attended
class VanSpatialAttentionLayer(nn.Module):
"""
Van spatial attention layer composed by projection (via conv) -> act -> Large Kernel Attention (LKA) attention ->
projection (via conv) + residual connection.
"""
def __init__(self, hidden_size: int, hidden_act: str = "gelu"):
super().__init__()
self.pre_projection = nn.Sequential(
OrderedDict(
[
("conv", nn.Conv2d(hidden_size, hidden_size, kernel_size=1)),
("act", ACT2FN[hidden_act]),
]
)
)
self.attention_layer = VanLargeKernelAttentionLayer(hidden_size)
self.post_projection = nn.Conv2d(hidden_size, hidden_size, kernel_size=1)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
residual = hidden_state
hidden_state = self.pre_projection(hidden_state)
hidden_state = self.attention_layer(hidden_state)
hidden_state = self.post_projection(hidden_state)
hidden_state = hidden_state + residual
return hidden_state
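The spatial attention layer is a gated residual block: project, activate, gate with LKA, project again, then add the input back. Because of the residual connection, zeroing the final projection turns the whole block into an identity map, which makes a handy sanity check (same assumed import path):

```python
import torch
from torch import nn

from transformers.models.deprecated.van.modeling_van import VanSpatialAttentionLayer  # assumed path, as above

layer = VanSpatialAttentionLayer(hidden_size=32, hidden_act="gelu").eval()
hidden_state = torch.randn(1, 32, 28, 28)

# Zero the last projection: the gated branch contributes nothing, so only the residual survives.
nn.init.zeros_(layer.post_projection.weight)
nn.init.zeros_(layer.post_projection.bias)
with torch.no_grad():
    output = layer(hidden_state)
print(torch.allclose(output, hidden_state))  # True
```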
class VanLayerScaling(nn.Module):
"""
Scales the inputs by a learnable parameter initialized by `initial_value`.
"""
def __init__(self, hidden_size: int, initial_value: float = 1e-2):
super().__init__()
self.weight = nn.Parameter(initial_value * torch.ones((hidden_size)), requires_grad=True)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.weight.unsqueeze(-1).unsqueeze(-1) * hidden_state
return hidden_state
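The two `unsqueeze(-1)` calls turn the learnable per-channel weight of shape `(hidden_size,)` into `(hidden_size, 1, 1)` so that it broadcasts over height and width. A tiny sketch (same assumed import path):

```python
import torch

from transformers.models.deprecated.van.modeling_van import VanLayerScaling  # assumed path, as above

scaling = VanLayerScaling(hidden_size=4, initial_value=1e-2)
hidden_state = torch.ones(1, 4, 2, 2)

# Every spatial position of channel c is multiplied by weight[c]; right after init that is 1e-2.
output = scaling(hidden_state)
print(output[0, :, 0, 0])  # tensor([0.0100, 0.0100, 0.0100, 0.0100], grad_fn=...)
```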
class VanLayer(nn.Module):
"""
Van layer composed by normalization layers, large kernel attention (LKA) and a multi layer perceptron (MLP).
"""
def __init__(
self,
config: VanConfig,
hidden_size: int,
mlp_ratio: int = 4,
drop_path_rate: float = 0.5,
):
super().__init__()
self.drop_path = VanDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.pre_normomalization = nn.BatchNorm2d(hidden_size)
self.attention = VanSpatialAttentionLayer(hidden_size, config.hidden_act)
self.attention_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value)
self.post_normalization = nn.BatchNorm2d(hidden_size)
self.mlp = VanMlpLayer(
hidden_size, hidden_size * mlp_ratio, hidden_size, config.hidden_act, config.dropout_rate
)
self.mlp_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value)
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
residual = hidden_state
hidden_state = self.pre_normomalization(hidden_state)
hidden_state = self.attention(hidden_state)
hidden_state = self.attention_scaling(hidden_state)
hidden_state = self.drop_path(hidden_state)
hidden_state = residual + hidden_state
residual = hidden_state
hidden_state = self.post_normalization(hidden_state)
hidden_state = self.mlp(hidden_state)
hidden_state = self.mlp_scaling(hidden_state)
hidden_state = self.drop_path(hidden_state)
hidden_state = residual + hidden_state
return hidden_state
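`VanLayer` stacks two pre-normalized residual sub-blocks, spatial attention then the MLP, each followed by layer scaling and stochastic depth. A sketch of a single layer built from a default `VanConfig`; the config class is assumed to still be exported from `transformers`, and `eval()` turns drop path and dropout into no-ops:

```python
import torch

# Assumed imports: VanConfig from the top-level package, VanLayer from the modeling file above.
from transformers import VanConfig
from transformers.models.deprecated.van.modeling_van import VanLayer

config = VanConfig()
layer = VanLayer(config, hidden_size=64, mlp_ratio=4, drop_path_rate=0.1)
layer.eval()  # drop_path and dropout are inactive outside training

hidden_state = torch.randn(1, 64, 56, 56)
with torch.no_grad():
    output = layer(hidden_state)
print(output.shape)  # torch.Size([1, 64, 56, 56]); both residual branches preserve the shape
```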
class VanEncoder(nn.Module):
    """
    VanEncoder, consisting of multiple stages.
    """
    def forward(
        self,
        hidden_state: torch.Tensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
all_hidden_states = () if output_hidden_states else None
for _, stage_module in enumerate(self.stages):
hidden_state = stage_module(hidden_state)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
if not return_dict:
return tuple(v for v in [hidden_state, all_hidden_states] if v is not None)
return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states)
class VanPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = VanConfig
base_model_prefix = "van"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
nn.init.trunc_normal_(module.weight, std=self.config.initializer_range)
if isinstance(module, nn.Linear) and module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.LayerNorm):
nn.init.constant_(module.bias, 0)
nn.init.constant_(module.weight, 1.0)
elif isinstance(module, nn.Conv2d):
fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
fan_out //= module.groups
module.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if module.bias is not None:
module.bias.data.zero_()
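For `nn.Conv2d` modules this is the classic He (fan-out) initialization: `fan_out = kernel_h * kernel_w * out_channels / groups` and the weights are drawn from a zero-mean normal with std `sqrt(2 / fan_out)`. A small numeric check of the formula, independent of any checkpoint:

```python
import math
from torch import nn

conv = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1, groups=1)

fan_out = conv.kernel_size[0] * conv.kernel_size[1] * conv.out_channels // conv.groups
std = math.sqrt(2.0 / fan_out)
print(fan_out, round(std, 4))  # 576 0.0589

# Re-initialize the way _init_weights does and look at the empirical std of the weights.
conv.weight.data.normal_(0, std)
print(round(conv.weight.std().item(), 3))  # ~0.059
```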
VAN_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`VanConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
VAN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`ConvNextImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare VAN model outputting raw features without any specific head on top. Note, VAN does not have an embedding"
" layer.",
VAN_START_DOCSTRING,
)
class VanModel(VanPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = VanEncoder(config)
self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
self.post_init()
@add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor],
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state.mean(dim=[-2, -1])
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
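End to end, `VanModel` returns the last feature map together with a pooled vector obtained by averaging over the two spatial dimensions. A usage sketch with a randomly initialized model (no pretrained weights, so only the shapes are meaningful); `VanConfig`/`VanModel` are assumed to be importable from `transformers`:

```python
import torch

from transformers import VanConfig, VanModel  # assumed to remain exported for the deprecated model

config = VanConfig()
model = VanModel(config).eval()

pixel_values = torch.randn(1, config.num_channels, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)

print(outputs.last_hidden_state.shape)  # (1, config.hidden_sizes[-1], 7, 7) for 224x224 inputs
print(outputs.pooler_output.shape)      # (1, config.hidden_sizes[-1]): mean over height and width
print(len(outputs.hidden_states))       # one feature map per stage
```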
@add_start_docstrings(
"""
    VAN Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet.
""",
VAN_START_DOCSTRING,
)
class VanForImageClassification(VanPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.van = VanModel(config)
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.van(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.config.num_labels == 1:
self.config.problem_type = "regression"
elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.config.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
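The classification head is a single linear layer on the pooled features, and the loss is picked from `problem_type`: MSE for regression, cross-entropy for integer class labels, BCE-with-logits for multi-label targets. A sketch of the single-label path with an untrained model (assumed top-level imports, so the loss value itself is meaningless):

```python
import torch

from transformers import VanConfig, VanForImageClassification  # assumed to remain exported

config = VanConfig(num_labels=10)
model = VanForImageClassification(config).eval()

pixel_values = torch.randn(2, config.num_channels, 224, 224)
labels = torch.tensor([1, 7])  # integer labels -> "single_label_classification" -> CrossEntropyLoss

outputs = model(pixel_values, labels=labels)
print(outputs.logits.shape)  # torch.Size([2, 10])
print(outputs.loss)          # scalar cross-entropy loss
```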
.\models\deprecated\van\__init__.py
from typing import TYPE_CHECKING
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_van": ["VAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "VanConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_van"] = [
"VAN_PRETRAINED_MODEL_ARCHIVE_LIST",
"VanForImageClassification",
"VanModel",
"VanPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_van import VAN_PRETRAINED_CONFIG_ARCHIVE_MAP, VanConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_van import (
VAN_PRETRAINED_MODEL_ARCHIVE_LIST,
VanForImageClassification,
VanModel,
VanPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
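With `_LazyModule`, importing the package only registers the import structure; submodules are loaded on first attribute access, while the `TYPE_CHECKING` branch gives static type checkers the real imports. A small sketch of the observable behaviour (module path assumed to match the file above; exact timing can vary if something else in the process already imported the submodule):

```python
import sys

# Importing the package installs the lazy module but does not load its submodules yet.
import transformers.models.deprecated.van as van_package

print("transformers.models.deprecated.van.configuration_van" in sys.modules)  # typically False

# The first attribute access makes _LazyModule import the submodule that defines VanConfig.
_ = van_package.VanConfig
print("transformers.models.deprecated.van.configuration_van" in sys.modules)  # True
```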
.\models\deprecated\__init__.py
import os
import sys
def list_files(directory):
files = []
for dirpath, _, filenames in os.walk(directory):
for f in filenames:
files.append(os.path.abspath(os.path.join(dirpath, f)))
return files
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python list_files.py directory")
sys.exit(1)
directory = sys.argv[1]
files = list_files(directory)
for f in files:
print(f)
.\models\depth_anything\configuration_depth_anything.py
""" DepthAnything model configuration"""
import copy
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"LiheYoung/depth-anything-small-hf": "https://huggingface.co/LiheYoung/depth-anything-small-hf/resolve/main/config.json",
}
class DepthAnythingConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate a DepthAnything
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DepthAnything
[LiheYoung/depth-anything-small-hf](https://huggingface.co/LiheYoung/depth-anything-small-hf) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*):
The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
leverage the [`AutoBackbone`] API.
backbone (`str`, *optional*):
Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
Whether to use pretrained weights for the backbone.
patch_size (`int`, *optional*, defaults to 14):
The size of the patches to extract from the backbone features.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
reassemble_hidden_size (`int`, *optional*, defaults to 384):
The number of input channels of the reassemble layers.
reassemble_factors (`List[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
The up/downsampling factors of the reassemble layers.
        neck_hidden_sizes (`List[int]`, *optional*, defaults to `[48, 96, 192, 384]`):
The hidden sizes to project to for the feature maps of the backbone.
fusion_hidden_size (`int`, *optional*, defaults to 64):
The number of channels before fusion.
head_in_index (`int`, *optional*, defaults to -1):
The index of the features to use in the depth estimation head.
head_hidden_size (`int`, *optional*, defaults to 32):
The number of output channels in the second convolution of the depth estimation head.
Example:
```
>>> from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation
>>> # Initializing a DepthAnything small style configuration
>>> configuration = DepthAnythingConfig()
>>> # Initializing a model from the DepthAnything small style configuration
>>> model = DepthAnythingForDepthEstimation(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
    ```"""
# Define `model_type` as "depth_anything" for identifying the model type.
model_type = "depth_anything"
# Constructor for initializing the DepthAnythingForDepthEstimation class.
def __init__(
self,
backbone_config=None,
backbone=None,
use_pretrained_backbone=False,
patch_size=14,
initializer_range=0.02,
reassemble_hidden_size=384,
reassemble_factors=[4, 2, 1, 0.5],
neck_hidden_sizes=[48, 96, 192, 384],
fusion_hidden_size=64,
head_in_index=-1,
head_hidden_size=32,
**kwargs,
):
        # Call the parent class initializer, forwarding all remaining keyword arguments
        super().__init__(**kwargs)
if use_pretrained_backbone:
            # Pretrained backbones are not supported yet, so raise a ValueError
raise ValueError("Pretrained backbones are not supported yet.")
if backbone_config is not None and backbone is not None:
            # `backbone` and `backbone_config` cannot both be specified, so raise a ValueError
raise ValueError("You can't specify both `backbone` and `backbone_config`.")
if backbone_config is None and backbone is None:
            # Neither `backbone_config` nor `backbone` was given: log it and fall back to the default `Dinov2` backbone configuration
logger.info("`backbone_config` is `None`. Initializing the config with the default `Dinov2` backbone.")
backbone_config = CONFIG_MAPPING["dinov2"](
image_size=518,
hidden_size=384,
num_attention_heads=6,
out_indices=[9, 10, 11, 12],
apply_layernorm=True,
reshape_hidden_states=False,
)
elif isinstance(backbone_config, dict):
            # `backbone_config` is a dict: look up the config class from its `model_type` and build `backbone_config` from the dict
backbone_model_type = backbone_config.get("model_type")
config_class = CONFIG_MAPPING[backbone_model_type]
backbone_config = config_class.from_dict(backbone_config)
self.backbone_config = backbone_config
self.backbone = backbone
self.use_pretrained_backbone = use_pretrained_backbone
self.reassemble_hidden_size = reassemble_hidden_size
self.patch_size = patch_size
self.initializer_range = initializer_range
self.reassemble_factors = reassemble_factors
self.neck_hidden_sizes = neck_hidden_sizes
self.fusion_hidden_size = fusion_hidden_size
self.head_in_index = head_in_index
self.head_hidden_size = head_hidden_size
def to_dict(self):
"""
        Serializes this instance to a Python dictionary. Overrides the default `PretrainedConfig.to_dict`. Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
if output["backbone_config"] is not None:
            # If `backbone_config` is set, serialize it to a dict as well
output["backbone_config"] = self.backbone_config.to_dict()
output["model_type"] = self.__class__.model_type
return output
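Because `backbone_config` is itself a `PretrainedConfig`, the overridden `to_dict` serializes it recursively so the whole configuration can round-trip through JSON, and it pins `model_type` to the class attribute. A short sketch (exact keys depend on the installed version):

```python
from transformers import DepthAnythingConfig

config = DepthAnythingConfig()  # no backbone given, so the default Dinov2 backbone config is created
serialized = config.to_dict()

print(type(serialized["backbone_config"]))          # <class 'dict'>: the nested config became a plain dict
print(serialized["backbone_config"]["model_type"])  # dinov2
print(serialized["model_type"])                     # depth_anything
```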