Transformers Source Code Analysis (89)
.\models\pix2struct\processing_pix2struct.py
"""
Processor class for Pix2Struct.
"""
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class Pix2StructProcessor(ProcessorMixin):
r"""
    Constructs a PIX2STRUCT processor which wraps a T5 tokenizer and a PIX2STRUCT image processor into a single
processor.
[`Pix2StructProcessor`] offers all the functionalities of [`Pix2StructImageProcessor`] and [`T5TokenizerFast`]. See
the docstring of [`~Pix2StructProcessor.__call__`] and [`~Pix2StructProcessor.decode`] for more information.
Args:
image_processor (`Pix2StructImageProcessor`):
An instance of [`Pix2StructImageProcessor`]. The image processor is a required input.
tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]):
            An instance of [`T5TokenizerFast`] or [`T5Tokenizer`]. The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "Pix2StructImageProcessor"
tokenizer_class = ("T5Tokenizer", "T5TokenizerFast")
def __init__(self, image_processor, tokenizer):
tokenizer.return_token_type_ids = False
super().__init__(image_processor, tokenizer)
def __call__(
self,
images=None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
max_patches: Optional[int] = 2048,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_token_type_ids: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
"""
Process input images and text into a format suitable for PIX2STRUCT tasks.
Args:
images (optional): Input images to process.
text (Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]): Input text data.
add_special_tokens (bool): Whether to add special tokens (like [CLS], [SEP]) or not.
padding (Union[bool, str, PaddingStrategy]): Padding strategy for text sequences.
truncation (Union[bool, str, TruncationStrategy]): Truncation strategy for text sequences.
max_length (Optional[int]): Maximum sequence length to enforce.
max_patches (Optional[int]): Maximum number of patches to consider.
stride (int): Stride length for patch extraction.
pad_to_multiple_of (Optional[int]): Pad the sequence length to a multiple of this value.
return_attention_mask (Optional[bool]): Whether to return attention masks.
return_overflowing_tokens (bool): Whether to return overflowing tokens.
return_special_tokens_mask (bool): Whether to return special tokens mask.
return_offsets_mapping (bool): Whether to return offsets mapping.
return_token_type_ids (bool): Whether to return token type IDs (not used in this processor).
return_length (bool): Whether to return sequence length.
verbose (bool): Whether to print verbose information.
return_tensors (Optional[Union[str, TensorType]]): Desired tensor type for returned tensors.
Returns:
BatchEncoding: Processed inputs in a batch encoding format.
Notes:
This method processes both images and text to prepare them for PIX2STRUCT tasks.
It incorporates functionality from both `Pix2StructImageProcessor` and `T5TokenizerFast`.
"""
pass
def batch_decode(self, *args, **kwargs):
"""
        This method forwards all its arguments to the underlying tokenizer's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
        This method forwards all its arguments to the underlying tokenizer's [`~PreTrainedTokenizer.decode`]. Please
        refer to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
"""
This property returns a list of unique model input names by combining tokenizer's and image_processor's input names.
"""
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
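# Illustration (not part of the library source): a minimal usage sketch of Pix2StructProcessor,
# assuming the public "google/pix2struct-textcaps-base" checkpoint and a local image file named
# "example.png" (both are assumptions for the example only).
from PIL import Image
from transformers import Pix2StructProcessor

processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
inputs = processor(images=Image.open("example.png"), text="A picture of", return_tensors="pt")
# The image processor contributes the patch features; when `text` is given, the tokenizer outputs
# are renamed to decoder inputs (e.g. flattened_patches, attention_mask, decoder_input_ids, ...).
print(sorted(inputs.keys()))
print(processor.model_input_names)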
.\models\pix2struct\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_pix2struct": [
"PIX2STRUCT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Pix2StructConfig",
"Pix2StructTextConfig",
"Pix2StructVisionConfig",
],
"processing_pix2struct": ["Pix2StructProcessor"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_pix2struct"] = ["Pix2StructImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_pix2struct"] = [
"PIX2STRUCT_PRETRAINED_MODEL_ARCHIVE_LIST",
"Pix2StructPreTrainedModel",
"Pix2StructForConditionalGeneration",
"Pix2StructVisionModel",
"Pix2StructTextModel",
]
if TYPE_CHECKING:
from .configuration_pix2struct import (
PIX2STRUCT_PRETRAINED_CONFIG_ARCHIVE_MAP,
Pix2StructConfig,
Pix2StructTextConfig,
Pix2StructVisionConfig,
)
from .processing_pix2struct import Pix2StructProcessor
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_pix2struct import Pix2StructImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_pix2struct import (
PIX2STRUCT_PRETRAINED_MODEL_ARCHIVE_LIST,
Pix2StructForConditionalGeneration,
Pix2StructPreTrainedModel,
Pix2StructTextModel,
Pix2StructVisionModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\plbart\configuration_plbart.py
""" PLBART model configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfigWithPast
from ...utils import logging
logger = logging.get_logger(__name__)
PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"uclanlp/plbart-base": "https://huggingface.co/uclanlp/plbart-base/resolve/main/config.json",
}
class PLBartConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PLBartModel`]. It is used to instantiate an
PLBART model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the PLBART
[uclanlp/plbart-base](https://huggingface.co/uclanlp/plbart-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import PLBartConfig, PLBartModel
>>> # Initializing a PLBART uclanlp/plbart-base style configuration
>>> configuration = PLBartConfig()
>>> # Initializing a model (with random weights) from the uclanlp/plbart-base style configuration
>>> model = PLBartModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "plbart"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=50005,
max_position_embeddings=1024,
encoder_layers=6,
encoder_ffn_dim=3072,
encoder_attention_heads=12,
decoder_layers=6,
decoder_ffn_dim=3072,
decoder_attention_heads=12,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
use_cache=True,
is_encoder_decoder=True,
activation_function="gelu",
d_model=768,
dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
init_std=0.02,
classifier_dropout=0.0,
scale_embedding=True,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
forced_eos_token_id=2,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
class PLBartOnnxConfig(OnnxConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("input_ids", {0: "batch", 1: "sequence"}),
("attention_mask", {0: "batch", 1: "sequence"}),
]
)
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
if self.use_past:
return OrderedDict(
[
("last_hidden_state", {0: "batch", 1: "sequence"}),
("past_keys", {0: "batch", 2: "sequence"}),
("encoder_last_hidden_state", {0: "batch", 1: "sequence"}),
]
)
else:
return OrderedDict(
[
("last_hidden_state", {0: "batch", 1: "sequence"}),
("encoder_last_hidden_state", {0: "batch", 1: "sequence"}),
]
)
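# Illustration (not part of the library source): a quick sketch showing how the attribute map and
# the ONNX axis declarations above behave on a default configuration.
config = PLBartConfig()
assert config.hidden_size == config.d_model == 768
assert config.num_attention_heads == config.encoder_attention_heads == 12

onnx_config = PLBartOnnxConfig(config)
print(onnx_config.inputs)   # input_ids / attention_mask with dynamic batch and sequence axes
print(onnx_config.outputs)  # also lists `past_keys` when constructed with `use_past=True`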
.\models\plbart\convert_plbart_original_checkpoint_to_torch.py
import argparse
import torch
from torch import nn
from transformers import PLBartConfig, PLBartForConditionalGeneration, PLBartForSequenceClassification
def remove_ignore_keys_(state_dict):
ignore_keys = [
"encoder.version",
"decoder.version",
"model.encoder.version",
"model.decoder.version",
"_float_tensor",
"decoder.output_projection.weight",
]
for k in ignore_keys:
state_dict.pop(k, None)
def make_linear_from_emb(emb):
vocab_size, emb_size = emb.weight.shape
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
lin_layer.weight.data = emb.weight.data
return lin_layer
def convert_fairseq_plbart_checkpoint_from_disk(
checkpoint_path, hf_config_path="uclanlp/plbart-base", finetuned=False, classification=False
):
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
remove_ignore_keys_(state_dict)
vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]
plbart_config = PLBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size)
state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
if not classification:
model = PLBartForConditionalGeneration(plbart_config)
model.model.load_state_dict(state_dict)
if finetuned:
model.lm_head = make_linear_from_emb(model.model.shared)
else:
classification_head = {}
for key, value in state_dict.copy().items():
if key.startswith("classification_heads.sentence_classification_head"):
classification_head[key.replace("classification_heads.sentence_classification_head.", "")] = value
state_dict.pop(key)
model = PLBartForSequenceClassification(plbart_config)
model.model.load_state_dict(state_dict)
model.classification_head.load_state_dict(classification_head)
return model
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("fairseq_path", type=str, help="model.pt on local filesystem.")
parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument(
"--hf_config",
default="uclanlp/plbart-base",
type=str,
help="Which huggingface architecture to use: plbart-base",
)
parser.add_argument("--finetuned", action="store_true", help="whether the model is a fine-tuned checkpoint")
parser.add_argument(
"--classification", action="store_true", help="whether the model is a classification checkpoint"
)
args = parser.parse_args()
model = convert_fairseq_plbart_checkpoint_from_disk(
args.fairseq_path,
hf_config_path=args.hf_config,
finetuned=args.finetuned,
classification=args.classification,
)
model.save_pretrained(args.pytorch_dump_folder_path)
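# Illustration (not part of the library source): typical ways to run the converter. The checkpoint
# and output paths below are placeholders, not real files.
#
#   python convert_plbart_original_checkpoint_to_torch.py /path/to/model.pt /path/to/output_dir \
#       --hf_config uclanlp/plbart-base
#
# or, from Python:
#
#   model = convert_fairseq_plbart_checkpoint_from_disk("/path/to/model.pt", hf_config_path="uclanlp/plbart-base")
#   model.save_pretrained("/path/to/output_dir")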
.\models\plbart\modeling_plbart.py
""" PyTorch PLBART model."""
import copy
import math
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import (
_prepare_4d_attention_mask,
_prepare_4d_attention_mask_for_sdpa,
_prepare_4d_causal_attention_mask,
_prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_plbart import PLBartConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "uclanlp/plbart-base"
_CONFIG_FOR_DOC = "PLBartConfig"
PLBART_PRETRAINED_MODEL_ARCHIVE_LIST = [
"uclanlp/plbart-base",
"uclanlp/plbart-cs-java",
"uclanlp/plbart-multi_task-all",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
"""
    Shift input ids one token to the right, and wrap the last non-pad token (the <LID> token). Note that PLBart, like
    MBart, does not have a single `decoder_start_token_id`, in contrast to other Bart-like models.
"""
prev_output_tokens = input_ids.clone()
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()
prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()
prev_output_tokens[:, 0] = decoder_start_tokens
return prev_output_tokens
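# Illustration (not part of the library source): a toy example of the wrapping behaviour above.
# With bos=0, eos=2 and a language id of 50003, the labels `[bos, tok, tok, eos, lang]` become the
# decoder inputs `[lang, bos, tok, tok, eos]`:
example_labels = torch.tensor([[0, 5, 6, 2, 50003]])
print(shift_tokens_right(example_labels, pad_token_id=1))  # tensor([[50003, 0, 5, 6, 2]])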
class PLBartLearnedPositionalEmbedding(nn.Embedding):
"""
    This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
"""`input_ids'的形状预期为[bsz x seqlen]。"""
bsz, seq_len = input_ids.shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
).expand(bsz, -1)
return super().forward(positions + self.offset)
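# Illustration (not part of the library source): the embedding table is allocated with two extra
# rows (`self.offset = 2`), mirroring the original fairseq checkpoints, and the forward pass simply
# looks up consecutive positions starting at `past_key_values_length`.
pos_embed = PLBartLearnedPositionalEmbedding(num_embeddings=1024, embedding_dim=16)
print(pos_embed.weight.shape)                                # torch.Size([1026, 16])
print(pos_embed(torch.zeros(2, 5, dtype=torch.long)).shape)  # torch.Size([2, 5, 16])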
class PLBartAttention(nn.Module):
"""来自'Attention Is All You Need'论文的多头注意力模块"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[PLBartConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim必须能被num_heads整除 (得到 `embed_dim`: {self.embed_dim}"
f" 和 `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
pass
class PLBartEncoderLayer(nn.Module):
def __init__(self, config: PLBartConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = PLBART_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
PLBART_ATTENTION_CLASSES = {"eager": PLBartAttention}
class PLBartDecoderLayer(nn.Module):
def __init__(self, config: PLBartConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = PLBART_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = PLBART_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
    ):
        pass
class PLBartClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(
self,
input_dim: int,
inner_dim: int,
num_classes: int,
pooler_dropout: float,
):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
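# Illustration (not part of the library source): the head maps a single pooled hidden state
# (PLBartForSequenceClassification uses the representation of the final <eos> token) to the label
# logits via dropout -> dense -> tanh -> dropout -> projection.
head = PLBartClassificationHead(input_dim=768, inner_dim=768, num_classes=3, pooler_dropout=0.0)
pooled_output = torch.randn(4, 768)   # (batch_size, d_model)
print(head(pooled_output).shape)      # torch.Size([4, 3])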
class PLBartPreTrainedModel(PreTrainedModel):
config_class = PLBartConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["PLBartDecoderLayer", "PLBartEncoderLayer"]
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
PLBART_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`PLBartConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
PLBART_GENERATION_EXAMPLE = r"""
Mask-filling example:
```
>>> from transformers import AutoTokenizer, PLBartForConditionalGeneration
>>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
>>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
>>> # en_XX is the language symbol id <LID> for English
>>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
>>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids
>>> logits = model(input_ids).logits
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)
>>> tokenizer.decode(predictions).split()
['first', 'same', 'highest', 'result', 'number']
```
"""
PLBART_INPUTS_DOCSTRING = r"""
"""
class PLBartEncoder(PLBartPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`PLBartEncoderLayer`].
Args:
config: PLBartConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: PLBartConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = PLBartLearnedPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
)
self.layers = nn.ModuleList([PLBartEncoderLayer(config) for _ in range(config.encoder_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self._use_sdpa = config._attn_implementation == "sdpa"
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
class PLBartDecoder(PLBartPreTrainedModel):
"""
Transformer解码器,由config.decoder_layers层组成。每一层是一个[`PLBartDecoderLayer`]
Args:
config: PLBartConfig
embed_tokens (nn.Embedding): 输出的嵌入层
"""
def __init__(self, config: PLBartConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = PLBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
self.layers = nn.ModuleList([PLBartDecoderLayer(config) for _ in range(config.decoder_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self._use_sdpa = config._attn_implementation == "sdpa"
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
class PLBartModel(PLBartPreTrainedModel):
    def __init__(self, config: PLBartConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = PLBartEncoder(config, self.shared)
self.decoder = PLBartDecoder(config, self.shared)
self.init_weights()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(PLBART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.LongTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        pass
@add_start_docstrings(
"The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.",
PLBART_START_DOCSTRING,
)
class PLBartForConditionalGeneration(PLBartPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: PLBartConfig):
super().__init__(config)
self.model = PLBartModel(config)
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.init_weights()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
return new_embeddings
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
self.register_buffer("final_logits_bias", new_bias)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(PLBART_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(PLBART_GENERATION_EXAMPLE)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.LongTensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Returns:
            Depending on `return_dict` (which falls back to `config.use_return_dict` when `None`):
            - if `False`, a tuple with `lm_logits` followed by the remaining model outputs.
            - if `True`, a [`Seq2SeqLMOutput`] containing the loss, logits and the other outputs.
        """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
lm_logits = self.lm_head(outputs[0])
lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
else:
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids: torch.LongTensor,
past_key_values: Optional[List[torch.FloatTensor]] = None,
attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
**kwargs,
) -> Dict[str, Any]:
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
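# Illustration (not part of the library source): `_reorder_cache` uses `index_select` along the
# batch/beam dimension so that every surviving beam keeps its own cached keys and values.
past_state = torch.arange(6).view(3, 2)   # pretend cache slice for 3 beams
beam_idx = torch.tensor([2, 0, 0])        # beams selected at this step
print(past_state.index_select(0, beam_idx))
# tensor([[4, 5],
#         [0, 1],
#         [0, 1]])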
@add_start_docstrings(
"""
PLBart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for code
classification.
""",
PLBART_START_DOCSTRING,
)
class PLBartForSequenceClassification(PLBartPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: PLBartConfig, **kwargs):
super().__init__(config, **kwargs)
self.model = PLBartModel(config)
self.classification_head = PLBartClassificationHead(
config.d_model,
config.d_model,
config.num_labels,
config.classifier_dropout,
)
self.post_init()
@add_start_docstrings_to_model_forward(PLBART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
pass
class PLBartDecoderWrapper(PLBartPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = PLBartDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
class PLBartForCausalLM(PLBartPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = PLBartDecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
    ):
        pass
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
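# Illustration (not part of the library source): the prefix-trimming logic in
# `prepare_inputs_for_generation` keeps only the tokens not yet covered by the cache, which in the
# common incremental-decoding case is just the last generated token.
input_ids = torch.tensor([[11, 12, 13, 14]])
past_length = 3  # number of positions already stored in past_key_values
remove_prefix_length = past_length if input_ids.shape[1] > past_length else input_ids.shape[1] - 1
print(input_ids[:, remove_prefix_length:])  # tensor([[14]])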
.\models\plbart\tokenization_plbart.py
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"uclanlp/plbart-base": "https://huggingface.co/uclanlp/plbart-base/resolve/main/sentencepiece.bpe.model",
"uclanlp/plbart-c-cpp-defect-detection": (
"https://huggingface.co/uclanlp/plbart-c-cpp-defect-detection/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-cs-java": "https://huggingface.co/uclanlp/plbart-cs-java/resolve/main/sentencepiece.bpe.model",
"uclanlp/plbart-en_XX-java": (
"https://huggingface.co/uclanlp/plbart-en_XX-java/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-go-en_XX": (
"https://huggingface.co/uclanlp/plbart-go-en_XX/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-java-clone-detection": (
"https://huggingface.co/uclanlp/plbart-java-clone-detection/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-java-cs": "https://huggingface.co/uclanlp/plbart-java-cs/resolve/main/sentencepiece.bpe.model",
"uclanlp/plbart-java-en_XX": (
"https://huggingface.co/uclanlp/plbart-java-en_XX/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-javascript-en_XX": (
"https://huggingface.co/uclanlp/plbart-javascript-en_XX/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-php-en_XX": (
"https://huggingface.co/uclanlp/plbart-php-en_XX/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-python-en_XX": (
"https://huggingface.co/uclanlp/plbart-python-en_XX/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-refine-java-medium": (
"https://huggingface.co/uclanlp/plbart-refine-java-medium/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-refine-java-small": (
"https://huggingface.co/uclanlp/plbart-refine-java-small/resolve/main/sentencepiece.bpe.model"
),
"uclanlp/plbart-ruby-en_XX": (
"https://huggingface.co/uclanlp/plbart-ruby-en_XX/resolve/main/sentencepiece.bpe.model"
),
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"uclanlp/plbart-base": 1024,
"uclanlp/plbart-c-cpp-defect-detection": 1024,
"uclanlp/plbart-cs-java": 1024,
"uclanlp/plbart-en_XX-java": 1024,
"uclanlp/plbart-go-en_XX": 1024,
"uclanlp/plbart-java-clone-detection": 1024,
"uclanlp/plbart-java-cs": 1024,
"uclanlp/plbart-java-en_XX": 1024,
"uclanlp/plbart-javascript-en_XX": 1024,
"uclanlp/plbart-php-en_XX": 1024,
"uclanlp/plbart-python-en_XX": 1024,
"uclanlp/plbart-refine-java-medium": 1024,
"uclanlp/plbart-refine-java-small": 1024,
"uclanlp/plbart-ruby-en_XX": 1024,
}
FAIRSEQ_LANGUAGE_CODES = {
"base": ["__java__", "__python__", "__en_XX__"],
"multi": ["__java__", "__python__", "__en_XX__", "__javascript__", "__php__", "__ruby__", "__go__"],
}
FAIRSEQ_LANGUAGE_CODES_MAP = {
"java": "__java__",
"python": "__python__",
"en_XX": "__en_XX__",
"javascript": "__javascript__",
"php": "__php__",
"ruby": "__ruby__",
"go": "__go__",
}
class PLBartTokenizer(PreTrainedTokenizer):
"""
Construct an PLBART tokenizer.
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).
The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.
Args:
vocab_file (`str`):
Path to the vocabulary file. This specifies the location of the vocabulary file to be used by the tokenizer.
src_lang (`str`, *optional*):
A string representing the source language. If provided, specifies the source language for the tokenizer.
tgt_lang (`str`, *optional*):
A string representing the target language. If provided, specifies the target language for the tokenizer.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The start of sequence token. Defines the token used to mark the beginning of a sequence.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token. Defines the token used to mark the end of a sequence.
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token. Used in scenarios like sequence classification or question answering to separate sequences.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classification token. This token is used as the first token for all tasks.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. If a token is not found in the vocabulary, it is replaced with this token.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The padding token. Used to pad sequences to the same length during batching.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The mask token. This is the token used when training the model with masked (denoising) objectives.
language_codes (`str`, *optional*, defaults to `"base"`):
Specifies what language codes to use. Can be `"base"` or `"multi"`.
sp_model_kwargs (`dict`, *optional*):
Additional arguments passed to the `SentencePieceProcessor.__init__()` method. These parameters can configure
subword regularization and other SentencePiece settings like `enable_sampling`, `nbest_size`, and `alpha`.
See the [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) for details.
Examples:
```
>>> from transformers import PLBartTokenizer
>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
    >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
    >>> expected_translation_english = "Returns the maximum value of a b c."
    >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
    ```"""
    vocab_files_names = VOCAB_FILES_NAMES  # names of the vocabulary files expected by the tokenizer
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # maximum input sizes of the pretrained checkpoints
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # map from checkpoint name to vocabulary file URL
    model_input_names = ["input_ids", "attention_mask"]  # names of the inputs expected by the model
    prefix_tokens: List[int] = []  # special tokens prepended to every encoded sequence
    suffix_tokens: List[int] = []  # special tokens appended to every encoded sequence
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
language_codes="base",
tokenizer_file=None,
src_lang=None,
tgt_lang=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
additional_special_tokens=None,
**kwargs,
):
        # The body is omitted here: it loads the SentencePiece model, builds the fairseq token and
        # language-code mappings, and sets up the source/target language special tokens.
        pass
    def __getstate__(self):
        # Called when pickling: return a copy of the state with the SentencePiece processor replaced
        # by its serialized proto, since the processor object itself cannot be pickled.
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state
    def __setstate__(self, d):
        # Called when unpickling: restore the state and rebuild the SentencePiece processor.
        self.__dict__ = d
        # For backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}
        # Reload the SentencePiece model from the serialized proto
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
    @property
    def vocab_size(self):
        # Size of the vocabulary, accounting for the language codes and the fairseq offset
        if self.language_codes == "base":
            return (
                len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1
            )  # Plus 1 for the mask token
        else:
            return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
    @property
    def src_lang(self) -> str:
        # Current source language code
        return self._src_lang
    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        # Update the source language code and reset the special tokens accordingly
        new_src_lang = self._convert_lang_code_special_format(new_src_lang)
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
# If the token list already has special tokens, delegate to superclass method
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# Create lists of 1s corresponding to prefix and suffix tokens
prefix_ones = [1] * len(self.prefix_tokens)
suffix_ones = [1] * len(self.suffix_tokens)
# If token_ids_1 is None, return tokens with prefix, sequence tokens (0s), and suffix
if token_ids_1 is None:
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
# If token_ids_1 is provided, return tokens with prefix, token_ids_0, token_ids_1, and suffix
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An PLBART sequence has the following format, where `X` represents the sequence:
- `input_ids` (for encoder) `X [eos, src_lang_code]`
- `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary
"""
# If token_ids_1 is None, concatenate prefix, token_ids_0, and suffix tokens
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
# Otherwise, concatenate prefix, token_ids_0, token_ids_1, and suffix tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs from a sequence or a pair of sequences for sequence classification tasks. This is used
to distinguish between the two sequences in a model that supports sequence pairs.
Args:
token_ids_0 (`List[int]`):
List of IDs representing the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs representing the second sequence in a pair.
Returns:
`List[int]`: List of token type IDs (0 or 1) indicating the sequence type for each token.
"""
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. PLBart does not
make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Separator token ID used in sequence pairs
sep = [self.sep_token_id]
# CLS token ID used in sequence pairs
cls = [self.cls_token_id]
# If only one sequence is provided, return the mask for that sequence
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# If two sequences are provided, return the mask for both sequences concatenated
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def _build_translation_inputs(
self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
):
"""Used by translation pipeline, to prepare inputs for the generate function"""
# Ensure source and target languages are provided
if src_lang is None or tgt_lang is None:
raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
# Convert source and target language codes to special format
self.src_lang = self._convert_lang_code_special_format(src_lang)
self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
# Generate model inputs with special tokens and specified return type
inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
# Convert target language to its corresponding token ID
tgt_lang_id = self.convert_tokens_to_ids(self.tgt_lang)
# Add forced beginning-of-sequence token ID to inputs
inputs["forced_bos_token_id"] = tgt_lang_id
return inputs
def get_vocab(self):
# Create a vocabulary dictionary mapping token strings to their IDs
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
# Include any additional tokens introduced during model training
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text: str) -> List[str]:
# Tokenize input text using SentencePiece model and return as list of strings
return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token):
"""Converts a token (str) into an ID using the vocabulary."""
# Check if the token exists in the fairseq mapping
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
# Obtain token ID from SentencePiece model
spm_id = self.sp_model.PieceToId(token)
# Return unknown token ID if SentencePiece returns 0 (unknown token)
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) into a token (str) using the vocabulary."""
# Check if the index exists in the fairseq mapping
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
# Convert index to token using SentencePiece model
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) into a single string."""
# Concatenate tokens into a single string, replacing special sub-word marker with space
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Make sure the save directory exists; log an error and return otherwise
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        # Build the output path for the vocabulary file
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        # If the current vocabulary file exists at a different location, copy it to the output path
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        # Otherwise serialize the in-memory SentencePiece model and write it to the output path
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)
        # Return the output file path as a tuple
        return (out_vocab_file,)
def prepare_seq2seq_batch(
self,
src_texts: List[str],
src_lang: str = "en_XX",
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "python",
**kwargs,
) -> BatchEncoding:
        # Normalize the source language code to the tokenizer's special format
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        # Normalize the target language code to the tokenizer's special format
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        # Delegate the actual batch preparation to the parent class
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
    def _switch_to_input_mode(self):
        # Switch to input mode: apply the source-language special tokens
        return self.set_src_lang_special_tokens(self.src_lang)
    def _switch_to_target_mode(self):
        # Switch to target mode: apply the target-language special tokens
        return self.set_tgt_lang_special_tokens(self.tgt_lang)
    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        # Normalize the source language code to the tokenizer's special format
        src_lang = self._convert_lang_code_special_format(src_lang)
        # Look up the id of the language code (None if no source language is set)
        self.cur_lang_code = self.lang_code_to_id[src_lang] if src_lang is not None else None
        # No prefix tokens
        self.prefix_tokens = []
        # Suffix is [eos, src_lang_code] when a language code is set, otherwise just [eos]
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]
    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        # Normalize the target language code to the tokenizer's special format
        lang = self._convert_lang_code_special_format(lang)
        # Look up the id of the language code (None if no target language is set)
        self.cur_lang_code = self.lang_code_to_id[lang] if lang is not None else None
        # No prefix tokens
        self.prefix_tokens = []
        # Suffix is [eos, tgt_lang_code] when a language code is set, otherwise just [eos]
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]
    def _convert_lang_code_special_format(self, lang: str) -> str:
        """Convert language codes to the format the tokenizer uses, if required."""
        # Map plain codes (e.g. "python") to their fairseq form (e.g. "__python__"); otherwise keep as-is
        lang = FAIRSEQ_LANGUAGE_CODES_MAP[lang] if lang in FAIRSEQ_LANGUAGE_CODES_MAP else lang
        return lang
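# Illustration (not part of the library source): a usage sketch of the language handling, using the
# same checkpoint as the class docstring above (downloading the tokenizer files is assumed).
tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
ids = tokenizer("def f(): pass").input_ids
# Source sequences get no prefix and end with [eos, src_lang_code]:
print(tokenizer.convert_ids_to_tokens(ids[-2:]))  # expected to be ['</s>', '__python__']
# When preparing targets (e.g. via `text_target=...`), the suffix becomes [eos, tgt_lang_code], and
# `_build_translation_inputs` additionally sets `forced_bos_token_id` to the target language id.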
.\models\plbart\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {"configuration_plbart": ["PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP", "PLBartConfig"]}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_plbart"] = ["PLBartTokenizer"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_plbart"] = [
"PLBART_PRETRAINED_MODEL_ARCHIVE_LIST",
"PLBartForCausalLM",
"PLBartForConditionalGeneration",
"PLBartForSequenceClassification",
"PLBartModel",
"PLBartPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_plbart import PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP, PLBartConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_plbart import PLBartTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_plbart import (
PLBART_PRETRAINED_MODEL_ARCHIVE_LIST,
PLBartForCausalLM,
PLBartForConditionalGeneration,
PLBartForSequenceClassification,
PLBartModel,
PLBartPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
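The `_LazyModule` indirection above defers the heavy submodule imports until an attribute is actually accessed. A small illustration, assuming the package layout shown above:

```python
import transformers.models.plbart as plbart

# The module object is a _LazyModule; configuration_plbart / modeling_plbart are only
# imported on first attribute access, mirroring the _import_structure mapping above.
print(type(plbart).__name__)      # "_LazyModule"
config = plbart.PLBartConfig()    # triggers the import of configuration_plbart
print(config.model_type)          # "plbart"
```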
.\models\poolformer\configuration_poolformer.py
""" PoolFormer model configuration"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"sail/poolformer_s12": "https://huggingface.co/sail/poolformer_s12/resolve/main/config.json",
}
class PoolFormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of [`PoolFormerModel`]. It is used to instantiate a
PoolFormer model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the PoolFormer
[sail/poolformer_s12](https://huggingface.co/sail/poolformer_s12) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "poolformer"
def __init__(
self,
num_channels=3,
patch_size=16,
stride=16,
pool_size=3,
mlp_ratio=4.0,
depths=[2, 2, 6, 2],
hidden_sizes=[64, 128, 320, 512],
patch_sizes=[7, 3, 3, 3],
strides=[4, 2, 2, 2],
padding=[2, 1, 1, 1],
num_encoder_blocks=4,
drop_path_rate=0.0,
hidden_act="gelu",
use_layer_scale=True,
layer_scale_init_value=1e-5,
initializer_range=0.02,
**kwargs,
):
self.num_channels = num_channels
self.patch_size = patch_size
self.stride = stride
self.padding = padding
self.pool_size = pool_size
self.hidden_sizes = hidden_sizes
self.mlp_ratio = mlp_ratio
self.depths = depths
self.patch_sizes = patch_sizes
self.strides = strides
self.num_encoder_blocks = num_encoder_blocks
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.use_layer_scale = use_layer_scale
self.layer_scale_init_value = layer_scale_init_value
self.initializer_range = initializer_range
super().__init__(**kwargs)
class PoolFormerOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 2e-3
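A short sketch of how this configuration is typically used (random weights; the defaults mirror the `sail/poolformer_s12` architecture described in the docstring above):

```python
from transformers import PoolFormerConfig, PoolFormerModel

config = PoolFormerConfig()      # defaults: depths=[2, 2, 6, 2], hidden_sizes=[64, 128, 320, 512]
model = PoolFormerModel(config)  # randomly initialised; use from_pretrained() for the released checkpoint
print(sum(p.numel() for p in model.parameters()))
```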
.\models\poolformer\convert_poolformer_original_to_pytorch.py
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def replace_key_with_offset(key, offset, original_name, new_name):
"""
Replaces the key by subtracting the offset from the original layer number
Args:
key (str): 需要替换的键名
offset (int): 偏移量,用于计算新的块号
original_name (str): 原始层名称,用于定位需要替换的部分
new_name (str): 新的层名称,用于替换原始层名称
Returns:
str: 替换后的新键名
"""
to_find = original_name.split(".")[0]
key_list = key.split(".")
orig_block_num = int(key_list[key_list.index(to_find) - 2])
layer_num = int(key_list[key_list.index(to_find) - 1])
new_block_num = orig_block_num - offset
key = key.replace(f"{orig_block_num}.{layer_num}.{original_name}",
f"block.{new_block_num}.{layer_num}.{new_name}")
return key
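A worked example of the renaming helper above; it assumes `replace_key_with_offset` as defined is in scope, and the key is a hypothetical checkpoint entry:

```python
# Block 2, layer 3, first MLP projection. With a patch-embedding offset of 1 the block index drops to 1
# and "mlp.fc1" is mapped to "output.conv1".
key = "poolformer.encoder.2.3.mlp.fc1.weight"
print(replace_key_with_offset(key, offset=1, original_name="mlp.fc1", new_name="output.conv1"))
# -> poolformer.encoder.block.1.3.output.conv1.weight
```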
def rename_keys(state_dict):
new_state_dict = OrderedDict()
total_embed_found, patch_emb_offset = 0, 0
for key, value in state_dict.items():
if key.startswith("network"):
key = key.replace("network", "poolformer.encoder")
if "proj" in key:
if key.endswith("bias") and "patch_embed" not in key:
patch_emb_offset += 1
to_replace = key[: key.find("proj")]
key = key.replace(to_replace, f"patch_embeddings.{total_embed_found}.")
key = key.replace("proj", "projection")
if key.endswith("bias"):
total_embed_found += 1
if "patch_embeddings" in key:
key = "poolformer.encoder." + key
if "mlp.fc1" in key:
key = replace_key_with_offset(key, patch_emb_offset, "mlp.fc1", "output.conv1")
if "mlp.fc2" in key:
key = replace_key_with_offset(key, patch_emb_offset, "mlp.fc2", "output.conv2")
if "norm1" in key:
key = replace_key_with_offset(key, patch_emb_offset, "norm1", "before_norm")
if "norm2" in key:
key = replace_key_with_offset(key, patch_emb_offset, "norm2", "after_norm")
if "layer_scale_1" in key:
key = replace_key_with_offset(key, patch_emb_offset, "layer_scale_1", "layer_scale_1")
if "layer_scale_2" in key:
key = replace_key_with_offset(key, patch_emb_offset, "layer_scale_2", "layer_scale_2")
if "head" in key:
key = key.replace("head", "classifier")
new_state_dict[key] = value
return new_state_dict
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
return image
@torch.no_grad()
def convert_poolformer_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path):
"""
Copy/paste/tweak model's weights to our PoolFormer structure.
"""
config = PoolFormerConfig()
repo_id = "huggingface/label-files"
size = model_name[-3:]
config.num_labels = 1000
filename = "imagenet-1k-id2label.json"
expected_shape = (1, 1000)
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
if size == "s12":
config.depths = [2, 2, 6, 2]
config.hidden_sizes = [64, 128, 320, 512]
config.mlp_ratio = 4.0
crop_pct = 0.9
elif size == "s24":
config.depths = [4, 4, 12, 4]
config.hidden_sizes = [64, 128, 320, 512]
config.mlp_ratio = 4.0
crop_pct = 0.9
elif size == "s36":
config.depths = [6, 6, 18, 6]
config.hidden_sizes = [64, 128, 320, 512]
config.mlp_ratio = 4.0
config.layer_scale_init_value = 1e-6
crop_pct = 0.9
elif size == "m36":
config.depths = [6, 6, 18, 6]
config.hidden_sizes = [96, 192, 384, 768]
config.mlp_ratio = 4.0
config.layer_scale_init_value = 1e-6
crop_pct = 0.95
elif size == "m48":
config.depths = [8, 8, 24, 8]
config.hidden_sizes = [96, 192, 384, 768]
config.mlp_ratio = 4.0
config.layer_scale_init_value = 1e-6
crop_pct = 0.95
else:
raise ValueError(f"Size {size} not supported")
image_processor = PoolFormerImageProcessor(crop_pct=crop_pct)
image = prepare_img()
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
logger.info(f"Converting model {model_name}...")
state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
state_dict = rename_keys(state_dict)
model = PoolFormerForImageClassification(config)
model.load_state_dict(state_dict)
model.eval()
image_processor = PoolFormerImageProcessor(crop_pct=crop_pct)
pixel_values = image_processor(images=prepare_img(), return_tensors="pt").pixel_values
outputs = model(pixel_values)
logits = outputs.logits
if size == "s12":
expected_slice = torch.tensor([-0.3045, -0.6758, -0.4869])
elif size == "s24":
expected_slice = torch.tensor([0.4402, -0.1374, -0.8045])
elif size == "s36":
expected_slice = torch.tensor([-0.6080, -0.5133, -0.5898])
elif size == "m36":
expected_slice = torch.tensor([0.3952, 0.2263, -1.2668])
elif size == "m48":
expected_slice = torch.tensor([0.1167, -0.0656, -0.3423])
else:
raise ValueError(f"Size {size} not supported")
assert logits.shape == expected_shape
assert torch.allclose(logits[0, :3], expected_slice, atol=1e-2)
logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="poolformer_s12",
type=str,
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--checkpoint_path",
default=None,
type=str,
help="Path to the original PyTorch checkpoint (.pth file)."
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model."
)
args = parser.parse_args()
convert_poolformer_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path)
.\models\poolformer\feature_extraction_poolformer.py
"""PoolFormer 的特征提取器类。"""
import warnings
from ...utils import logging
from .image_processing_poolformer import PoolFormerImageProcessor
logger = logging.get_logger(__name__)
class PoolFormerFeatureExtractor(PoolFormerImageProcessor):
def __init__(self, *args, **kwargs) -> None:
warnings.warn(
"The class PoolFormerFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
" Please use PoolFormerImageProcessor instead.",
FutureWarning,
)
super().__init__(*args, **kwargs)
.\models\poolformer\image_processing_poolformer.py
"""PoolFormer的图像处理类。"""
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
class PoolFormerImageProcessor(BaseImageProcessor):
r"""
Constructs a PoolFormer image processor.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
crop_pct: int = 0.9,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
rescale_factor: Union[int, float] = 1 / 255,
do_rescale: bool = True,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, param_name="crop_size")
self.do_resize = do_resize
self.size = size
self.crop_pct = crop_pct
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"crop_pct",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
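A hedged usage sketch of the image processor above; a random PIL image stands in for real data:

```python
import numpy as np
from PIL import Image
from transformers import PoolFormerImageProcessor

processor = PoolFormerImageProcessor(crop_pct=0.9)
image = Image.fromarray((np.random.rand(256, 256, 3) * 255).astype(np.uint8))

# Resize (the target shortest edge is size / crop_pct), center-crop to 224x224, rescale and normalize.
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)   # torch.Size([1, 3, 224, 224])
```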
.\models\poolformer\modeling_poolformer.py
""" PyTorch PoolFormer model."""
import collections.abc
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutputWithNoAttention, ImageClassifierOutputWithNoAttention
from ...modeling_utils import PreTrainedModel
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_poolformer import PoolFormerConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "PoolFormerConfig"
_CHECKPOINT_FOR_DOC = "sail/poolformer_s12"
_EXPECTED_OUTPUT_SHAPE = [1, 512, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "sail/poolformer_s12"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"sail/poolformer_s12",
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in the main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect implementation I created for EfficientNet, etc.
networks, however, the original name is misleading as "Drop Connect" is a different form of dropout in a separate
paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
I've opted to change the layer and argument names to "drop path" rather than mix DropConnect as a layer name and
use "survival rate" as the argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
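A tiny numerical check of the helper above (it assumes `drop_path` as defined is in scope):

```python
import torch

torch.manual_seed(0)
x = torch.ones(8, 3, 4, 4)
out = drop_path(x, drop_prob=0.5, training=True)
# Each sample is either zeroed out or rescaled by 1 / keep_prob = 2.0, so the expectation stays at x.
print(out[:, 0, 0, 0])
```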
class PoolFormerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class PoolFormerEmbeddings(nn.Module):
"""
Construct Patch Embeddings.
"""
def __init__(self, hidden_size, num_channels, patch_size, stride, padding, norm_layer=None):
super().__init__()
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride)
padding = padding if isinstance(padding, collections.abc.Iterable) else (padding, padding)
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=stride, padding=padding)
self.norm = norm_layer(hidden_size) if norm_layer else nn.Identity()
def forward(self, pixel_values):
embeddings = self.projection(pixel_values)
embeddings = self.norm(embeddings)
return embeddings
class PoolFormerGroupNorm(nn.GroupNorm):
"""
Group Normalization with 1 group. Input: tensor in shape [B, C, H, W]
"""
def __init__(self, num_channels, **kwargs):
super().__init__(1, num_channels, **kwargs)
class PoolFormerPooling(nn.Module):
def __init__(self, pool_size):
super().__init__()
self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False)
def forward(self, hidden_states):
return self.pool(hidden_states) - hidden_states
class PoolFormerOutput(nn.Module):
def __init__(self, config, dropout_prob, hidden_size, intermediate_size):
super().__init__()
self.conv1 = nn.Conv2d(hidden_size, intermediate_size, 1)
self.conv2 = nn.Conv2d(intermediate_size, hidden_size, 1)
self.drop = PoolFormerDropPath(dropout_prob)
if isinstance(config.hidden_act, str):
self.act_fn = ACT2FN[config.hidden_act]
else:
self.act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.conv1(hidden_states)
hidden_states = self.act_fn(hidden_states)
hidden_states = self.drop(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = self.drop(hidden_states)
return hidden_states
class PoolFormerLayer(nn.Module):
"""This corresponds to the 'PoolFormerBlock' class in the original implementation."""
def __init__(self, config, num_channels, pool_size, hidden_size, intermediate_size, drop_path):
super().__init__()
self.pooling = PoolFormerPooling(pool_size)
self.output = PoolFormerOutput(config, drop_path, hidden_size, intermediate_size)
self.before_norm = PoolFormerGroupNorm(num_channels)
self.after_norm = PoolFormerGroupNorm(num_channels)
self.drop_path = PoolFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.use_layer_scale = config.use_layer_scale
if config.use_layer_scale:
self.layer_scale_1 = nn.Parameter(
config.layer_scale_init_value * torch.ones((num_channels)), requires_grad=True
)
self.layer_scale_2 = nn.Parameter(
config.layer_scale_init_value * torch.ones((num_channels)), requires_grad=True
)
def forward(self, hidden_states):
if self.use_layer_scale:
pooling_output = self.pooling(self.before_norm(hidden_states))
scaled_op = self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * pooling_output
hidden_states = hidden_states + self.drop_path(scaled_op)
outputs = ()
layer_output = self.output(self.after_norm(hidden_states))
scaled_op = self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * layer_output
output = hidden_states + self.drop_path(scaled_op)
outputs = (output,) + outputs
return outputs
else:
pooling_output = self.drop_path(self.pooling(self.before_norm(hidden_states)))
hidden_states = pooling_output + hidden_states
outputs = ()
layer_output = self.drop_path(self.output(self.after_norm(hidden_states)))
output = hidden_states + layer_output
outputs = (output,) + outputs
return outputs
class PoolFormerEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
embeddings = []
for i in range(config.num_encoder_blocks):
embeddings.append(
PoolFormerEmbeddings(
patch_size=config.patch_sizes[i],
stride=config.strides[i],
padding=config.padding[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
)
)
self.patch_embeddings = nn.ModuleList(embeddings)
blocks = []
cur = 0
for i in range(config.num_encoder_blocks):
layers = []
if i != 0:
cur += config.depths[i - 1]
for j in range(config.depths[i]):
layers.append(
PoolFormerLayer(
config,
num_channels=config.hidden_sizes[i],
pool_size=config.pool_size,
hidden_size=config.hidden_sizes[i],
intermediate_size=int(config.hidden_sizes[i] * config.mlp_ratio),
drop_path=dpr[cur + j],
)
)
blocks.append(nn.ModuleList(layers))
self.block = nn.ModuleList(blocks)
def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
all_hidden_states = () if output_hidden_states else None
hidden_states = pixel_values
for idx, layers in enumerate(zip(self.patch_embeddings, self.block)):
embedding_layer, block_layer = layers
hidden_states = embedding_layer(hidden_states)
for _, blk in enumerate(block_layer):
layer_outputs = blk(hidden_states)
hidden_states = layer_outputs[0]
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
class PoolFormerPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = PoolFormerConfig
base_model_prefix = "poolformer"
main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
POOLFORMER_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`PoolFormerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
POOLFORMER_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`PoolFormerImageProcessor.__call__`] for details.
"""
@add_start_docstrings(
"The bare PoolFormer Model transformer outputting raw hidden-states without any specific head on top.",
POOLFORMER_START_DOCSTRING,
)
class PoolFormerModel(PoolFormerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.encoder = PoolFormerEncoder(config)
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
@add_start_docstrings_to_model_forward(POOLFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithNoAttention]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
encoder_outputs = self.encoder(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output, None) + encoder_outputs[1:]
return BaseModelOutputWithNoAttention(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
)
class PoolFormerFinalPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
def forward(self, hidden_states):
output = self.dense(hidden_states)
return output
@add_start_docstrings(
"""
PoolFormer Model transformer with an image classification head on top
""",
POOLFORMER_START_DOCSTRING,
)
class PoolFormerForImageClassification(PoolFormerPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.poolformer = PoolFormerModel(config)
self.norm = PoolFormerGroupNorm(config.hidden_sizes[-1])
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(POOLFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.poolformer(
pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(self.norm(sequence_output).mean([-2, -1]))
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
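An end-to-end classification sketch with the released checkpoint referenced by the doc constants above (`sail/poolformer_s12`, expected label "tabby, tabby cat"):

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, PoolFormerForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("sail/poolformer_s12")
model = PoolFormerForImageClassification.from_pretrained("sail/poolformer_s12")

with torch.no_grad():
    logits = model(**processor(images=image, return_tensors="pt")).logits
print(model.config.id2label[logits.argmax(-1).item()])   # expected: "tabby, tabby cat"
```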
.\models\poolformer\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_poolformer": [
"POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
"PoolFormerConfig",
"PoolFormerOnnxConfig",
]
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_poolformer"] = ["PoolFormerFeatureExtractor"]
_import_structure["image_processing_poolformer"] = ["PoolFormerImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_poolformer"] = [
"POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"PoolFormerForImageClassification",
"PoolFormerModel",
"PoolFormerPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_poolformer import (
POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
PoolFormerConfig,
PoolFormerOnnxConfig,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_poolformer import PoolFormerFeatureExtractor
from .image_processing_poolformer import PoolFormerImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_poolformer import (
POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
PoolFormerForImageClassification,
PoolFormerModel,
PoolFormerPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\pop2piano\configuration_pop2piano.py
""" Pop2Piano model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"sweetcocoa/pop2piano": "https://huggingface.co/sweetcocoa/pop2piano/blob/main/config.json"
}
class Pop2PianoConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Pop2PianoForConditionalGeneration`]. It is used
to instantiate a Pop2PianoForConditionalGeneration model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
Pop2Piano [sweetcocoa/pop2piano](https://huggingface.co/sweetcocoa/pop2piano) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 2400):
Vocabulary size of the `Pop2PianoForConditionalGeneration` model. Defines the number of different tokens
that can be represented by the `inputs_ids` passed when calling `Pop2PianoForConditionalGeneration`.
composer_vocab_size (`int`, *optional*, defaults to 21):
Number of composers.
d_model (`int`, *optional*, defaults to 512):
Size of the encoder layers and the pooler layer.
d_kv (`int`, *optional*, defaults to 64):
Size of the key, query, value projections per attention head. The `inner_dim` of the projection layer will
be defined as `num_heads * d_kv`.
d_ff (`int`, *optional*, defaults to 2048):
Size of the intermediate feed-forward layer in each `Pop2PianoBlock`.
num_layers (`int`, *optional*, defaults to 6):
Number of hidden layers in the Transformer encoder.
num_decoder_layers (`int`, *optional*):
Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
num_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
relative_attention_num_buckets (`int`, *optional*, defaults to 32):
The number of buckets to use for each attention layer.
relative_attention_max_distance (`int`, *optional*, defaults to 128):
The maximum distance of the longer sequences for the bucket separation.
dropout_rate (`float`, *optional*, defaults to 0.1):
The ratio for all dropout layers.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should normally be kept at 1.0; used internally for
initialization testing).
feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
Type of feed-forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/value attentions (not used by all models).
dense_act_fn (`string`, *optional*, defaults to `"relu"`):
Type of activation function to be used in `Pop2PianoDenseActDense` and `Pop2PianoDenseGatedActDense`.
"""
# Model type identifier
model_type = "pop2piano"
# Keys ignored at inference time
keys_to_ignore_at_inference = ["past_key_values"]
# Initializer used to set up a custom Transformer model configuration
def __init__(
self,
vocab_size=2400,  # vocabulary size, defaults to 2400
composer_vocab_size=21,  # composer vocabulary size, defaults to 21
d_model=512,  # hidden size of the Transformer model, defaults to 512
d_kv=64,  # size of the key/value projections per attention head, defaults to 64
d_ff=2048,  # intermediate size of the feed-forward layers, defaults to 2048
num_layers=6,  # number of layers in the Transformer encoder, defaults to 6
num_decoder_layers=None,  # number of decoder layers; falls back to num_layers when None
num_heads=8,  # number of attention heads, defaults to 8
relative_attention_num_buckets=32,  # number of buckets for relative position encoding, defaults to 32
relative_attention_max_distance=128,  # maximum distance for relative position encoding, defaults to 128
dropout_rate=0.1,  # dropout ratio, defaults to 0.1
layer_norm_epsilon=1e-6,  # epsilon used by layer normalization, defaults to 1e-6
initializer_factor=1.0,  # weight-initialization factor, defaults to 1.0
feed_forward_proj="gated-gelu",  # feed-forward projection type, defaults to "gated-gelu"
is_encoder_decoder=True,  # whether this is an encoder-decoder model, defaults to True
use_cache=True,  # whether to use the key/value cache, defaults to True
pad_token_id=0,  # id of the padding token, defaults to 0
eos_token_id=1,  # id of the end-of-sequence token, defaults to 1
dense_act_fn="relu",  # activation used in the dense layers, defaults to "relu"
**kwargs,  # remaining keyword arguments
):
self.vocab_size = vocab_size  # store the vocabulary size
self.composer_vocab_size = composer_vocab_size  # store the composer vocabulary size
self.d_model = d_model  # store the hidden size
self.d_kv = d_kv  # store the key/value projection size
self.d_ff = d_ff  # store the feed-forward intermediate size
self.num_layers = num_layers  # store the number of encoder layers
self.num_decoder_layers = num_decoder_layers if num_decoder_layers is not None else self.num_layers  # decoder layers default to num_layers
self.num_heads = num_heads  # store the number of attention heads
self.relative_attention_num_buckets = relative_attention_num_buckets  # store the number of relative-attention buckets
self.relative_attention_max_distance = relative_attention_max_distance  # store the maximum relative-attention distance
self.dropout_rate = dropout_rate  # store the dropout ratio
self.layer_norm_epsilon = layer_norm_epsilon  # store the layer-norm epsilon
self.initializer_factor = initializer_factor  # store the initializer factor
self.feed_forward_proj = feed_forward_proj  # store the feed-forward projection type
self.use_cache = use_cache  # store the cache flag
self.dense_act_fn = dense_act_fn  # store the dense activation function
self.is_gated_act = self.feed_forward_proj.split("-")[0] == "gated"  # gated activation if the projection name starts with "gated"
self.hidden_size = self.d_model  # alias: hidden size equals d_model
self.num_attention_heads = num_heads  # alias for the number of attention heads
self.num_hidden_layers = num_layers  # alias for the number of hidden layers
# Call the parent initializer, forwarding pad_token_id, eos_token_id and is_encoder_decoder
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
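A quick look at the derived attributes set in the constructor above:

```python
from transformers import Pop2PianoConfig

config = Pop2PianoConfig()
print(config.is_gated_act)        # True, since feed_forward_proj == "gated-gelu"
print(config.hidden_size, config.num_attention_heads, config.num_hidden_layers)  # 512 8 6
print(config.num_decoder_layers)  # 6, falls back to num_layers
```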
.\models\pop2piano\convert_pop2piano_weights_to_hf.py
""" 用于从官方库加载 Pop2Piano 模型权重并展示 tokenizer 词汇构建方法的文件 """
import json
import torch
from transformers import Pop2PianoConfig, Pop2PianoForConditionalGeneration
official_weights = torch.load("./model-1999-val_0.67311615.ckpt")
state_dict = {}
cfg = Pop2PianoConfig.from_pretrained("sweetcocoa/pop2piano")
model = Pop2PianoForConditionalGeneration(cfg)
state_dict["encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = official_weights["state_dict"][
"transformer.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
]
state_dict["decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = official_weights["state_dict"][
"transformer.decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
]
state_dict["encoder.embed_tokens.weight"] = official_weights["state_dict"]["transformer.encoder.embed_tokens.weight"]
state_dict["decoder.embed_tokens.weight"] = official_weights["state_dict"]["transformer.decoder.embed_tokens.weight"]
state_dict["encoder.final_layer_norm.weight"] = official_weights["state_dict"][
"transformer.encoder.final_layer_norm.weight"
]
state_dict["decoder.final_layer_norm.weight"] = official_weights["state_dict"][
"transformer.decoder.final_layer_norm.weight"
]
state_dict["lm_head.weight"] = official_weights["state_dict"]["transformer.lm_head.weight"]
state_dict["mel_conditioner.embedding.weight"] = official_weights["state_dict"]["mel_conditioner.embedding.weight"]
state_dict["shared.weight"] = official_weights["state_dict"]["transformer.shared.weight"]
for i in range(cfg.num_layers):
state_dict[f"encoder.block.{i}.layer.0.SelfAttention.q.weight"] = official_weights["state_dict"][
f"transformer.encoder.block.{i}.layer.0.SelfAttention.q.weight"
]
state_dict[f"encoder.block.{i}.layer.0.SelfAttention.k.weight"] = official_weights["state_dict"][
f"transformer.encoder.block.{i}.layer.0.SelfAttention.k.weight"
]
state_dict[f"encoder.block.{i}.layer.0.SelfAttention.v.weight"] = official_weights["state_dict"][
f"transformer.encoder.block.{i}.layer.0.SelfAttention.v.weight"
]
state_dict[f"encoder.block.{i}.layer.0.SelfAttention.o.weight"] = official_weights["state_dict"][
f"transformer.encoder.block.{i}.layer.0.SelfAttention.o.weight"
]
state_dict[f"encoder.block.{i}.layer.0.layer_norm.weight"] = official_weights["state_dict"][
f"transformer.encoder.block.{i}.layer.0.layer_norm.weight"
]
state_dict[f"encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight"] = official_weights["state_dict"][
f"transformer.encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight"
]
state_dict[f"encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight"] = official_weights["state_dict"][
f"transformer.encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight"
]
state_dict[f"encoder.block.{i}.layer.1.DenseReluDense.wo.weight"] = official_weights["state_dict"][
f"transformer.encoder.block.{i}.layer.1.DenseReluDense.wo.weight"
]
state_dict[f"encoder.block.{i}.layer.1.layer_norm.weight"] = official_weights["state_dict"][
f"transformer.encoder.block.{i}.layer.1.layer_norm.weight"
]
for i in range(6):
state_dict[f"decoder.block.{i}.layer.0.SelfAttention.q.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.0.SelfAttention.q.weight"
]
state_dict[f"decoder.block.{i}.layer.0.SelfAttention.k.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.0.SelfAttention.k.weight"
]
state_dict[f"decoder.block.{i}.layer.0.SelfAttention.v.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.0.SelfAttention.v.weight"
]
state_dict[f"decoder.block.{i}.layer.0.SelfAttention.o.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.0.SelfAttention.o.weight"
]
state_dict[f"decoder.block.{i}.layer.0.layer_norm.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.0.layer_norm.weight"
]
state_dict[f"decoder.block.{i}.layer.1.EncDecAttention.q.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.1.EncDecAttention.q.weight"
]
state_dict[f"decoder.block.{i}.layer.1.EncDecAttention.k.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.1.EncDecAttention.k.weight"
]
state_dict[f"decoder.block.{i}.layer.1.EncDecAttention.v.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.1.EncDecAttention.v.weight"
]
state_dict[f"decoder.block.{i}.layer.1.EncDecAttention.o.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.1.EncDecAttention.o.weight"
]
state_dict[f"decoder.block.{i}.layer.1.layer_norm.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.1.layer_norm.weight"
]
state_dict[f"decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight"
]
state_dict[f"decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight"
]
state_dict[f"decoder.block.{i}.layer.2.DenseReluDense.wo.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.2.DenseReluDense.wo.weight"
]
state_dict[f"decoder.block.{i}.layer.2.layer_norm.weight"] = official_weights["state_dict"][
f"transformer.decoder.block.{i}.layer.2.layer_norm.weight"
]
model.load_state_dict(state_dict, strict=True)
torch.save(state_dict, "./pytorch_model.bin")
def tokenize(idx, token_type, n_special=4, n_note=128, n_velocity=2):
if token_type == "TOKEN_TIME":
return n_special + n_note + n_velocity + idx
elif token_type == "TOKEN_VELOCITY":
return n_special + n_note + idx
elif token_type == "TOKEN_NOTE":
return n_special + idx
elif token_type == "TOKEN_SPECIAL":
return idx
else:
return -1
def detokenize(idx, n_special=4, n_note=128, n_velocity=2, time_idx_offset=0):
if idx >= n_special + n_note + n_velocity:
return "TOKEN_TIME", (idx - (n_special + n_note + n_velocity)) + time_idx_offset
elif idx >= n_special + n_note:
return "TOKEN_VELOCITY", idx - (n_special + n_note)
elif idx >= n_special:
return "TOKEN_NOTE", idx - n_special
else:
return "TOKEN_SPECIAL", idx
decoder = {}
for i in range(cfg.vocab_size):
decoder.update({i: f"{detokenize(i)[1]}_{detokenize(i)[0]}"})
encoder = {v: k for k, v in decoder.items()}
with open("./vocab.json", "w") as file:
file.write(json.dumps(encoder))
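A small round-trip check of the vocabulary helpers above (it assumes `tokenize` and `detokenize` as defined are in scope):

```python
# A note token with pitch 60: 4 special tokens + 60 -> id 64, and detokenize() inverts it.
note_id = tokenize(60, "TOKEN_NOTE")
print(note_id)              # 64
print(detokenize(note_id))  # ("TOKEN_NOTE", 60)

# A velocity token: its ids live right after the 128 note ids.
print(tokenize(1, "TOKEN_VELOCITY"))  # 4 + 128 + 1 = 133
```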
.\models\pop2piano\feature_extraction_pop2piano.py
r"""
Constructs a Pop2Piano feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.
This class extracts rhythm and preprocesses the audio before it is passed to the model. First the audio is passed
to `RhythmExtractor2013` algorithm which extracts the beat_times, beat positions and estimates their confidence as
well as tempo in bpm, then beat_times is interpolated and to get beatsteps. Later we calculate
extrapolated_beatsteps from it to be used in tokenizer. On the other hand audio is resampled to self.sampling_rate
and preprocessed and then log mel spectogram is computed from that to be used in our transformer model.
"""
import warnings
from typing import List, Optional, Union
import numpy
import numpy as np
from ...audio_utils import mel_filter_bank, spectrogram
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import (
TensorType,
is_essentia_available,
is_librosa_available,
is_scipy_available,
logging,
requires_backends,
)
if is_essentia_available():
import essentia
import essentia.standard
if is_librosa_available():
import librosa
if is_scipy_available():
import scipy
logger = logging.get_logger(__name__)
class Pop2PianoFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a Pop2Piano feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.
This class extracts rhythm and preprocesses the audio before it is passed to the model. First the audio is passed
to the `RhythmExtractor2013` algorithm, which extracts the beat times and beat positions and estimates their
confidence as well as the tempo in bpm; beat_times is then interpolated to get beatsteps. We then calculate
extrapolated_beatstep from it to be used in the tokenizer. In parallel, the audio is resampled to self.sampling_rate
and preprocessed, and a log-mel spectrogram is computed from it to be used in our transformer model.
"""
model_input_names = ["input_features", "beatsteps", "extrapolated_beatstep"]
def __init__(
self,
sampling_rate: int = 22050,
padding_value: int = 0,
window_size: int = 4096,
hop_length: int = 1024,
min_frequency: float = 10.0,
feature_size: int = 512,
num_bars: int = 2,
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
**kwargs,
)
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.window_size = window_size
self.hop_length = hop_length
self.min_frequency = min_frequency
self.feature_size = feature_size
self.num_bars = num_bars
self.mel_filters = mel_filter_bank(
num_frequency_bins=(self.window_size // 2) + 1,
num_mel_filters=self.feature_size,
min_frequency=self.min_frequency,
max_frequency=float(self.sampling_rate // 2),
sampling_rate=self.sampling_rate,
norm=None,
mel_scale="htk",
)
def mel_spectrogram(self, sequence: np.ndarray):
"""
Generates MelSpectrogram.
Args:
sequence (`numpy.ndarray`):
The sequence of which the mel-spectrogram will be computed.
"""
mel_specs = []
for seq in sequence:
window = np.hanning(self.window_size + 1)[:-1]
mel_specs.append(
spectrogram(
waveform=seq,
window=window,
frame_length=self.window_size,
hop_length=self.hop_length,
power=2.0,
mel_filters=self.mel_filters,
)
)
mel_specs = np.array(mel_specs)
return mel_specs
def extract_rhythm(self, audio: np.ndarray):
"""
This algorithm (`RhythmExtractor2013`) extracts the beat positions and estimates their confidence as well as
tempo in bpm for an audio signal. For more information please visit
https://essentia.upf.edu/reference/std_RhythmExtractor2013.html .
Args:
audio(`numpy.ndarray`):
raw audio waveform which is passed to the Rhythm Extractor.
"""
requires_backends(self, ["essentia"])
essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature")
bpm, beat_times, confidence, estimates, essentia_beat_intervals = essentia_tracker(audio)
return bpm, beat_times, confidence, estimates, essentia_beat_intervals
def interpolate_beat_times(
self, beat_times: numpy.ndarray, steps_per_beat: numpy.ndarray, n_extend: numpy.ndarray
):
"""
This method takes beat_times and then interpolates that using `scipy.interpolate.interp1d` and the output is
then used to convert raw audio to log-mel-spectrogram.
Args:
beat_times (`numpy.ndarray`):
beat_times is passed into `scipy.interpolate.interp1d` for processing.
steps_per_beat (`int`):
used as a parameter to control the interpolation.
n_extend (`int`):
used as a parameter to control the interpolation.
"""
requires_backends(self, ["scipy"])
beat_times_function = scipy.interpolate.interp1d(
np.arange(beat_times.size),
beat_times,
bounds_error=False,
fill_value="extrapolate",
)
ext_beats = beat_times_function(
np.linspace(0, beat_times.size + n_extend - 1, beat_times.size * steps_per_beat + n_extend)
)
return ext_beats
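The call above relies on `interp1d` with `fill_value="extrapolate"`; a self-contained sketch of that behaviour with made-up beat times:

```python
import numpy as np
import scipy.interpolate

beat_times = np.array([0.5, 1.0, 1.5, 2.0])  # hypothetical beats, 0.5 s apart
f = scipy.interpolate.interp1d(
    np.arange(beat_times.size), beat_times, bounds_error=False, fill_value="extrapolate"
)
# steps_per_beat=1, n_extend=4 -> query 8 evenly spaced positions from 0 to 7;
# positions past index 3 are linearly extrapolated beyond the last observed beat.
print(f(np.linspace(0, beat_times.size + 4 - 1, beat_times.size * 1 + 4)))
# [0.5 1.  1.5 2.  2.5 3.  3.5 4. ]
```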
def preprocess_mel(self, audio: np.ndarray, beatstep: np.ndarray):
"""
Preprocessing for log-mel-spectrogram
Args:
audio (`numpy.ndarray` of shape `(audio_length, )` ):
Raw audio waveform to be processed.
beatstep (`numpy.ndarray`):
Interpolated values of the raw audio. If beatstep[0] is greater than 0.0, then it will be shifted by
the value at beatstep[0].
"""
if audio is not None and len(audio.shape) != 1:
raise ValueError(
f"Expected `audio` to be a single channel audio input of shape `(n, )` but found shape {audio.shape}."
)
if beatstep[0] > 0.0:
beatstep = beatstep - beatstep[0]
num_steps = self.num_bars * 4
num_target_steps = len(beatstep)
extrapolated_beatstep = self.interpolate_beat_times(
beat_times=beatstep, steps_per_beat=1, n_extend=(self.num_bars + 1) * 4 + 1
)
sample_indices = []
max_feature_length = 0
for i in range(0, num_target_steps, num_steps):
start_idx = i
end_idx = min(i + num_steps, num_target_steps)
start_sample = int(extrapolated_beatstep[start_idx] * self.sampling_rate)
end_sample = int(extrapolated_beatstep[end_idx] * self.sampling_rate)
sample_indices.append((start_sample, end_sample))
max_feature_length = max(max_feature_length, end_sample - start_sample)
padded_batch = []
for start_sample, end_sample in sample_indices:
feature = audio[start_sample:end_sample]
padded_feature = np.pad(
feature,
((0, max_feature_length - feature.shape[0]),),
"constant",
constant_values=0,
)
padded_batch.append(padded_feature)
padded_batch = np.asarray(padded_batch)
return padded_batch, extrapolated_beatstep
def _pad(self, features: np.ndarray, add_zero_line=True):
features_shapes = [each_feature.shape for each_feature in features]
attention_masks, padded_features = [], []
for i, each_feature in enumerate(features):
if len(each_feature.shape) == 3:
features_pad_value = max([*zip(*features_shapes)][1]) - features_shapes[i][1]
attention_mask = np.ones(features_shapes[i][:2], dtype=np.int64)
feature_padding = ((0, 0), (0, features_pad_value), (0, 0))
attention_mask_padding = (feature_padding[0], feature_padding[1])
else:
each_feature = each_feature.reshape(1, -1)
features_pad_value = max([*zip(*features_shapes)][0]) - features_shapes[i][0]
attention_mask = np.ones(features_shapes[i], dtype=np.int64).reshape(1, -1)
feature_padding = attention_mask_padding = ((0, 0), (0, features_pad_value))
each_padded_feature = np.pad(each_feature, feature_padding, "constant", constant_values=self.padding_value)
attention_mask = np.pad(
attention_mask, attention_mask_padding, "constant", constant_values=self.padding_value
)
if add_zero_line:
zero_array_len = max([*zip(*features_shapes)][1])
each_padded_feature = np.concatenate(
[each_padded_feature, np.zeros([1, zero_array_len, self.feature_size])], axis=0
)
attention_mask = np.concatenate(
[attention_mask, np.zeros([1, zero_array_len], dtype=attention_mask.dtype)], axis=0
)
padded_features.append(each_padded_feature)
attention_masks.append(attention_mask)
padded_features = np.concatenate(padded_features, axis=0).astype(np.float32)
attention_masks = np.concatenate(attention_masks, axis=0).astype(np.int64)
return padded_features, attention_masks
def pad(
self,
inputs: BatchFeature,
is_batched: bool,
return_attention_mask: bool,
return_tensors: Optional[Union[str, TensorType]] = None,
):
"""
Pads the inputs to the same length and returns attention_mask.
Args:
inputs (`BatchFeature`):
Processed audio features.
is_batched (`bool`):
Whether inputs are batched or not.
return_attention_mask (`bool`):
Whether to return attention mask or not.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of a list of Python integers. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
If nothing is specified, it will return a list of `np.ndarray` arrays.
Return:
`BatchFeature` with attention_mask, attention_mask_beatsteps, and attention_mask_extrapolated_beatstep added
to it:
- **attention_mask** numpy.ndarray of shape `(batch_size, max_input_features_seq_length)` --
Example:
1, 1, 1, 0, 0 (audio 1, padded to a max length of 5 with 2 zeros indicating padding)
0, 0, 0, 0, 0 (zero padding to separate audio 1 and 2)
1, 1, 1, 1, 1 (audio 2)
0, 0, 0, 0, 0 (zero padding to separate audio 2 and 3)
1, 1, 1, 1, 1 (audio 3)
- **attention_mask_beatsteps** numpy.ndarray of shape `(batch_size, max_beatsteps_seq_length)`
- **attention_mask_extrapolated_beatstep** numpy.ndarray of shape `(batch_size,
max_extrapolated_beatstep_seq_length)`
"""
processed_features_dict = {}
for feature_name, feature_value in inputs.items():
if feature_name == "input_features":
padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=True)
processed_features_dict[feature_name] = padded_feature_values
if return_attention_mask:
processed_features_dict["attention_mask"] = attention_mask
else:
padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=False)
processed_features_dict[feature_name] = padded_feature_values
if return_attention_mask:
processed_features_dict[f"attention_mask_{feature_name}"] = attention_mask
if not is_batched and not return_attention_mask:
processed_features_dict["input_features"] = processed_features_dict["input_features"][:-1, ...]
outputs = BatchFeature(processed_features_dict, tensor_type=return_tensors)
return outputs
def __call__(
self,
audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
sampling_rate: Union[int, List[int]],
steps_per_beat: int = 2,
resample: Optional[bool] = True,
return_attention_mask: Optional[bool] = False,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
.\models\pop2piano\modeling_pop2piano.py
""" PyTorch Pop2Piano model."""
import copy
import math
from typing import Optional, Tuple, Union
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.generation import GenerationConfig
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_torch_fx_proxy,
logging,
replace_return_docstrings,
)
from .configuration_pop2piano import Pop2PianoConfig
logger = logging.get_logger(__name__)
_load_pop2piano_layer_norm = True
try:
from apex.normalization import FusedRMSNorm
_load_pop2piano_layer_norm = False
logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of Pop2PianoLayerNorm")
except ImportError:
pass
except Exception:
logger.warning("Discovered apex but it failed to load, falling back to Pop2PianoLayerNorm")
pass
_CONFIG_FOR_DOC = "Pop2PianoConfig"
_CHECKPOINT_FOR_DOC = "sweetcocoa/pop2piano"
POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST = [
"sweetcocoa/pop2piano",
]
POP2PIANO_INPUTS_DOCSTRING = r"""
"""
class Pop2PianoLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean.
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
if self.weight.dtype in [torch.float16, torch.bfloat16]:
hidden_states = hidden_states.to(self.weight.dtype)
return self.weight * hidden_states
if not _load_pop2piano_layer_norm:
Pop2PianoLayerNorm = FusedRMSNorm
ALL_LAYERNORM_LAYERS.append(Pop2PianoLayerNorm)
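A quick numerical check of the RMS-style normalization above (it assumes apex is not installed, so `Pop2PianoLayerNorm` is the pure-PyTorch class defined here):

```python
import torch

ln = Pop2PianoLayerNorm(hidden_size=4, eps=1e-6)
x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
out = ln(x)
# No mean subtraction and no bias: each vector is divided by its root-mean-square value.
rms = torch.sqrt((x ** 2).mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(out, x / rms)
print(out)
```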
class Pop2PianoDenseActDense(nn.Module):
def __init__(self, config: Pop2PianoConfig):
super().__init__()
self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_states = self.wi(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
hidden_states = hidden_states.to(self.wo.weight.dtype)
hidden_states = self.wo(hidden_states)
return hidden_states
class Pop2PianoDenseGatedActDense(nn.Module):
def __init__(self, config: Pop2PianoConfig):
super().__init__()
self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.act = ACT2FN[config.dense_act_fn]
def forward(self, hidden_states):
hidden_gelu = self.act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states)
if (
isinstance(self.wo.weight, torch.Tensor)
and hidden_states.dtype != self.wo.weight.dtype
and self.wo.weight.dtype != torch.int8
):
hidden_states = hidden_states.to(self.wo.weight.dtype)
hidden_states = self.wo(hidden_states)
return hidden_states
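The two feed-forward variants above differ only in the gating: the gated version runs two parallel projections and multiplies them elementwise before projecting back down. A minimal sketch of that computation with plain tensors (not the module itself), using ReLU as in the default `dense_act_fn`:

```python
import torch
import torch.nn as nn

d_model, d_ff = 8, 32
wi_0 = nn.Linear(d_model, d_ff, bias=False)  # gate branch (goes through the activation)
wi_1 = nn.Linear(d_model, d_ff, bias=False)  # linear branch
wo = nn.Linear(d_ff, d_model, bias=False)

x = torch.randn(2, 5, d_model)
hidden = torch.relu(wi_0(x)) * wi_1(x)       # elementwise gating
print(wo(hidden).shape)                      # torch.Size([2, 5, 8])
```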
class Pop2PianoLayerFF(nn.Module):
def __init__(self, config: Pop2PianoConfig):
super().__init__()
if config.is_gated_act:
self.DenseReluDense = Pop2PianoDenseGatedActDense(config)
else:
self.DenseReluDense = Pop2PianoDenseActDense(config)
self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, hidden_states):
forwarded_states = self.layer_norm(hidden_states)
forwarded_states = self.DenseReluDense(forwarded_states)
hidden_states = hidden_states + self.dropout(forwarded_states)
return hidden_states
class Pop2PianoAttention(nn.Module):
def __init__(self, config: Pop2PianoConfig, has_relative_attention_bias=False):
super().__init__()
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
self.n_heads = config.num_heads
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.key_value_proj_dim
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
self.gradient_checkpointing = False
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
)
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.key_value_proj_dim * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor of relative positions
bidirectional: a boolean - whether the attention is bidirectional
num_buckets: an integer - number of buckets
max_distance: an integer - maximum distance limit
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
relative_position = torch.abs(relative_position)
else:
relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
)
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
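A small demonstration of the bucketing above (it assumes the `Pop2PianoAttention` class defined here is in scope):

```python
import torch

rel_pos = torch.arange(-4, 5).unsqueeze(0)  # memory_position - query_position, from -4 to 4
buckets = Pop2PianoAttention._relative_position_bucket(
    rel_pos, bidirectional=True, num_buckets=32, max_distance=128
)
# Keys before the query fall in the lower half of the buckets, keys after it in the upper half.
print(buckets)  # tensor([[ 4,  3,  2,  1,  0, 17, 18, 19, 20]])
```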
def compute_bias(self, query_length, key_length, device=None):
"""Compute binned relative position bias"""
if device is None:
device = self.relative_attention_bias.weight.device
context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
relative_position = memory_position - context_position
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket)
values = values.permute([2, 0, 1]).unsqueeze(0)
return values
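`compute_bias` returns a tensor of shape `(1, n_heads, query_length, key_length)`; the `permute`/`unsqueeze` moves the head dimension to the front so the bias can be added directly to the attention scores. A shape-only sanity check (illustrative sketch, using a default randomly initialized config rather than a pretrained checkpoint):
from transformers import Pop2PianoConfig
from transformers.models.pop2piano.modeling_pop2piano import Pop2PianoAttention

attn = Pop2PianoAttention(Pop2PianoConfig(), has_relative_attention_bias=True)
bias = attn.compute_bias(query_length=5, key_length=7)
print(bias.shape)  # torch.Size([1, num_heads, 5, 7])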
def forward(
self,
hidden_states,
mask=None,
key_value_states=None,
position_bias=None,
past_key_value=None,
layer_head_mask=None,
query_length=None,
use_cache=False,
output_attentions=False,
):
    ...  # attention forward body omitted in this excerpt
class Pop2PianoLayerSelfAttention(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.SelfAttention = Pop2PianoAttention(config, has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(
normed_hidden_states,
mask=attention_mask,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = hidden_states + self.dropout(attention_output[0])
outputs = (hidden_states,) + attention_output[1:]
return outputs
class Pop2PianoLayerCrossAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.EncDecAttention = Pop2PianoAttention(config, has_relative_attention_bias=False)
self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(
self,
hidden_states,
key_value_states,
attention_mask=None,
position_bias=None,
layer_head_mask=None,
past_key_value=None,
use_cache=False,
query_length=None,
output_attentions=False,
):
normed_hidden_states = self.layer_norm(hidden_states)
attention_output = self.EncDecAttention(
normed_hidden_states,
mask=attention_mask,
key_value_states=key_value_states,
position_bias=position_bias,
layer_head_mask=layer_head_mask,
past_key_value=past_key_value,
use_cache=use_cache,
query_length=query_length,
output_attentions=output_attentions,
)
layer_output = hidden_states + self.dropout(attention_output[0])
outputs = (layer_output,) + attention_output[1:]
return outputs
class Pop2PianoBlock(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super().__init__()
self.is_decoder = config.is_decoder
self.layer = nn.ModuleList()
self.layer.append(Pop2PianoLayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
if self.is_decoder:
self.layer.append(Pop2PianoLayerCrossAttention(config))
self.layer.append(Pop2PianoLayerFF(config))
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
past_key_value=None,
use_cache=False,
output_attentions=False,
return_dict=True,
):
    ...  # block forward body omitted in this excerpt
class Pop2PianoPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Pop2PianoConfig
base_model_prefix = "transformer"
is_parallelizable = False
supports_gradient_checkpointing = True
_no_split_modules = ["Pop2PianoBlock"]
_keep_in_fp32_modules = ["wo"]
def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
if decoder_start_token_id is None:
raise ValueError(
"self.model.config.decoder_start_token_id has to be defined. In Pop2Piano it is usually set to the pad_token_id."
)
if is_torch_fx_proxy(input_ids):
shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
else:
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
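`_shift_right` builds the teacher-forcing decoder inputs: labels are shifted one position to the right, `decoder_start_token_id` fills the first slot, and any `-100` placeholders that survive the shift are replaced with the pad token. A tiny illustration with made-up token ids (a random-weight model built from a default config, with the relevant token ids set explicitly for the example):
import torch
from transformers import Pop2PianoConfig, Pop2PianoForConditionalGeneration

config = Pop2PianoConfig(decoder_start_token_id=0, pad_token_id=0)
model = Pop2PianoForConditionalGeneration(config)

labels = torch.tensor([[10, 11, -100, -100]])
print(model._shift_right(labels))
# tensor([[ 0, 10, 11,  0]]) -- start token prepended, labels shifted right, and the
# -100 that survived the shift replaced with pad_token_id (0 here).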
class Pop2PianoStack(Pop2PianoPreTrainedModel):
def __init__(self, config, embed_tokens=None):
super().__init__(config)
self.embed_tokens = embed_tokens
self.is_decoder = config.is_decoder
self.block = nn.ModuleList(
[Pop2PianoBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
)
self.final_layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
self.post_init()
self.model_parallel = False
self.device_map = None
self.gradient_checkpointing = False
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, new_embeddings):
self.embed_tokens = new_embeddings
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
inputs_embeds=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
    ...  # stack forward body omitted in this excerpt
class Pop2PianoConcatEmbeddingToMel(nn.Module):
"""Embedding Matrix for `composer` tokens."""
def __init__(self, config):
super().__init__()
self.embedding = nn.Embedding(num_embeddings=config.composer_vocab_size, embedding_dim=config.d_model)
def forward(self, feature, index_value, embedding_offset):
index_shifted = index_value - embedding_offset
composer_embedding = self.embedding(index_shifted).unsqueeze(1)
inputs_embeds = torch.cat([composer_embedding, feature], dim=1)
return inputs_embeds
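The conditioner simply looks up an embedding for the (offset-shifted) composer index and prepends it as one extra time step in front of the mel features, so the sequence length grows by one. A shape-only sketch with random inputs (illustrative, default config):
import torch
from transformers import Pop2PianoConfig
from transformers.models.pop2piano.modeling_pop2piano import Pop2PianoConcatEmbeddingToMel

config = Pop2PianoConfig()
conditioner = Pop2PianoConcatEmbeddingToMel(config)

feature = torch.randn(2, 100, config.d_model)   # (batch, n_frames, d_model)
index_value = torch.tensor([5, 7])              # raw composer token ids
out = conditioner(feature=feature, index_value=index_value, embedding_offset=5)
print(out.shape)                                # torch.Size([2, 101, d_model])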
Pop2Piano_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`Pop2PianoConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings("""Pop2Piano Model with a `language modeling` head on top.""", Pop2Piano_START_DOCSTRING)
class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: Pop2PianoConfig):
super().__init__(config)
self.config = config
self.model_dim = config.d_model
self.shared = nn.Embedding(config.vocab_size, config.d_model)
self.mel_conditioner = Pop2PianoConcatEmbeddingToMel(config)
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = Pop2PianoStack(encoder_config, self.shared)
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = Pop2PianoStack(decoder_config, self.shared)
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
self.decoder.set_input_embeddings(new_embeddings)
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def get_output_embeddings(self):
return self.lm_head
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def get_mel_conditioner_outputs(
self,
input_features: torch.FloatTensor,
composer: str,
generation_config: GenerationConfig,
attention_mask: torch.FloatTensor = None,
):
"""
This method is used to concatenate mel conditioner tokens at the front of the input_features in order to
control the type of MIDI token generated by the model.
Args:
input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
input features extracted from the feature extractor.
composer (`str`):
composer token which determines the type of MIDI tokens to be generated.
generation_config (`~generation.GenerationConfig`):
The generation config is used to look up the composer-to-feature_token mapping.
attention_mask (`torch.FloatTensor`, *optional*):
For batched generation, input_features are padded to have the same shape across all examples.
`attention_mask` helps determine which areas were padded and which were not:
- 1 for tokens that are **not padded**,
- 0 for tokens that are **padded**.
"""
composer_to_feature_token = generation_config.composer_to_feature_token
if composer not in composer_to_feature_token.keys():
raise ValueError(
f"Please choose a composer from {list(composer_to_feature_token.keys())}. Composer received - {composer}"
)
composer_value = composer_to_feature_token[composer]
composer_value = torch.tensor(composer_value, device=self.device)
composer_value = composer_value.repeat(input_features.shape[0])
embedding_offset = min(composer_to_feature_token.values())
input_features = self.mel_conditioner(
feature=input_features,
index_value=composer_value,
embedding_offset=embedding_offset,
)
if attention_mask is not None:
input_features[~attention_mask[:, 0].bool()] = 0.0
attention_mask = torch.cat([attention_mask[:, 0].view(-1, 1), attention_mask], dim=1)
return input_features, attention_mask
return input_features, None
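In practice the composer-to-token mapping ships with the checkpoint's generation config; the sketch below uses a dummy two-entry mapping just to show how the conditioning step prepends one frame to the features and extends the attention mask by one column (random-weight model, illustrative only):
import torch
from transformers import GenerationConfig, Pop2PianoConfig, Pop2PianoForConditionalGeneration

config = Pop2PianoConfig()
model = Pop2PianoForConditionalGeneration(config)

gen_config = GenerationConfig()
gen_config.composer_to_feature_token = {"composer1": 0, "composer2": 1}  # dummy mapping

features = torch.randn(2, 100, config.d_model)
mask = torch.ones(2, 100)
cond_features, cond_mask = model.get_mel_conditioner_outputs(
    input_features=features,
    composer="composer1",
    generation_config=gen_config,
    attention_mask=mask,
)
print(cond_features.shape, cond_mask.shape)  # (2, 101, d_model) and (2, 101)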
@torch.no_grad()
def generate(
self,
input_features,
attention_mask=None,
composer="composer1",
generation_config=None,
**kwargs,
):
    ...  # generation body omitted in this excerpt
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
    ...  # body omitted in this excerpt
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self._shift_right(labels)
def _reorder_cache(self, past_key_values, beam_idx):
if past_key_values is None:
logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
return past_key_values
reordered_decoder_past = ()
for layer_past_states in past_key_values:
reordered_layer_past_states = ()
for layer_past_state in layer_past_states:
reordered_layer_past_states = reordered_layer_past_states + (
layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
)
if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
raise ValueError(
f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
)
if len(reordered_layer_past_states) != len(layer_past_states):
raise ValueError(
f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
)
reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
return reordered_decoder_past
.\models\pop2piano\processing_pop2piano.py
""" Processor class for Pop2Piano."""
import os
from typing import List, Optional, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...processing_utils import ProcessorMixin
from ...tokenization_utils import BatchEncoding, PaddingStrategy, TruncationStrategy
from ...utils import TensorType
class Pop2PianoProcessor(ProcessorMixin):
r"""
Constructs a Pop2Piano processor which wraps a Pop2Piano Feature Extractor and Pop2Piano Tokenizer into a single
processor.
[`Pop2PianoProcessor`] offers all the functionalities of [`Pop2PianoFeatureExtractor`] and [`Pop2PianoTokenizer`].
See the docstring of [`~Pop2PianoProcessor.__call__`] and [`~Pop2PianoProcessor.decode`] for more information.
Args:
feature_extractor (`Pop2PianoFeatureExtractor`):
An instance of [`Pop2PianoFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`Pop2PianoTokenizer`):
An instance of [`Pop2PianoTokenizer`]. The tokenizer is a required input.
"""
attributes = ["feature_extractor", "tokenizer"]
feature_extractor_class = "Pop2PianoFeatureExtractor"
tokenizer_class = "Pop2PianoTokenizer"
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def __call__(
self,
audio: Union[np.ndarray, List[float], List[np.ndarray]] = None,
sampling_rate: Union[int, List[int]] = None,
steps_per_beat: int = 2,
resample: Optional[bool] = True,
notes: Union[List, TensorType] = None,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
verbose: bool = True,
**kwargs,
) -> Union[BatchFeature, BatchEncoding]:
"""
Call method to process input audio data into features suitable for Pop2Piano model.
Args:
audio (Union[np.ndarray, List[float], List[np.ndarray]], optional):
Input audio data. Can be a numpy array, list of floats, or list of numpy arrays.
sampling_rate (Union[int, List[int]], optional):
Sampling rate of the input audio. Can be an integer or a list of integers.
steps_per_beat (int, optional):
Number of steps per beat in the musical sequence.
resample (bool, optional):
Whether to resample the input audio to the specified sampling rate.
notes (Union[List, TensorType], optional):
Musical notes associated with the audio data. Can be a list or TensorType.
padding (Union[bool, str, PaddingStrategy], optional):
Padding strategy to apply to the input data.
truncation (Union[bool, str, TruncationStrategy], optional):
Truncation strategy to apply to the input data.
max_length (int, optional):
Maximum length of the output sequence.
pad_to_multiple_of (int, optional):
Pad the sequence length to be a multiple of this value.
verbose (bool, optional):
Whether to print verbose information during processing.
**kwargs:
Additional keyword arguments for processing.
Returns:
BatchEncoding:
Processed batch of encoded inputs suitable for Pop2Piano model.
"""
pass
) -> Union[BatchFeature, BatchEncoding]:
"""
使用 [`Pop2PianoFeatureExtractor.__call__`] 方法准备模型的对数梅尔频谱图(log-mel-spectrograms),
并使用 [`Pop2PianoTokenizer.__call__`] 方法从音符中准备 token_ids。
请查阅上述两个方法的文档字符串以获取更多信息。
"""
if (audio is None and sampling_rate is None) and (notes is None):
raise ValueError(
"You have to specify at least audios and sampling_rate in order to use feature extractor or "
"notes to use the tokenizer part."
)
if audio is not None and sampling_rate is not None:
inputs = self.feature_extractor(
audio=audio,
sampling_rate=sampling_rate,
steps_per_beat=steps_per_beat,
resample=resample,
**kwargs,
)
if notes is not None:
encoded_token_ids = self.tokenizer(
notes=notes,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
if notes is None:
return inputs
elif audio is None or sampling_rate is None:
return encoded_token_ids
else:
inputs["token_ids"] = encoded_token_ids["token_ids"]
return inputs
def batch_decode(
self,
token_ids,
feature_extractor_output: BatchFeature,
return_midi: bool = True,
) -> BatchEncoding:
"""
使用 [`Pop2PianoTokenizer.batch_decode`] 方法将模型生成的 token_ids 转换为 midi_notes。
请查阅上述方法的文档字符串以获取更多信息。
"""
return self.tokenizer.batch_decode(
token_ids=token_ids, feature_extractor_output=feature_extractor_output, return_midi=return_midi
)
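Putting the processor and the model together, a typical end-to-end run looks roughly like the sketch below (it assumes the `sweetcocoa/pop2piano` checkpoint and its extra audio dependencies such as librosa and pretty_midi; the file names are placeholders):
import librosa
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor

model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano")
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")

audio, sr = librosa.load("song.wav", sr=44100)  # placeholder path
inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt")

# `composer` picks one of the conditioning tokens prepended by get_mel_conditioner_outputs.
generated = model.generate(input_features=inputs["input_features"], composer="composer1")

outputs = processor.batch_decode(token_ids=generated, feature_extractor_output=inputs)
outputs["pretty_midi_objects"][0].write("output.mid")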
@property
def model_input_names(self):
"""
返回模型输入的名称列表,包括分词器和特征提取器的输入名称。
使用 `self.tokenizer.model_input_names` 和 `self.feature_extractor.model_input_names` 获取输入名称列表,
并将两者合并后去除重复项后返回。
"""
tokenizer_input_names = self.tokenizer.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names))
def save_pretrained(self, save_directory, **kwargs):
"""
将模型的预训练文件保存到指定目录中。
如果 `save_directory` 是文件而不是目录,将引发 ValueError。
如果目录不存在,则创建目录。
最后,调用父类的 `save_pretrained` 方法保存预训练文件。
Args:
save_directory (str): 要保存预训练文件的目录路径。
**kwargs: 其他参数传递给 `save_pretrained` 方法。
Returns:
Any: `save_pretrained` 方法的返回值。
Raises:
ValueError: 如果 `save_directory` 是文件路径而不是目录路径。
"""
if os.path.isfile(save_directory):
raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
os.makedirs(save_directory, exist_ok=True)
return super().save_pretrained(save_directory, **kwargs)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(*args)