Transformers Source Code Analysis (90)
.\models\pop2piano\tokenization_pop2piano.py
"""
Tokenization class for Pop2Piano.
"""
import json
import os
from typing import List, Optional, Tuple, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...tokenization_utils import AddedToken, BatchEncoding, PaddingStrategy, PreTrainedTokenizer, TruncationStrategy
from ...utils import TensorType, is_pretty_midi_available, logging, requires_backends, to_numpy
if is_pretty_midi_available():
import pretty_midi
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab": "vocab.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab": {
"sweetcocoa/pop2piano": "https://huggingface.co/sweetcocoa/pop2piano/blob/main/vocab.json",
},
}
def token_time_to_note(number, cutoff_time_idx, current_idx):
"""
将时间令牌转换为音符索引。
Args:
number (int): 时间令牌的数量。
cutoff_time_idx (int or None): 时间截止索引(可选)。
current_idx (int): 当前索引位置。
Returns:
int: 更新后的当前索引位置。
"""
current_idx += number
if cutoff_time_idx is not None:
current_idx = min(current_idx, cutoff_time_idx)
return current_idx
def token_note_to_note(number, current_velocity, default_velocity, note_onsets_ready, current_idx, notes):
"""
将音符令牌转换为音符。
Args:
number (int): 音符令牌的数量。
current_velocity (int): 当前速度。
default_velocity (int): 默认速度。
note_onsets_ready (list or None): 准备好的音符发生时刻的列表或 None。
current_idx (int): 当前索引位置。
notes (list): 音符列表。
Returns:
list: 更新后的音符列表。
"""
if note_onsets_ready[number] is not None:
onset_idx = note_onsets_ready[number]
if onset_idx < current_idx:
offset_idx = current_idx
notes.append([onset_idx, offset_idx, number, default_velocity])
onsets_ready = None if current_velocity == 0 else current_idx
note_onsets_ready[number] = onsets_ready
else:
note_onsets_ready[number] = current_idx
return notes
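# Illustrative aside (not part of the original file): a minimal sketch of how the two helpers above
# cooperate. A TOKEN_TIME token advances the running time index; a TOKEN_NOTE token either records a
# pending onset for that pitch or, if the pitch already has a pending onset, closes it into a
# [onset_idx, offset_idx, pitch, velocity] entry.
def _demo_time_and_note_helpers():
    notes, note_onsets_ready = [], [None] * 128
    current_idx = token_time_to_note(number=4, cutoff_time_idx=None, current_idx=0)  # -> 4
    notes = token_note_to_note(60, current_velocity=77, default_velocity=77,
                               note_onsets_ready=note_onsets_ready, current_idx=current_idx, notes=notes)
    current_idx = token_time_to_note(number=2, cutoff_time_idx=None, current_idx=current_idx)  # -> 6
    notes = token_note_to_note(60, current_velocity=0, default_velocity=77,
                               note_onsets_ready=note_onsets_ready, current_idx=current_idx, notes=notes)
    assert notes == [[4, 6, 60, 77]]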
class Pop2PianoTokenizer(PreTrainedTokenizer):
"""
构造 Pop2Piano 分词器。此分词器不需要训练。
Args:
vocab (`str`): 包含词汇表的文件路径。
default_velocity (`int`, *optional*, 默认为 77):
创建 MIDI 音符时使用的默认速度。
num_bars (`int`, *optional*, 默认为 2):
每个令牌的截止时间索引。
"""
model_input_names = ["token_ids", "attention_mask"]
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
def __init__(
self,
vocab,
default_velocity=77,
num_bars=2,
unk_token="-1",
eos_token="1",
pad_token="0",
bos_token="2",
**kwargs,
):
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
self.default_velocity = default_velocity
self.num_bars = num_bars
with open(vocab, "rb") as file:
self.encoder = json.load(file)
self.decoder = {v: k for k, v in self.encoder.items()}
super().__init__(
unk_token=unk_token,
eos_token=eos_token,
pad_token=pad_token,
bos_token=bos_token,
**kwargs,
)
@property
def vocab_size(self):
"""Returns the vocabulary size of the tokenizer."""
return len(self.encoder)
def get_vocab(self):
"""Returns the vocabulary of the tokenizer."""
return dict(self.encoder, **self.added_tokens_encoder)
def _convert_id_to_token(self, token_id: int) -> list:
"""
Decodes the token ids generated by the transformer into notes.
Args:
token_id (`int`):
This denotes the ids generated by the transformers to be converted to Midi tokens.
Returns:
`List`: A list consists of token_type (`str`) and value (`int`).
"""
token_type_value = self.decoder.get(token_id, f"{self.unk_token}_TOKEN_TIME")
token_type_value = token_type_value.split("_")
token_type, value = "_".join(token_type_value[1:]), int(token_type_value[0])
return [token_type, value]
def _convert_token_to_id(self, token, token_type="TOKEN_TIME") -> int:
"""
Encodes the Midi tokens to transformer generated token ids.
Args:
token (`int`):
This denotes the token value.
token_type (`str`):
This denotes the type of the token. There are four types of midi tokens such as "TOKEN_TIME",
"TOKEN_VELOCITY", "TOKEN_NOTE" and "TOKEN_SPECIAL".
Returns:
`int`: returns the id of the token.
"""
return self.encoder.get(f"{token}_{token_type}", int(self.unk_token))
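# Illustrative aside (toy vocabulary, not the real vocab.json): how the "value_TOKEN_TYPE" keys
# round-trip through `_convert_token_to_id` / `_convert_id_to_token`-style lookups.
def _demo_token_round_trip():
    toy_vocab = {"1_TOKEN_SPECIAL": 1, "10_TOKEN_TIME": 3, "77_TOKEN_VELOCITY": 4, "60_TOKEN_NOTE": 5}
    toy_decoder = {v: k for k, v in toy_vocab.items()}
    token_id = toy_vocab.get("60_TOKEN_NOTE")  # encode: note 60 -> id 5
    token_type_value = toy_decoder[token_id].split("_")  # decode: ["60", "TOKEN", "NOTE"]
    token_type, value = "_".join(token_type_value[1:]), int(token_type_value[0])
    assert (token_type, value) == ("TOKEN_NOTE", 60)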
def relative_batch_tokens_ids_to_notes(
self,
tokens: np.ndarray,
beat_offset_idx: int,
bars_per_batch: int,
cutoff_time_idx: int,
):
"""
Converts relative tokens to notes which are then used to generate pretty midi object.
Args:
tokens (`numpy.ndarray`):
Tokens to be converted to notes.
beat_offset_idx (`int`):
Denotes beat offset index for each note in generated Midi.
bars_per_batch (`int`):
A parameter to control the Midi output generation.
cutoff_time_idx (`int`):
Denotes the cutoff time index for each note in generated Midi.
"""
notes = None
for index in range(len(tokens)):
_tokens = tokens[index]
_start_idx = beat_offset_idx + index * bars_per_batch * 4
_cutoff_time_idx = cutoff_time_idx + _start_idx
_notes = self.relative_tokens_ids_to_notes(
_tokens,
start_idx=_start_idx,
cutoff_time_idx=_cutoff_time_idx,
)
if len(_notes) == 0:
pass
elif notes is None:
notes = _notes
else:
notes = np.concatenate((notes, _notes), axis=0)
if notes is None:
return []
return notes
def relative_batch_tokens_ids_to_midi(
self,
tokens: np.ndarray,
beatstep: np.ndarray,
beat_offset_idx: int = 0,
bars_per_batch: int = 2,
cutoff_time_idx: int = 12,
):
"""
Converts tokens to Midi. This method calls `relative_batch_tokens_ids_to_notes` method to convert batch tokens
to notes then uses `notes_to_midi` method to convert them to Midi.
Args:
tokens (`numpy.ndarray`):
Denotes tokens which alongside beatstep will be converted to Midi.
beatstep (`np.ndarray`):
We get beatstep from feature extractor which is also used to get Midi.
beat_offset_idx (`int`, *optional*, defaults to 0):
Denotes beat offset index for each note in generated Midi.
bars_per_batch (`int`, *optional*, defaults to 2):
A parameter to control the Midi output generation.
cutoff_time_idx (`int`, *optional*, defaults to 12):
Denotes the cutoff time index for each note in generated Midi.
"""
beat_offset_idx = 0 if beat_offset_idx is None else beat_offset_idx
notes = self.relative_batch_tokens_ids_to_notes(
tokens=tokens,
beat_offset_idx=beat_offset_idx,
bars_per_batch=bars_per_batch,
cutoff_time_idx=cutoff_time_idx,
)
midi = self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])
return midi
def relative_tokens_ids_to_notes(self, tokens: np.ndarray, start_idx: float, cutoff_time_idx: float = None):
"""
Converts relative tokens to notes which will then be used to create Pretty Midi objects.
Args:
tokens (`numpy.ndarray`):
Relative Tokens which will be converted to notes.
start_idx (`float`):
A parameter which denotes the starting index.
cutoff_time_idx (`float`, *optional*):
A parameter used while converting tokens to notes.
"""
words = [self._convert_id_to_token(token) for token in tokens]
current_idx = start_idx
current_velocity = 0
note_onsets_ready = [None for i in range(sum([k.endswith("NOTE") for k in self.encoder.keys()]) + 1)]
notes = []
for token_type, number in words:
if token_type == "TOKEN_SPECIAL":
if number == 1:
break
elif token_type == "TOKEN_TIME":
current_idx = token_time_to_note(
number=number, cutoff_time_idx=cutoff_time_idx, current_idx=current_idx
)
elif token_type == "TOKEN_VELOCITY":
current_velocity = number
elif token_type == "TOKEN_NOTE":
notes = token_note_to_note(
number=number,
current_velocity=current_velocity,
default_velocity=self.default_velocity,
note_onsets_ready=note_onsets_ready,
current_idx=current_idx,
notes=notes,
)
else:
raise ValueError("Token type not understood!")
for pitch, note_onset in enumerate(note_onsets_ready):
if note_onset is not None:
if cutoff_time_idx is None:
cutoff = note_onset + 1
else:
cutoff = max(cutoff_time_idx, note_onset + 1)
offset_idx = max(current_idx, cutoff)
notes.append([note_onset, offset_idx, pitch, self.default_velocity])
if len(notes) == 0:
return []
else:
notes = np.array(notes)
note_order = notes[:, 0] * 128 + notes[:, 1]
notes = notes[note_order.argsort()]
return notes
def notes_to_midi(self, notes: np.ndarray, beatstep: np.ndarray, offset_sec: int = 0.0):
"""
Converts notes to Midi.
Args:
notes (`numpy.ndarray`):
This is used to create Pretty Midi objects.
beatstep (`numpy.ndarray`):
This is the extrapolated beatstep that we get from feature extractor.
offset_sec (`int`, *optional*, defaults to 0.0):
This represents the offset seconds which is used while creating each Pretty Midi Note.
"""
requires_backends(self, ["pretty_midi"])
new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
new_inst = pretty_midi.Instrument(program=0)
new_notes = []
for onset_idx, offset_idx, pitch, velocity in notes:
new_note = pretty_midi.Note(
velocity=velocity,
pitch=pitch,
start=beatstep[onset_idx] - offset_sec,
end=beatstep[offset_idx] - offset_sec,
)
new_notes.append(new_note)
new_inst.notes = new_notes
new_pm.instruments.append(new_inst)
new_pm.remove_invalid_notes()
return new_pm
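# Hedged usage sketch (assumes pretty_midi is installed and `tokenizer` is a Pop2PianoTokenizer built
# from a real vocab.json): converting a tiny notes array into a PrettyMIDI object with a hand-made
# beatstep grid of 0.5 s per beat.
def _demo_notes_to_midi(tokenizer):
    import numpy as np
    notes = np.array([[0, 2, 60, 77], [1, 3, 64, 77]])  # [onset_idx, offset_idx, pitch, velocity]
    beatstep = np.arange(0, 4) * 0.5  # beat index -> seconds
    pm = tokenizer.notes_to_midi(notes, beatstep, offset_sec=0.0)
    pm.write("demo.mid")
    return pm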
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Saves the tokenizer's vocabulary dictionary to the provided save_directory.
Args:
save_directory (`str`):
A path to the directory where to saved. It will be created if it doesn't exist.
filename_prefix (`Optional[str]`, *optional*):
A prefix to add to the names of the files saved by the tokenizer.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
)
with open(out_vocab_file, "w") as file:
file.write(json.dumps(self.encoder))
return (out_vocab_file,)
def encode_plus(
self,
notes: Union[np.ndarray, List[pretty_midi.Note]],
truncation_strategy: Optional[TruncationStrategy] = None,
max_length: Optional[int] = None,
**kwargs,
):
"""
Placeholder function for encoding notes into a format suitable for model input.
This is meant to be overridden by subclasses.
"""
pass
def batch_encode_plus(
self,
notes: Union[np.ndarray, List[pretty_midi.Note]],
truncation_strategy: Optional[TruncationStrategy] = None,
max_length: Optional[int] = None,
**kwargs,
) -> BatchEncoding:
r"""
This is the `batch_encode_plus` method for `Pop2PianoTokenizer`. It converts the midi notes to the transformer
generated token ids. It works on multiple batches by calling `encode_plus` multiple times in a loop.
Args:
notes (`numpy.ndarray` of shape `[batch_size, sequence_length, 4]` or `list` of `pretty_midi.Note` objects):
This represents the midi notes. If `notes` is a `numpy.ndarray`:
- Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`.
If `notes` is a `list` containing `pretty_midi.Note` objects:
- Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`.
truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`], *optional*):
Indicates the truncation strategy that is going to be used during truncation.
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
Returns:
`BatchEncoding` containing the tokens ids.
"""
encoded_batch_token_ids = []
for i in range(len(notes)):
encoded_batch_token_ids.append(
self.encode_plus(
notes[i],
truncation_strategy=truncation_strategy,
max_length=max_length,
**kwargs,
)["token_ids"]
)
return BatchEncoding({"token_ids": encoded_batch_token_ids})
def __call__(
self,
notes: Union[
np.ndarray,
List[pretty_midi.Note],
List[List[pretty_midi.Note]],
],
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
**kwargs,
):
"""
This method allows the tokenizer object to be called as a function, enabling batch encoding of midi notes.
Args:
notes (Union[np.ndarray, List[pretty_midi.Note], List[List[pretty_midi.Note]]]):
Midi notes to be tokenized. Can be a numpy array or a nested list of pretty_midi.Note objects.
padding (Union[bool, str, PaddingStrategy], optional): Whether to pad sequences to the same length. Defaults to False.
truncation (Union[bool, str, TruncationStrategy], optional): Truncation strategy for sequences longer than `max_length`. Defaults to None.
max_length (int, optional): Maximum length of the returned sequences after padding/truncation. Defaults to None.
pad_to_multiple_of (int, optional): Pad the sequence length to a multiple of this value. Defaults to None.
return_attention_mask (bool, optional): Whether to return attention masks. Defaults to None.
return_tensors (Union[str, TensorType], optional): Return tensors format. Defaults to None.
verbose (bool, optional): Whether to print information about encoding. Defaults to True.
**kwargs: Additional keyword arguments passed to `encode_plus`.
Returns:
BatchEncoding: Contains token ids and optionally attention masks and tensor format.
"""
pass
def batch_decode(
self,
token_ids,
feature_extractor_output: BatchFeature,
return_midi: bool = True,
**kwargs,
):
"""
This method decodes a batch of token ids back into MIDI representation.
Args:
token_ids (list): List of token ids to be decoded.
feature_extractor_output (BatchFeature): Output from feature extractor.
return_midi (bool, optional): Whether to return MIDI objects. Defaults to True.
**kwargs: Additional keyword arguments.
Returns:
Dependent on `return_midi`, returns MIDI objects or other format as specified.
"""
pass
.\models\pop2piano\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_essentia_available,
is_librosa_available,
is_pretty_midi_available,
is_scipy_available,
is_torch_available,
)
_import_structure = {
"configuration_pop2piano": ["POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP", "Pop2PianoConfig"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_pop2piano"] = [
"POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST",
"Pop2PianoForConditionalGeneration",
"Pop2PianoPreTrainedModel",
]
try:
if not (is_librosa_available() and is_essentia_available() and is_scipy_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_pop2piano"] = ["Pop2PianoFeatureExtractor"]
try:
if not (is_pretty_midi_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_pop2piano"] = ["Pop2PianoTokenizer"]
try:
if not (
is_pretty_midi_available()
and is_torch_available()
and is_librosa_available()
and is_essentia_available()
and is_scipy_available()
):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["processing_pop2piano"] = ["Pop2PianoProcessor"]
if TYPE_CHECKING:
from .configuration_pop2piano import POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP, Pop2PianoConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_pop2piano import (
POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST,
Pop2PianoForConditionalGeneration,
Pop2PianoPreTrainedModel,
)
try:
if not (is_librosa_available() and is_essentia_available() and is_scipy_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_pop2piano import Pop2PianoFeatureExtractor
try:
if not (is_pretty_midi_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_pop2piano import Pop2PianoTokenizer
try:
if not (
is_pretty_midi_available()
and is_torch_available()
and is_librosa_available()
and is_essentia_available()
and is_scipy_available()
):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .processing_pop2piano import Pop2PianoProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\prophetnet\configuration_prophetnet.py
""" ProphetNet model configuration"""
from typing import Callable, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/prophetnet-large-uncased": (
"https://huggingface.co/microsoft/prophetnet-large-uncased/resolve/main/config.json"
),
}
class ProphetNetConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ProphetNetModel`]. It is used to instantiate a
ProphetNet model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the ProphetNet
[microsoft/prophetnet-large-uncased](https://huggingface.co/microsoft/prophetnet-large-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "prophetnet"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_attention_heads": "num_encoder_attention_heads",
}
def __init__(
self,
activation_dropout: Optional[float] = 0.1,
activation_function: Optional[Union[str, Callable]] = "gelu",
vocab_size: Optional[int] = 30522,
hidden_size: Optional[int] = 1024,
encoder_ffn_dim: Optional[int] = 4096,
num_encoder_layers: Optional[int] = 12,
num_encoder_attention_heads: Optional[int] = 16,
decoder_ffn_dim: Optional[int] = 4096,
num_decoder_layers: Optional[int] = 12,
num_decoder_attention_heads: Optional[int] = 16,
attention_dropout: Optional[float] = 0.1,
dropout: Optional[float] = 0.1,
max_position_embeddings: Optional[int] = 512,
init_std: Optional[float] = 0.02,
is_encoder_decoder: Optional[bool] = True,
add_cross_attention: Optional[bool] = True,
decoder_start_token_id: Optional[int] = 0,
ngram: Optional[int] = 2,
num_buckets: Optional[int] = 32,
relative_max_distance: Optional[int] = 128,
disable_ngram_loss: Optional[bool] = False,
eps: Optional[float] = 0.0,
use_cache: Optional[bool] = True,
pad_token_id: Optional[int] = 0,
bos_token_id: Optional[int] = 1,
eos_token_id: Optional[int] = 2,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.encoder_ffn_dim = encoder_ffn_dim
self.num_encoder_layers = num_encoder_layers
self.num_encoder_attention_heads = num_encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.num_decoder_layers = num_decoder_layers
self.num_decoder_attention_heads = num_decoder_attention_heads
self.max_position_embeddings = max_position_embeddings
self.init_std = init_std
self.activation_function = activation_function
self.ngram = ngram
self.num_buckets = num_buckets
self.relative_max_distance = relative_max_distance
self.disable_ngram_loss = disable_ngram_loss
self.eps = eps
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.dropout = dropout
self.use_cache = use_cache
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
add_cross_attention=add_cross_attention,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
@property
def num_hidden_layers(self) -> int:
return self.num_encoder_layers + self.num_decoder_layers
@num_hidden_layers.setter
def num_hidden_layers(self, value):
raise NotImplementedError(
"This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and"
" `num_decoder_layers`."
)
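# Quick usage sketch (standard Transformers pattern, not part of this file): build a config whose
# defaults mirror microsoft/prophetnet-large-uncased and instantiate a randomly initialized model from it.
from transformers import ProphetNetConfig, ProphetNetModel

configuration = ProphetNetConfig()
model = ProphetNetModel(configuration)  # weights are randomly initialized, not pretrained
configuration = model.config  # the config can be read back from the model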
.\models\prophetnet\convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
"""Convert ProphetNet checkpoint."""
import argparse
from torch import nn
from transformers_old.modeling_prophetnet import (
ProphetNetForConditionalGeneration as ProphetNetForConditionalGenerationOld,
)
from transformers_old.modeling_xlm_prophetnet import (
XLMProphetNetForConditionalGeneration as XLMProphetNetForConditionalGenerationOld,
)
from transformers import ProphetNetForConditionalGeneration, XLMProphetNetForConditionalGeneration, logging
logger = logging.get_logger(__name__)
logging.set_verbosity_info()
def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, pytorch_dump_folder_path: str):
"""
Copy/paste/tweak prohpetnet's weights to our prophetnet structure.
将 ProphetNet 的权重复制/粘贴/调整到我们的 ProphetNet 结构中。
"""
if "xprophetnet" in prophetnet_checkpoint_path:
prophet_old = XLMProphetNetForConditionalGenerationOld.from_pretrained(prophetnet_checkpoint_path)
prophet, loading_info = XLMProphetNetForConditionalGeneration.from_pretrained(
prophetnet_checkpoint_path, output_loading_info=True
)
else:
prophet_old = ProphetNetForConditionalGenerationOld.from_pretrained(prophetnet_checkpoint_path)
prophet, loading_info = ProphetNetForConditionalGeneration.from_pretrained(
prophetnet_checkpoint_path, output_loading_info=True
)
special_keys = ["key_proj", "value_proj", "query_proj"]
mapping = {
"self_attn": "ngram_self_attn",
"cross_attn": "encoder_attn",
"cross_attn_layer_norm": "encoder_attn_layer_norm",
"feed_forward_layer_norm": "final_layer_norm",
"feed_forward": "",
"intermediate": "fc1",
"output": "fc2",
"key_proj": "k_proj",
"query_proj": "q_proj",
"value_proj": "v_proj",
"word_embeddings": "embed_tokens",
"embeddings_layer_norm": "emb_layer_norm",
"relative_pos_embeddings": "relative_linear",
"ngram_embeddings": "ngram_input_embed",
"position_embeddings": "embed_positions",
}
print(f"Saving model to {pytorch_dump_folder_path}")
prophet.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--prophetnet_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_prophetnet_checkpoint_to_pytorch(args.prophetnet_checkpoint_path, args.pytorch_dump_folder_path)
.\models\prophetnet\modeling_prophetnet.py
import copy
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import LayerNorm
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_prophetnet import ProphetNetConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "ProphetNetConfig"
_CHECKPOINT_FOR_DOC = "microsoft/prophetnet-large-uncased"
PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/prophetnet-large-uncased",
]
PROPHETNET_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
Original ProphetNet code can be found [here](https://github.com/microsoft/ProphetNet). Checkpoints were converted
from original Fairseq checkpoints. For more information on the checkpoint conversion, please take a look at the
file `convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py`.
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.
Parameters:
config ([`ProphetNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
PROPHETNET_INPUTS_DOCSTRING = r"""
"""
PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding tokens are ignored by default.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values are selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values are selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See the returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See the returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
# compute the main-stream relative positions
main_stream_relative_positions = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1)
# subtract each position index to obtain relative positions
main_stream_relative_positions = main_stream_relative_positions - position_ids.unsqueeze(-1)
# compute the predict-stream relative positions
predicting_stream_relative_positions = torch.cat((position_ids - 1, position_ids), dim=-1).unsqueeze(1)
# repeat the predict-stream positions to match the main-stream length, then compute relative positions
predicting_stream_relative_positions = predicting_stream_relative_positions.repeat(1, position_ids.size(-1), 1)
predicting_stream_relative_positions = predicting_stream_relative_positions - position_ids.unsqueeze(-1)
# bucket the main-stream relative positions
main_relative_position_buckets = compute_relative_buckets(
num_buckets,  # number of buckets
max_distance,  # maximum relative distance
main_stream_relative_positions,  # main-stream relative positions
is_bidirectional=False,  # unidirectional bucketing
)
# bucket the predict-stream relative positions
predict_relative_position_buckets = compute_relative_buckets(
num_buckets,
max_distance,
predicting_stream_relative_positions,
is_bidirectional=False,
)
# return the main-stream and predict-stream relative position buckets
return main_relative_position_buckets, predict_relative_position_buckets
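# Tiny illustration of the "main stream" relative positions computed above for a length-3 sequence
# (toy values; shapes follow the code above): entry [i, j] is position_j - position_i.
import torch

position_ids = torch.tensor([[1, 2, 3]])  # [1, seq_len]
main = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1) - position_ids.unsqueeze(-1)
# main[0] == tensor([[ 0,  1,  2],
#                    [-1,  0,  1],
#                    [-2, -1,  0]])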
@dataclass
class ProphetNetSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
"""
# Optional loss tensor
loss: Optional[torch.FloatTensor] = None
# Prediction logits of the main stream
logits: torch.FloatTensor = None
# Prediction logits of the predict (ngram) stream
logits_ngram: Optional[torch.FloatTensor] = None
# Pre-computed key/value states that can speed up sequential decoding
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
# Decoder hidden states of the main stream
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Decoder hidden states of the predict (ngram) stream
decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Decoder self-attention weights of the main stream
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Decoder self-attention weights of the predict (ngram) stream
decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Cross-attention weights
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Last hidden state of the encoder
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
# Encoder hidden states
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Encoder attention weights
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@property
def decoder_cross_attentions(self):
# warn that `decoder_cross_attentions` is deprecated; use `cross_attentions` instead
warnings.warn(
"`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
" instead.",
FutureWarning,
)
# return the cross-attention weights
return self.cross_attentions
@dataclass
class ProphetNetSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.
"""
# Last hidden state of the main decoder stream
last_hidden_state: torch.FloatTensor
# Last hidden state of the predict (ngram) stream
last_hidden_state_ngram: Optional[torch.FloatTensor] = None
# Pre-computed key/value states that can speed up sequential decoding
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
# Decoder hidden states of the main stream
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Decoder hidden states of the predict (ngram) stream
decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Decoder self-attention weights of the main stream
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Decoder self-attention weights of the predict (ngram) stream
decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Cross-attention weights
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Last hidden state of the encoder
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
# Encoder hidden states
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Encoder attention weights
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@property
def decoder_cross_attentions(self):
# warn that `decoder_cross_attentions` is deprecated; use `cross_attentions` instead
warnings.warn(
"`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
" instead.",
FutureWarning,
)
# return the cross-attention weights
return self.cross_attentions
@dataclass
class ProphetNetDecoderModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
"""
# Last hidden state of the main decoder stream
last_hidden_state: torch.FloatTensor
# Last hidden state of the predict (ngram) stream
last_hidden_state_ngram: Optional[torch.FloatTensor] = None
# Pre-computed key/value states that can speed up sequential decoding
past_key_values: Optional[Tuple[torch.FloatTensor]] = None
# Hidden states of the main stream
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Hidden states of the predict (ngram) stream
hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
# Self-attention weights of the main stream
attentions: Optional[Tuple[torch.FloatTensor]] = None
# Self-attention weights of the predict (ngram) stream
ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ProphetNetDecoderLMOutput(ModelOutput):
"""
Model output class for the ProphetNet decoder, inheriting from ModelOutput.
Contains various tensors representing model predictions and intermediate states.
"""
loss: Optional[torch.FloatTensor] = None # Optional tensor for model training loss
logits: torch.FloatTensor = None # Tensor containing logits (predictions) from the decoder
logits_ngram: Optional[torch.FloatTensor] = None # Optional tensor for n-gram logits
past_key_values: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of past key/values for fast decoding
hidden_states: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of hidden states
hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of n-gram hidden states
attentions: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of attention tensors
ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of n-gram attention tensors
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None # Optional tuple of cross-attention tensors
class ProphetNetPreTrainedModel(PreTrainedModel):
"""
Base class for all models in the ProphetNet series, inheriting from PreTrainedModel.
"""
config_class = ProphetNetConfig # Configuration class for ProphetNet models
base_model_prefix = "prophetnet" # Prefix used for the base model
supports_gradient_checkpointing = True # Indicates whether the model supports gradient checkpointing
def _init_weights(self, module):
"""
Initialize weights of linear and embedding modules based on configuration.
"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.init_std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def _shift_right(self, input_ids):
"""
Shift input ids to the right for autoregressive decoding.
"""
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
assert decoder_start_token_id is not None, (
"self.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the"
" pad_token_id. See ProphetNet docs for more information"
)
# shift inputs to the right
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
# replace possible -100 values in labels by `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"
return shifted_input_ids
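# Standalone sketch of the right shift performed by `_shift_right`, assuming ProphetNet's usual
# decoder_start_token_id = 0 and pad_token_id = 0: labels move one step to the right and any -100
# (ignored label position) is replaced by the pad id.
import torch

labels = torch.tensor([[101, -100, 103, 104]])
shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()
shifted[..., 0] = 0  # decoder_start_token_id
shifted.masked_fill_(shifted == -100, 0)  # pad_token_id
# shifted == tensor([[  0, 101,   0, 103]])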
class ProphetNetPositionalEmbeddings(nn.Embedding):
"""
Positional embedding module for ProphetNet models.
Learns positional embeddings up to a fixed maximum size, handling padding ids.
"""
def __init__(self, config: ProphetNetConfig) -> None:
self.max_length = config.max_position_embeddings
super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)
# Forward pass: takes the input shape and device, plus optional attention mask, past key/values and position ids
def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
# if position ids are pre-computed, the padding index must not be set
assert (position_ids is None) or (
self.padding_idx is None
), "If position_ids is pre-computed then padding_idx should not be set."
# if no position ids are provided
if position_ids is None:
# if past key/values exist
if past_key_values is not None:
# position ids are identical for every token during single-step decoding;
# without the int() cast, ONNX export can fail in some cases
prev_num_input_ids = past_key_values[0][0].shape[2]
num_input_ids = inputs_shape[1] + prev_num_input_ids
# the position id is the padding index plus the number of input tokens
position_ids = torch.ones((1, 1), dtype=torch.long, device=device) * (
int(self.padding_idx + num_input_ids)
)
else:
# without past key/values and without an attention mask, build an all-ones attention mask
if attention_mask is None:
attention_mask = torch.ones(inputs_shape, dtype=torch.long, device=device)
# derive the position ids from the input tokens / attention mask
position_ids = (
torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
).long() + self.padding_idx
# make sure position ids do not exceed max_length - 1
position_ids = position_ids.clamp(0, self.max_length - 1)
# call the parent forward and also return the computed position ids
return super().forward(position_ids), position_ids
# private _forward that takes pre-computed position ids
def _forward(self, position_ids):
# call the parent forward with the given position ids
return super().forward(position_ids)
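# Minimal illustration of the cumsum trick used above: positions count only non-padded tokens and are
# offset by the padding index, so every padded position collapses onto padding_idx (assumed 0 here).
import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])
padding_idx = 0
position_ids = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() + padding_idx
# position_ids == tensor([[1, 2, 3, 0, 0]])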
class ProphetNetAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
config: ProphetNetConfig,
num_attn_heads: int,
):
super().__init__()
hidden_size = config.hidden_size
self.attention_dropout = config.attention_dropout  # attention dropout probability
self.dropout = config.dropout  # dropout probability on the output projection
self.num_attn_heads = num_attn_heads  # number of attention heads
self.head_dim = hidden_size // num_attn_heads  # dimension of each attention head
assert self.head_dim * num_attn_heads == hidden_size, (
"`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and"
" `config.num_decoder_attention_heads`"
)
self.key_proj = nn.Linear(hidden_size, hidden_size)  # key projection
self.value_proj = nn.Linear(hidden_size, hidden_size)  # value projection
self.query_proj = nn.Linear(hidden_size, hidden_size)  # query projection
self.out_proj = nn.Linear(hidden_size, hidden_size)  # output projection
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# reshape the tensor for multi-head attention: [bsz, num_heads, seq_len, head_dim]
return tensor.view(bsz, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states,
key_value_states: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
layer_head_mask: Optional[Tensor] = None,
past_key_value: Optional[Tuple[Tensor]] = None,
output_attentions: bool = False,
):
# forward pass performing the attention computation (body omitted in this walkthrough)
class ProphetNetNgramSelfAttention(nn.Module):
# constructor takes a ProphetNetConfig
def __init__(self, config: ProphetNetConfig):
# call the parent constructor
super().__init__()
# hidden size
self.hidden_size = config.hidden_size
# number of relative-position buckets
self.num_buckets = config.num_buckets
# maximum relative distance
self.relative_max_distance = config.relative_max_distance
# number of attention heads
self.num_attn_heads = config.num_decoder_attention_heads
# dropout probability
self.dropout = config.dropout
# attention dropout probability
self.attention_dropout = config.attention_dropout
# dimension of each attention head
self.head_dim = config.hidden_size // self.num_attn_heads
# ngram size
self.ngram = config.ngram
# the hidden size must be divisible by the number of attention heads
assert (
self.head_dim * self.num_attn_heads == config.hidden_size
), "config.hidden_size must be divisible by num_attn_heads"
# key, value and query projections
self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.query_proj = nn.Linear(config.hidden_size, config.hidden_size)
# output projection
self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
# relative position embeddings
self.relative_pos_embeddings = nn.Linear(config.hidden_size, self.num_buckets * self.num_attn_heads)
# flag used for ONNX export
self.onnx_trace = False
# reshape to (batch_size, num_attn_heads, seq_len, head_dim)
def _shape(self, tensor, seq_len, batch_size):
return tensor.view(batch_size, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()
# prepare the module for ONNX export
def prepare_for_onnx_export_(self):
self.onnx_trace = True
# forward pass; takes several inputs and returns the attention outputs
def forward(
self,
hidden_states,
past_key_value: Optional[Tuple[Tensor]] = None,
attention_mask=None,
layer_head_mask=None,
extended_predict_attention_mask=None,
main_relative_position_buckets=None,
predict_relative_position_buckets=None,
position_ids=None,
):
# (forward body omitted in this walkthrough)
def get_main_relative_pos_embeddings(
self, hidden_states, attn_weights, position_ids, main_relative_position_buckets
):
# input hidden_states [batch_size, sequence_length, hidden_size]
# input attn_weights [batch_size, num_heads, sequence_length, sequence_length]
# input position_ids [batch_size, sequence_length] or [1,1]
# unpack the dimensions of the attention weights
batch_size, num_attn_heads, tgt_len, src_len = attn_weights.shape
# reshape the attention weights for the computations below
attn_weights = attn_weights.view(batch_size, num_attn_heads, tgt_len, src_len)
# if no main relative position buckets are provided, compute them from scratch
if main_relative_position_buckets is None:
# shape of the hidden states
batch_size, sequence_length = hidden_states.shape[:2]
# relative positions: every key position minus the query position
relative_positions = (
torch.arange(1, attn_weights.shape[-1] + 1)
.unsqueeze(0)
.unsqueeze(0)
.repeat(batch_size, sequence_length, 1)
.to(position_ids.device)
)
relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
# bucket the main-stream relative positions
main_relative_position_buckets = compute_relative_buckets(
self.num_buckets, self.relative_max_distance, relative_positions, False
)
# project the hidden states to relative position embeddings
rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
# reshape so that buckets and attention heads get their own dimensions
rel_pos_embeddings = rel_pos_embeddings.view(
rel_pos_embeddings.shape[:2] + (self.num_buckets, self.num_attn_heads)
)
rel_pos_embeddings = rel_pos_embeddings.permute(0, 3, 1, 2)
# reshape to match the attention weights
rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:3] + (-1,))
# repeat the buckets for every attention head
main_relative_position_buckets = main_relative_position_buckets.repeat(1, self.num_attn_heads, 1)
# flatten the buckets so they can be used as gather indices
main_relative_position_buckets = main_relative_position_buckets.view(
-1, main_relative_position_buckets.shape[-1]
)
# gather indices must be of long dtype
main_relative_position_buckets = main_relative_position_buckets.long()
# flatten the relative position embeddings accordingly
rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))
# gather the embeddings that correspond to the main relative position buckets
main_relative_pos_embeddings = torch.gather(rel_pos_embeddings, dim=1, index=main_relative_position_buckets)
# restore the original shape
main_relative_pos_embeddings = main_relative_pos_embeddings.view(batch_size, num_attn_heads, tgt_len, -1)
# return the main relative position embeddings
return main_relative_pos_embeddings
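# Tiny torch.gather illustration (toy numbers) of the bucket lookup performed above: for each row we
# pick, per query position, the embedding column indexed by its relative-position bucket.
import torch

rel_pos_embeddings = torch.arange(12.0).view(3, 4)  # 3 rows, 4 buckets
buckets = torch.tensor([[0, 3], [1, 1], [2, 0]])  # bucket index per position
picked = torch.gather(rel_pos_embeddings, dim=1, index=buckets)
# picked == tensor([[ 0.,  3.], [ 5.,  5.], [10.,  8.]])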
def get_predict_relative_pos_embeddings(
self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets
):
# input hidden_states [batch_size, sequence_length, ngram, hidden_size]
# input attn_weights [batch_size, ngram, num_heads, sequence_length, 2*sequence_length]
# input position_ids [batch_size, sequence_length] or [1,1]
# input predict_relative_position_buckets [batch_size, sequence_length, 2*sequence_length] or None
# get batch_size and sequence_length
batch_size, sequence_length = hidden_states.shape[0:2]
# if no predict relative position buckets are provided, compute them here
if predict_relative_position_buckets is None:
# key sequence length from the attention weights
key_sequence_length = attn_weights.shape[-1]
# position_ids must have the format 1 2 3 4 5 ... (key_sequence_length - 1)
assert (
position_ids[0][0] == key_sequence_length - 1
), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
# build the relative positions
relative_positions = (
torch.arange(0, key_sequence_length)
.unsqueeze(0)
.unsqueeze(0)
.repeat(batch_size, sequence_length, 1)
.to(position_ids.device)
)
# subtract the position ids to obtain relative offsets
relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
# bucket the predict-stream relative positions
predict_relative_position_buckets = compute_relative_buckets(
self.num_buckets, self.relative_max_distance, relative_positions, False
)
# transpose hidden_states from [batch_size, sequence_length, ngram, hidden_size] to [batch_size, ngram, sequence_length, hidden_size]
hidden_states = hidden_states.transpose(1, 2)
# project to relative position embeddings
rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
# reshape to [batch_size, ngram, sequence_length, num_buckets, num_heads]
rel_pos_embeddings = rel_pos_embeddings.view(
hidden_states.shape[:-1] + (self.num_buckets, self.num_attn_heads)
)
# permute to [batch_size, ngram, num_heads, sequence_length, num_buckets]
rel_pos_embeddings = rel_pos_embeddings.permute(0, 2, 1, 4, 3)
# flatten to [batch_size * ngram * sequence_length * num_heads, num_buckets]
rel_pos_embeddings = rel_pos_embeddings.reshape(-1, self.num_buckets)
# expand the buckets to [ngram, batch_size, num_heads, sequence_length, -1]
predict_relative_position_buckets = predict_relative_position_buckets.unsqueeze(0)
predict_relative_position_buckets = predict_relative_position_buckets.repeat(
self.ngram, 1, self.num_attn_heads, 1
)
# flatten to [ngram * batch_size * num_heads * sequence_length, -1]
predict_relative_position_buckets = predict_relative_position_buckets.view(
-1, predict_relative_position_buckets.size(-1)
).long()
# gather the predict relative position embeddings
predict_relative_pos_embeddings = torch.gather(
rel_pos_embeddings, dim=1, index=predict_relative_position_buckets
)
# reshape back to [batch_size, ngram, num_heads, sequence_length, -1]
predict_relative_pos_embeddings = predict_relative_pos_embeddings.view(
batch_size, self.ngram, self.num_attn_heads, sequence_length, -1
)
# return the predict relative position embeddings
return predict_relative_pos_embeddings
class ProphetNetEncoderLayer(nn.Module):
"""
Encoder block for Prophetnet
"""
def __init__(self, config: ProphetNetConfig):
super().__init__()
# 1st residual block
# self-attention module built from ProphetNetAttention
self.self_attn = ProphetNetAttention(config, config.num_encoder_attention_heads)
# LayerNorm applied after self-attention
self.self_attn_layer_norm = LayerNorm(config.hidden_size)
# 2nd residual block
# feed-forward module built from ProphetNetFeedForward
self.feed_forward = ProphetNetFeedForward(config, config.encoder_ffn_dim)
# LayerNorm applied after the feed-forward module
self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
def forward(
self,
hidden_states,
attention_mask,
layer_head_mask,
output_attentions: bool = False,
):
# 1st residual block
# run self-attention; returns the attention output, the attention weights and an unused value
attention_output, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
# residual connection followed by LayerNorm
hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)
# 2nd residual block
# run the feed-forward module
feed_forward_output = self.feed_forward(hidden_states)
# residual connection followed by LayerNorm
hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class ProphetNetDecoderLayer(nn.Module):
"""
Decoder block for Prophetnet
"""
def __init__(self, config: ProphetNetConfig):
super().__init__()
# 1st residual block
# ngram self-attention module built from ProphetNetNgramSelfAttention
self.self_attn = ProphetNetNgramSelfAttention(config)
# LayerNorm applied after self-attention
self.self_attn_layer_norm = LayerNorm(config.hidden_size)
# 2nd residual block
# only add cross-attention if the config asks for it
if config.add_cross_attention:
# cross-attention module built from ProphetNetAttention
self.cross_attn = ProphetNetAttention(config, config.num_decoder_attention_heads)
# LayerNorm applied after cross-attention
self.cross_attn_layer_norm = LayerNorm(config.hidden_size)
# 3rd residual block
# decoder feed-forward module built from ProphetNetFeedForward
self.feed_forward = ProphetNetFeedForward(config, config.decoder_ffn_dim)
# LayerNorm applied after the feed-forward module
self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
def forward(
self,
hidden_states,
attention_mask=None,
encoder_hidden_states=None,
encoder_attn_mask=None,
layer_head_mask=None,
cross_attn_layer_head_mask=None,
extended_predict_attention_mask=None,
main_relative_position_buckets=None,
predict_relative_position_buckets=None,
position_ids=None,
past_key_value=None,
use_cache: bool = True,
output_attentions: bool = False,
):
# 1st residual block
# if past key/values exist, the first two entries are the self-attention cache; otherwise None
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# run the ngram self-attention; returns the output, both sets of attention weights and the present key/values
ngram_attention_output, self_attn_weights, self_attn_weights_ngram, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
extended_predict_attention_mask=extended_predict_attention_mask,
main_relative_position_buckets=main_relative_position_buckets,
predict_relative_position_buckets=predict_relative_position_buckets,
position_ids=position_ids,
)
# residual connection followed by LayerNorm
hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output)
# if past key/values exist, the last two entries are the cross-attention cache; otherwise None
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attn_weights = None
if encoder_hidden_states is not None:
# 2nd residual block
# run cross-attention; returns the output, the attention weights and the present key/values
attention_output, cross_attn_weights, cross_attn_present_key_value = self.cross_attn(
hidden_states=hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attn_mask,
layer_head_mask=cross_attn_layer_head_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
)
# residual connection followed by LayerNorm
hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states)
# append the cross-attention key/values to the present key/values (positions 3 and 4)
present_key_value = present_key_value + cross_attn_present_key_value
# 3rd residual block
# run the feed-forward module
feed_forward_output = self.feed_forward(hidden_states)
# residual connection followed by LayerNorm
hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
# the output tuple starts with the hidden states
outputs = (hidden_states,)
# optionally add the self-, ngram- and cross-attention weights
if output_attentions:
outputs += (self_attn_weights, self_attn_weights_ngram, cross_attn_weights)
# optionally add the present key/values when caching is enabled
if use_cache:
outputs += (present_key_value,)
# return the final outputs
return outputs
# Class wrapping the standalone encoder part of the ProphetNet model
@add_start_docstrings(
"The standalone encoder part of the ProphetNetModel.",
PROPHETNET_START_DOCSTRING,
)
class ProphetNetEncoder(ProphetNetPreTrainedModel):
"""
The ProphetNet encoder. The class can be initialized with pre-defined word embeddings instead of randomly
initialized word embeddings.
"""
def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
# initialize the ProphetNet encoder
super().__init__(config)
# word embedding layer; if none is provided, create a new one with the configured padding index
self.word_embeddings = (
word_embeddings
if word_embeddings is not None
else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
)
# positional embedding layer
self.position_embeddings = ProphetNetPositionalEmbeddings(config)
# LayerNorm applied to the embeddings
self.embeddings_layer_norm = LayerNorm(config.hidden_size)
# stack of encoder layers
self.layers = nn.ModuleList([
ProphetNetEncoderLayer(config)
for _ in range(config.num_encoder_layers)
])
# gradient checkpointing is off by default
self.gradient_checkpointing = False
# initialize weights and apply final processing
self.post_init()
# return the input word embeddings
def get_input_embeddings(self):
return self.word_embeddings
# set the input word embeddings
def set_input_embeddings(self, value):
self.word_embeddings = value
# forward pass of the encoder
@add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# (forward body omitted in this walkthrough)
# Class wrapping the standalone decoder part of the ProphetNet model
@add_start_docstrings(
"The standalone decoder part of the ProphetNetModel.",
PROPHETNET_START_DOCSTRING,
)
class ProphetNetDecoder(ProphetNetPreTrainedModel):
"""
The ProphetNet decoder. The class can be initialized with pre-defined word embeddings instead of randomly
initialized word embeddings.
"""
# Initializer of the ProphetNetDecoder: sets up all parameters and sub-modules
def __init__(self, config: ProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None):
# initialize the base model configuration
super().__init__(config)
# ngram size used by the model
self.ngram = config.ngram
# number of relative-position buckets
self.num_buckets = config.num_buckets
# maximum relative distance
self.relative_max_distance = config.relative_max_distance
# dropout probability
self.dropout = config.dropout
# maximum number of target positions
self.max_target_positions = config.max_position_embeddings
# word embedding layer; use the provided embeddings if given, otherwise create new ones
self.word_embeddings = (
word_embeddings
if word_embeddings is not None
else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
)
# positional embedding layer
self.position_embeddings = ProphetNetPositionalEmbeddings(config)
# ngram embedding layer
self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size, None)
# stack of decoder layers
self.layers = nn.ModuleList([ProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)])
# LayerNorm applied to the embeddings
self.embeddings_layer_norm = LayerNorm(config.hidden_size)
# gradient checkpointing is off by default
self.gradient_checkpointing = False
# initialize weights and apply final processing
self.post_init()
# return the input word embeddings
def get_input_embeddings(self):
return self.word_embeddings
# set the input word embeddings
def set_input_embeddings(self, value):
self.word_embeddings = value
# forward pass of the ProphetNetDecoder
@add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ProphetNetDecoderModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# (detailed argument descriptions are attached to the forward docstring by the decorators above;
# the forward body is omitted in this walkthrough)
def compute_buffered_relative_buckets(self, position_ids):
# get the batch size and sequence length
batch_size, sequence_length = position_ids.shape
# build position ids from 1 to self.max_target_positions on the right device
position_ids = torch.arange(1, self.max_target_positions).to(position_ids.device).repeat(1, 1)
# compute the main-stream and predict-stream relative buckets
main_relative_buckets, predict_relative_buckets = compute_all_stream_relative_buckets(
self.num_buckets, self.relative_max_distance, position_ids
)
# buffer (slice and repeat) the relative buckets
main_relative_buckets = main_relative_buckets[:, :sequence_length, :sequence_length].repeat(batch_size, 1, 1)
predict_relative_buckets = torch.cat(
[
predict_relative_buckets[:, :sequence_length, :sequence_length],
predict_relative_buckets[
:, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length
],
],
2,
).repeat(batch_size, 1, 1)
return main_relative_buckets, predict_relative_buckets
def prepare_attention_mask(self, hidden_states, attention_mask):
# get the batch size and sequence length
batch_size, seq_length = hidden_states.shape[:2]
# build the causal mask
causal_mask = torch.full(
(seq_length, seq_length),
torch.finfo(hidden_states.dtype).min,  # fill with the most negative value of the dtype
dtype=hidden_states.dtype,
device=hidden_states.device,
)
causal_mask = torch.triu(causal_mask, 1)  # keep only the strictly upper triangle (future positions)
extended_causal_mask = causal_mask[:seq_length, :seq_length][None, None, :, :].expand(
(batch_size, self.config.num_decoder_attention_heads) + causal_mask.shape
)
# add the regular (padding) attention mask if one was provided
if attention_mask is not None:
extended_attention_mask = (1.0 - attention_mask[:, None, None, :]) * torch.finfo(hidden_states.dtype).min
extended_attention_mask = extended_causal_mask + extended_attention_mask
else:
extended_attention_mask = extended_causal_mask
return extended_attention_mask.to(hidden_states.dtype)
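# Small sketch of the additive causal mask built above: positions strictly above the diagonal get the
# dtype's most negative value, so they vanish after the softmax inside attention.
import torch

seq_length = 4
neg_inf = torch.finfo(torch.float32).min
causal_mask = torch.triu(torch.full((seq_length, seq_length), neg_inf), 1)
# row i allows attention only to positions <= i; masked entries are ~ -3.4e38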
# prepare the attention mask used by the predict stream
def prepare_predict_attention_mask(self, hidden_states, attention_mask):
# get the batch size and sequence length
batch_size, seq_length = hidden_states.shape[:2]
# build the causal mask for the predict stream
predict_causal_mask = ngram_attention_bias(
self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype
)
# concatenate the two halves of the causal mask along the key dimension
predict_causal_mask = torch.cat(
[
predict_causal_mask[:, :seq_length, :seq_length],
predict_causal_mask[
:, :seq_length, self.max_target_positions : self.max_target_positions + seq_length
],
],
dim=-1,
)
# expand the causal mask over the batch dimension and the attention heads
extended_predict_causal_mask = predict_causal_mask[None, None, :, :, :].expand(
(batch_size, self.config.num_decoder_attention_heads) + predict_causal_mask.shape
)
# add the regular (padding) attention mask
if attention_mask is not None:
# build the extended attention mask; the predict-stream half is always kept at 0
extended_attention_mask = (1.0 - attention_mask[:, None, None, None, :]) * torch.finfo(self.dtype).min
extended_attention_mask = extended_attention_mask.expand(
(batch_size, self.config.num_decoder_attention_heads, self.ngram, seq_length, seq_length)
)
extended_attention_mask = torch.cat(
[extended_attention_mask, torch.zeros_like(extended_attention_mask)], dim=-1
)
extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask
else:
extended_predict_attention_mask = extended_predict_causal_mask
# cast to the hidden states' dtype and return
return extended_predict_attention_mask.to(hidden_states.dtype)
@add_start_docstrings(
"The bare ProphetNet Model outputting raw hidden-states without any specific head on top.",
PROPHETNET_START_DOCSTRING,
)
# 定义 ProphetNetModel 类,继承自 ProphetNetPreTrainedModel
class ProphetNetModel(ProphetNetPreTrainedModel):
# 定义 tied_weights_keys 列表,用于存储需要绑定权重的键名
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]
# 初始化方法,接收 ProphetNetConfig 类型的 config 参数
def __init__(self, config: ProphetNetConfig):
# 调用父类 ProphetNetPreTrainedModel 的初始化方法
super().__init__(config)
# 创建词嵌入层,使用 nn.Embedding 类,设置词汇量大小、隐藏层大小和填充标记ID
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
# 复制 config 以创建编码器的配置,设置为不是编码-解码器模式且不使用缓存
encoder_config = copy.deepcopy(config)
encoder_config.is_encoder_decoder = False
encoder_config.use_cache = False
# 创建编码器实例,使用 ProphetNetEncoder 类,并传入配置和词嵌入层
self.encoder = ProphetNetEncoder(encoder_config, self.word_embeddings)
# 复制 config 以创建解码器的配置,设置为解码器模式且不是编码-解码器模式
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
# 创建解码器实例,使用 ProphetNetDecoder 类,并传入配置和词嵌入层
self.decoder = ProphetNetDecoder(decoder_config, self.word_embeddings)
# 初始化权重并应用最终处理
self.post_init()
# 获取输入词嵌入层的方法
def get_input_embeddings(self):
return self.word_embeddings
# 设置输入词嵌入层的方法,接收 value 参数
def set_input_embeddings(self, value):
# 设置词嵌入层为 value
self.word_embeddings = value
# 设置编码器和解码器的词嵌入层为相同的 value
self.encoder.word_embeddings = self.word_embeddings
self.decoder.word_embeddings = self.word_embeddings
# 绑定权重的私有方法
def _tie_weights(self):
# 如果配置中指定了绑定词嵌入层的权重
if self.config.tie_word_embeddings:
# 将编码器和解码器的词嵌入层权重绑定到同一个实例
self._tie_or_clone_weights(self.encoder.word_embeddings, self.word_embeddings)
self._tie_or_clone_weights(self.decoder.word_embeddings, self.word_embeddings)
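As an illustration of what `_tie_or_clone_weights` achieves when `config.tie_word_embeddings` is enabled, the sketch below (toy sizes, names of my choosing) shows encoder and decoder embeddings ending up as the same `Parameter`, so a single gradient update moves both.

```python
# Illustrative sketch of weight tying: both modules hold the very same Parameter object.
import torch.nn as nn

shared = nn.Embedding(10, 4, padding_idx=0)
encoder_emb = nn.Embedding(10, 4, padding_idx=0)
decoder_emb = nn.Embedding(10, 4, padding_idx=0)

encoder_emb.weight = shared.weight  # tie: same Parameter, updated together
decoder_emb.weight = shared.weight
print(encoder_emb.weight is decoder_emb.weight)  # True
```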
# 获取编码器实例的方法
def get_encoder(self):
return self.encoder
# 获取解码器实例的方法
def get_decoder(self):
return self.decoder
# 前向传播方法,接收多个输入参数,并设置了输出文档的注释和返回值类型
@add_start_docstrings_to_model_forward(PROPHETNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ProphetNetSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 定义一个类变量,包含需要共享权重的模型层的名称列表
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]
# 初始化方法,接收一个ProphetNetConfig类型的配置对象作为参数
def __init__(self, config: ProphetNetConfig):
# 调用父类初始化方法,传入配置对象
super().__init__(config)
# 创建ProphetNetModel对象,并将其保存在self.prophetnet中
self.prophetnet = ProphetNetModel(config)
# 设置padding_idx为配置对象中的pad_token_id属性
self.padding_idx = config.pad_token_id
# 根据配置对象的disable_ngram_loss属性设置self.disable_ngram_loss
self.disable_ngram_loss = config.disable_ngram_loss
# 创建一个线性层,将输入维度设为配置对象的hidden_size,输出维度设为配置对象的vocab_size
# 不使用偏置项(bias=False)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# 调用初始化权重并应用最终处理方法
self.post_init()
# 返回lm_head作为输出的嵌入层对象
def get_output_embeddings(self):
return self.lm_head
# 将新的嵌入层对象赋值给lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# 如果配置对象指定了tie_word_embeddings,则共享权重
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.prophetnet.word_embeddings, self.lm_head)
# 返回prophetnet模型中的word_embeddings作为输入嵌入层对象
def get_input_embeddings(self):
return self.prophetnet.word_embeddings
# 前向传播方法,接受一系列可能为空的张量作为输入参数
@add_start_docstrings_to_model_forward(PROPHETNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ProphetNetSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 定义一个方法用于计算损失,输入参数包括 logits(预测值)、labels(真实标签)、ignore_index(忽略的索引,默认为-100)
def _compute_loss(self, logits, labels, ignore_index=-100):
# 创建一个与 labels 维度相同的全零张量,用于存储扩展后的标签,填充值为 ignore_index
expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
# 根据 config 中的 ngram 参数,扩展标签,将 labels 复制到 expend_targets 的不同维度中
for i in range(self.config.ngram):
if i > 0 and self.disable_ngram_loss:
break
expend_targets[i, :, :] = labels
# 调整 logits 的维度顺序,并确保其连续性
logits = logits.transpose(0, 1).contiguous()
# 计算 log_softmax,得到 lprobs,用于后续的负对数似然损失计算
lprobs = nn.functional.log_softmax(
logits.view(-1, logits.size(-1)), # 展平 logits 张量的前两个维度
dim=-1,
dtype=torch.float32,
)
# 使用负对数似然损失函数计算损失值,reduction 参数为 "mean" 表示计算平均损失
loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
# 如果 config 中的 eps 大于 0.0,则执行 label 平滑操作
if self.config.eps > 0.0:
# 计算平滑损失
smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
smooth_loss = smooth_loss[non_masked_tokens]
smooth_loss = smooth_loss.mean()
# 计算 eps_i
eps_i = self.config.eps / lprobs.size(-1)
# 结合 label 平滑和原始损失值,得到最终的损失值
loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
# 返回计算得到的损失值
return loss
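The toy reproduction below walks through the same n-gram loss on made-up tensors (assumed values: `ngram=2`, `eps=0.1`, tiny vocabulary): the labels are repeated along a new n-gram dimension, padded positions rely on `nll_loss`'s default `ignore_index=-100`, and label smoothing mixes in the mean negative log-probability mass.

```python
# Toy reproduction of _compute_loss above; shapes and values are illustrative only.
import torch
import torch.nn.functional as F

ngram, batch, seq, vocab, eps, ignore_index = 2, 1, 3, 5, 0.1, -100
logits = torch.randn(batch, ngram, seq, vocab)    # (batch, ngram, seq, vocab)
labels = torch.tensor([[1, 4, ignore_index]])     # (batch, seq); last token is padding

expend_targets = labels.new_zeros(ngram, batch, seq).fill_(ignore_index)
for i in range(ngram):
    expend_targets[i] = labels                    # every n-gram stream predicts the same targets

lprobs = F.log_softmax(logits.transpose(0, 1).reshape(-1, vocab), dim=-1, dtype=torch.float32)
loss = F.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")

smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
smooth_loss = smooth_loss[expend_targets.ne(ignore_index).view(-1)].mean()
loss = (1.0 - eps) * loss + (eps / vocab) * smooth_loss
print(loss)
```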
# 为生成准备输入的方法,返回一个包含所需输入的字典
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# 断言 encoder_outputs 参数不为 None,确保其在生成时被传递
assert encoder_outputs is not None, "`encoder_outputs` have to be passed for generation."
# 如果 past_key_values 存在,仅保留 decoder_input_ids 的最后一个 token
if past_key_values:
decoder_input_ids = decoder_input_ids[:, -1:]
# 返回包含生成所需输入的字典
return {
"input_ids": None, # encoder_outputs 已定义,不需要 input_ids
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
# 根据标签准备 decoder_input_ids 的静态方法
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self._shift_right(labels)
# 从 past_key_values 中重新排序缓存的静态方法,用于 beam search 生成
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
# 对每一层的过去状态执行重新排序,以适应 beam search 的索引变化
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
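Since the cached key/value tensors are batch-major, reordering for beam search is just a gather along dimension 0 with the surviving beam indices. A small standalone illustration (toy shapes):

```python
# Minimal sketch of what _reorder_cache does per cached tensor.
import torch

past_key = torch.arange(4 * 2 * 3 * 5, dtype=torch.float32).view(4, 2, 3, 5)  # (batch*beams, heads, seq, head_dim)
beam_idx = torch.tensor([2, 2, 0, 1])            # which beam each new hypothesis continues
reordered = past_key.index_select(0, beam_idx)
print(reordered.shape)                            # torch.Size([4, 2, 3, 5])
```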
# 获取 encoder 的方法,返回 prophetnet 模型的 encoder 部分
def get_encoder(self):
return self.prophetnet.encoder
# 获取 decoder 的方法,返回 prophetnet 模型的 decoder 部分
def get_decoder(self):
return self.prophetnet.decoder
# 为 ProphetNetForCausalLM 类添加文档字符串,描述其作为 ProphetNetModel 的解码器部分,用于有因果关系的语言建模
@add_start_docstrings(
"The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal"
" language modeling.",
PROPHETNET_START_DOCSTRING,
)
class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
# 定义绑定权重的关键词列表,用于共享或复制权重
_tied_weights_keys = [
"prophetnet.word_embeddings.weight",
"prophetnet.decoder.word_embeddings.weight",
"lm_head.weight",
]
# 初始化方法,接收 ProphetNetConfig 类型的配置参数
def __init__(self, config: ProphetNetConfig):
# 深拷贝配置对象,设置为解码器模式,关闭编码-解码模式
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
# 调用父类初始化方法
super().__init__(config)
# 创建 ProphetNetDecoderWrapper 对象
self.prophetnet = ProphetNetDecoderWrapper(config)
# 设置填充 token 的索引
self.padding_idx = config.pad_token_id
# 是否禁用 ngram 损失的标志
self.disable_ngram_loss = config.disable_ngram_loss
# 创建线性层 lm_head,用于预测词汇表中词的概率分布
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# 调用初始化权重和应用最终处理的方法
self.post_init()
# 获取输入嵌入的方法,返回 ProphetNet 解码器的词嵌入层
def get_input_embeddings(self):
return self.prophetnet.decoder.word_embeddings
# 设置输入嵌入的方法,设置 ProphetNet 解码器的词嵌入层
def set_input_embeddings(self, value):
self.prophetnet.decoder.word_embeddings = value
# 获取输出嵌入的方法,返回 lm_head 线性层,用于预测词汇表中词的概率分布
def get_output_embeddings(self):
return self.lm_head
# 设置输出嵌入的方法,设置 lm_head 线性层
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# 绑定权重的方法,如果配置指定了共享词嵌入,则共享 ProphetNet 解码器的词嵌入层和 lm_head 线性层
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.prophetnet.decoder.word_embeddings, self.lm_head)
# 设置解码器的方法,用给定的解码器替换当前的 ProphetNet 解码器
def set_decoder(self, decoder):
self.prophetnet.decoder = decoder
# 获取解码器的方法,返回当前 ProphetNet 解码器
def get_decoder(self):
return self.prophetnet.decoder
# 前向传播方法,执行 ProphetNet 解码器的前向传播,预测下一个词的分布
@add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ProphetNetDecoderLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# 前向传播的参数列表,支持 ProphetNetDecoderLMOutput 类型的输出
**kwargs,
):
def _compute_loss(self, logits, labels, ignore_index=-100):
# 创建一个与labels具有相同大小的张量,填充为ignore_index,用于扩展目标张量
expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
for i in range(self.config.ngram):
# 如果当前ngram大于0并且禁用了ngram损失,则退出循环
if i > 0 and self.disable_ngram_loss:
break
# 将labels复制到扩展目标张量的第i层
expend_targets[i, :, :] = labels
# 调整logits的维度顺序,并确保内存连续
logits = logits.transpose(0, 1).contiguous()
# 计算log_softmax以获取概率对数
lprobs = nn.functional.log_softmax(
logits.view(-1, logits.size(-1)),
dim=-1,
dtype=torch.float32,
)
# 计算负对数似然损失
loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
if self.config.eps > 0.0:
# 计算平滑损失,排除掩码标记,并计算平均值
smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
smooth_loss = smooth_loss[non_masked_tokens]
smooth_loss = smooth_loss.mean()
# 计算eps_i
eps_i = self.config.eps / lprobs.size(-1)
# 应用平滑损失到总损失中
loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
# 返回最终的损失值
return loss
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
use_cache=None,
**kwargs,
):
# 如果attention_mask为空,则创建全为1的张量,表示所有token都被attention
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
# 如果past_key_values存在,则仅使用最后一个token作为输入
input_ids = input_ids[:, -1:]
# 返回用于生成的输入字典
return {
"input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
"attention_mask": attention_mask,
"head_mask": head_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
# 重新排序缓存中的过去键值,以匹配beam search的顺序
# 从transformers.models.bart.modeling_bart.BartForCausalLM._reorder_cache复制而来
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
# 根据beam_idx重新排序每一层的过去状态
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
# 定义一个名为 ProphetNetDecoderWrapper 的类,继承自 ProphetNetPreTrainedModel 类
class ProphetNetDecoderWrapper(ProphetNetPreTrainedModel):
"""
This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
classes.
"""
# 初始化方法,接受一个 ProphetNetConfig 类型的参数 config
def __init__(self, config: ProphetNetConfig):
# 调用父类的初始化方法,传入 config 参数
super().__init__(config)
# 创建一个 nn.Embedding 对象,用于词嵌入,参数包括词汇表大小、隐藏层大小和填充标记的索引
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
# 创建 ProphetNetDecoder 对象,传入 config 参数和之前创建的词嵌入对象
self.decoder = ProphetNetDecoder(config, word_embeddings=self.word_embeddings)
# 初始化权重并应用最终处理
self.post_init()
# 方法,用于将词嵌入层的权重与解码器的输入词嵌入层权重相绑定
def _tie_weights(self):
self._tie_or_clone_weights(self.word_embeddings, self.decoder.get_input_embeddings())
# 前向传播方法,将调用解码器的前向传播方法
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
.\models\prophetnet\tokenization_prophetnet.py
import collections
import os
import unicodedata
from typing import Iterable, List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/prophetnet-large-uncased": (
"https://huggingface.co/microsoft/prophetnet-large-uncased/resolve/main/prophetnet.tokenizer"
),
}
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/prophetnet-large-uncased": {"do_lower_case": True},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/prophetnet-large-uncased": 512,
}
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
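A quick usage check of the helper just defined: it only strips surrounding whitespace and splits on runs of whitespace, nothing more.

```python
print(whitespace_tokenize("  ProphetNet predicts  future n-grams \n"))
# ['ProphetNet', 'predicts', 'future', 'n-grams']
```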
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
Kept for backward compatibility purposes. Now implemented directly at the base class level (see
[`PreTrainedTokenizer.tokenize`]) List of token not to split.
"""
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
if self.do_lower_case:
token = token.lower()
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""根据标点符号分割文本。"""
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""在每个中日韩(CJK)字符周围添加空格。"""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""检查给定的码点是否是中日韩字符的码点。"""
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)
):
return True
return False
def _clean_text(self, text):
"""对文本执行无效字符删除和空白字符清理。"""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
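A usage sketch of the `WordpieceTokenizer` above with a toy vocabulary, showing the greedy longest-match-first behaviour described in the docstring; the vocabulary and tokens are illustrative.

```python
toy_vocab = {"un", "##aff", "##able", "runs", "[UNK]"}
wp = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")
print(wp.tokenize("unaffable"))    # ['un', '##aff', '##able']
print(wp.tokenize("unknownword"))  # ['[UNK]'] -- the toy vocab cannot cover the whole word
```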
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
class ProphetNetTokenizer(PreTrainedTokenizer):
r"""
Construct a ProphetNetTokenizer. Based on WordPiece.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (`bool`, *optional*, defaults to `True`):
Whether or not to do basic tokenization before WordPiece.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
x_sep_token (`str`, *optional*, defaults to `"[X_SEP]"`):
Special second separator token, which can be generated by [`ProphetNetForConditionalGeneration`]. It is
used to separate bullet-point like sentences in summarization, *e.g.*.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
"""
# Define constants related to vocabulary files, pretrained models, and configurations
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Define model input names required for `tokenizer.pad(...)` to function correctly
# For `ProphetNet`, `token_type_ids` is not a required argument.
model_input_names: List[str] = ["input_ids", "attention_mask"]
# 初始化方法,接受多个参数来配置分词器实例
def __init__(
self,
vocab_file: str,
do_lower_case: Optional[bool] = True,
do_basic_tokenize: Optional[bool] = True,
never_split: Optional[Iterable] = None,
unk_token: Optional[str] = "[UNK]",
sep_token: Optional[str] = "[SEP]",
x_sep_token: Optional[str] = "[X_SEP]",
pad_token: Optional[str] = "[PAD]",
mask_token: Optional[str] = "[MASK]",
tokenize_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = None,
**kwargs,
):
# 检查给定的词汇文件是否存在,如果不存在则抛出异常
if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
# 加载词汇表文件内容到实例变量中
self.vocab = load_vocab(vocab_file)
# 创建一个从id到token的有序字典,以便根据id查找对应的token
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
# 根据参数决定是否进行基本分词
self.do_basic_tokenize = do_basic_tokenize
# 如果需要进行基本分词,则初始化BasicTokenizer实例
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
# 使用给定的词汇表和未知token初始化WordpieceTokenizer实例
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
# 调用父类的初始化方法,传递相同的参数和额外的关键字参数
super().__init__(
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
x_sep_token=x_sep_token,
pad_token=pad_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# 返回词汇表大小的属性方法
@property
def vocab_size(self):
return len(self.vocab)
# 返回包含词汇表和添加token编码器的字典
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
# 对给定文本进行分词,返回分词后的token列表
def _tokenize(self, text):
split_tokens = []
# 如果需要进行基本分词
if self.do_basic_tokenize:
# 使用BasicTokenizer分词器对文本进行分词
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
# 如果token在never_split集合中,则直接添加到分词结果列表中
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
# 否则,使用WordpieceTokenizer对token进行进一步分词,并将结果扩展到split_tokens列表中
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
# 如果不需要基本分词,则直接使用WordpieceTokenizer对文本进行分词
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
# 根据token查找其在词汇表中对应的id,如果不存在则返回unk_token对应的id
def _convert_token_to_id(self, token: str):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
# 根据id查找其在词汇表中对应的token,如果不存在则返回unk_token
def _convert_id_to_token(self, index: int):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens: str):
"""
Converts a sequence of tokens (string) into a single string.
Args:
tokens (`str`): A sequence of tokens.
Returns:
`str`: The concatenated string without '##' symbols.
"""
# Join tokens into a single string, remove '##' and strip leading/trailing spaces
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: Optional[bool] = False,
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`): List of IDs.
token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*): Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if token_ids_1 is None:
# Return a list of zeros of the same length as token_ids_0, with a single 1 appended
return ([0] * len(token_ids_0)) + [1]
else:
# Return a list of zeros of the combined length of token_ids_0 and token_ids_1, each followed by a 1
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
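A short worked example of the layout the method returns, reproduced in plain Python (the ids are arbitrary placeholders): the 1s mark only the trailing `[SEP]` positions that `build_inputs_with_special_tokens` appends.

```python
token_ids_0, token_ids_1 = [5, 6], [8, 9, 10]
single = [0] * len(token_ids_0) + [1]
pair = [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
print(single)  # [0, 0, 1]
print(pair)    # [0, 0, 1, 0, 0, 0, 1]
```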
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ProphetNet
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`): List of IDs.
token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""
sep = [self.sep_token_id]
if token_ids_1 is None:
# Return a list of zeros with a length equal to the sum of token_ids_0 and one separator token
return len(token_ids_0 + sep) * [0]
else:
# Return a list of zeros with a length equal to the combined sum of token_ids_0, token_ids_1, and two separator tokens
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# 将词汇表保存到指定目录下的文件中
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# 初始化索引
index = 0
# 检查保存目录是否存在
if os.path.isdir(save_directory):
# 构建词汇表文件路径,包括可选的文件名前缀和默认的词汇表文件名
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
# 如果保存目录不存在,则直接将其作为文件路径
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
# 打开文件,写入词汇表内容
with open(vocab_file, "w", encoding="utf-8") as writer:
# 遍历词汇表中的每个词汇和对应的索引
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
# 检查当前索引是否连续
if index != token_index:
# 如果不连续,记录警告信息
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
# 更新索引
index = token_index
# 将词汇写入文件,每个词汇后面加上换行符
writer.write(token + "\n")
# 更新索引
index += 1
# 返回保存的文件路径,以元组形式返回
return (vocab_file,)
# Build model inputs that include the special tokens
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
and adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input IDs with the appropriate special tokens.
"""
# If there is no second sequence, simply append the separator token to the first sequence
if token_ids_1 is None:
return token_ids_0 + [self.sep_token_id]
# Build the separator token list
sep = [self.sep_token_id]
# Return the two sequences joined, each followed by a separator token
return token_ids_0 + sep + token_ids_1 + sep
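A worked example of the single-sequence and pair layouts built above; `102` is only a placeholder separator id, the real value comes from the vocabulary file.

```python
sep_token_id = 102  # placeholder; the actual id depends on the loaded vocabulary
ids_a, ids_b = [11, 12, 13], [21, 22]
print(ids_a + [sep_token_id])                           # X [SEP]
print(ids_a + [sep_token_id] + ids_b + [sep_token_id])  # A [SEP] B [SEP]
```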
.\models\prophetnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig"],
"tokenization_prophetnet": ["ProphetNetTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_prophetnet"] = [
"PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"ProphetNetDecoder",
"ProphetNetEncoder",
"ProphetNetForCausalLM",
"ProphetNetForConditionalGeneration",
"ProphetNetModel",
"ProphetNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig
from .tokenization_prophetnet import ProphetNetTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_prophetnet import (
PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST,
ProphetNetDecoder,
ProphetNetEncoder,
ProphetNetForCausalLM,
ProphetNetForConditionalGeneration,
ProphetNetModel,
ProphetNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\pvt\configuration_pvt.py
""" Pvt model configuration"""
from collections import OrderedDict
from typing import Callable, List, Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
PVT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"pvt-tiny-224": "https://huggingface.co/Zetatech/pvt-tiny-224",
}
class PvtConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PvtModel`]. It is used to instantiate an Pvt
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Pvt
[Xrenya/pvt-tiny-224](https://huggingface.co/Xrenya/pvt-tiny-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
Args:
image_size (`int`, *optional*, defaults to 224):
输入图像的大小,默认为224
num_channels (`int`, *optional*, defaults to 3):
输入通道的数量,默认为3
num_encoder_blocks (`int`, *optional*, defaults to 4):
编码器块的数量(Mix Transformer 编码器中的阶段数),默认为4
depths (`List[int]`, *optional*, defaults to `[2, 2, 2, 2]`):
每个编码器块中的层数,默认为 `[2, 2, 2, 2]`
sequence_reduction_ratios (`List[int]`, *optional*, defaults to `[8, 4, 2, 1]`):
每个编码器块中的序列减少比例,默认为 `[8, 4, 2, 1]`
hidden_sizes (`List[int]`, *optional*, defaults to `[64, 128, 320, 512]`):
每个编码器块的维度,默认为 `[64, 128, 320, 512]`
patch_sizes (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
每个编码器块之前的补丁大小,默认为 `[4, 2, 2, 2]`
strides (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
每个编码器块之前的步长,默认为 `[4, 2, 2, 2]`
num_attention_heads (`List[int]`, *optional*, defaults to `[1, 2, 5, 8]`):
每个 Transformer 编码器块中每个注意力层的注意力头数,默认为 `[1, 2, 5, 8]`
mlp_ratios (`List[int]`, *optional*, defaults to `[8, 8, 4, 4]`):
Mix FFNs 中隐藏层大小与输入层大小的比例,默认为 `[8, 8, 4, 4]`
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
编码器和池化器中的非线性激活函数(函数或字符串),支持 `"gelu"`, `"relu"`, `"selu"` 和 `"gelu_new"`, 默认为 `"gelu"`
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
嵌入层、编码器和池化器中所有全连接层的 dropout 概率,默认为 0.0
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
注意力概率的 dropout 比率,默认为 0.0
initializer_range (`float`, *optional*, defaults to 0.02):
初始化所有权重矩阵的截断正态分布的标准差,默认为 0.02
drop_path_rate (`float`, *optional*, defaults to 0.0):
用于随机深度的 dropout 概率,在 Transformer 编码器的块中使用,默认为 0.0
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
层归一化层使用的 epsilon,默认为 1e-06
qkv_bias (`bool`, *optional*, defaults to `True`):
是否为查询、键和值添加可学习偏置,默认为 True
num_labels ('int', *optional*, defaults to 1000):
类别数量,默认为 1000
Example:
```python
>>> from transformers import PvtModel, PvtConfig
>>>
>>> configuration = PvtConfig()
>>>
>>> model = PvtModel(configuration)
>>>
>>> configuration = model.config
```"""
model_type = "pvt"
def __init__(
self,
image_size: int = 224,
num_channels: int = 3,
num_encoder_blocks: int = 4,
depths: List[int] = [2, 2, 2, 2],
sequence_reduction_ratios: List[int] = [8, 4, 2, 1],
hidden_sizes: List[int] = [64, 128, 320, 512],
patch_sizes: List[int] = [4, 2, 2, 2],
strides: List[int] = [4, 2, 2, 2],
num_attention_heads: List[int] = [1, 2, 5, 8],
mlp_ratios: List[int] = [8, 8, 4, 4],
hidden_act: Mapping[str, Callable] = "gelu",
hidden_dropout_prob: float = 0.0,
attention_probs_dropout_prob: float = 0.0,
initializer_range: float = 0.02,
drop_path_rate: float = 0.0,
layer_norm_eps: float = 1e-6,
qkv_bias: bool = True,
num_labels: int = 1000,
**kwargs,
):
super().__init__(**kwargs)
self.image_size = image_size
self.num_channels = num_channels
self.num_encoder_blocks = num_encoder_blocks
self.depths = depths
self.sequence_reduction_ratios = sequence_reduction_ratios
self.hidden_sizes = hidden_sizes
self.patch_sizes = patch_sizes
self.strides = strides
self.mlp_ratios = mlp_ratios
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.drop_path_rate = drop_path_rate
self.layer_norm_eps = layer_norm_eps
self.num_labels = num_labels
self.qkv_bias = qkv_bias
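A usage sketch of the configuration class defined above: a smaller variant can be instantiated by overriding a few defaults (the values here are illustrative, not an official checkpoint configuration).

```python
config = PvtConfig(depths=[2, 2, 2, 2], hidden_sizes=[32, 64, 160, 256], num_labels=10)
print(config.num_encoder_blocks, config.hidden_sizes)  # 4 [32, 64, 160, 256]
```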
class PvtOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
@property
def default_onnx_opset(self) -> int:
return 12
.\models\pvt\convert_pvt_to_pytorch.py
"""Convert Pvt checkpoints from the original library."""
import argparse
from pathlib import Path
import requests
import torch
from PIL import Image
from transformers import PvtConfig, PvtForImageClassification, PvtImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config):
rename_keys = []
rename_keys.extend(
[
("cls_token", "pvt.encoder.patch_embeddings.3.cls_token"),
]
)
rename_keys.extend(
[
("norm.weight", "pvt.encoder.layer_norm.weight"),
("norm.bias", "pvt.encoder.layer_norm.bias"),
("head.weight", "classifier.weight"),
("head.bias", "classifier.bias"),
]
)
return rename_keys
def read_in_k_v(state_dict, config):
for i in range(config.num_encoder_blocks):
for j in range(config.depths[i]):
kv_weight = state_dict.pop(f"pvt.encoder.block.{i}.{j}.attention.self.kv.weight")
kv_bias = state_dict.pop(f"pvt.encoder.block.{i}.{j}.attention.self.kv.bias")
state_dict[f"pvt.encoder.block.{i}.{j}.attention.self.key.weight"] = kv_weight[: config.hidden_sizes[i], :]
state_dict[f"pvt.encoder.block.{i}.{j}.attention.self.key.bias"] = kv_bias[: config.hidden_sizes[i]]
state_dict[f"pvt.encoder.block.{i}.{j}.attention.self.value.weight"] = kv_weight[
config.hidden_sizes[i] :, :
]
state_dict[f"pvt.encoder.block.{i}.{j}.attention.self.value.bias"] = kv_bias[config.hidden_sizes[i] :]
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_pvt_checkpoint(pvt_size, pvt_checkpoint, pytorch_dump_folder_path):
"""
Copy/paste/tweak model's weights to our PVT structure.
"""
if pvt_size == "tiny":
config_path = "Zetatech/pvt-tiny-224"
elif pvt_size == "small":
config_path = "Zetatech/pvt-small-224"
elif pvt_size == "medium":
config_path = "Zetatech/pvt-medium-224"
elif pvt_size == "large":
config_path = "Zetatech/pvt-large-224"
else:
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but " f"'{pvt_size}' was given")
config = PvtConfig(name_or_path=config_path)
state_dict = torch.load(pvt_checkpoint, map_location="cpu")
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_k_v(state_dict, config)
model = PvtForImageClassification(config).eval()
model.load_state_dict(state_dict)
image_processor = PvtImageProcessor(size=config.image_size)
encoding = image_processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values)
logits = outputs.logits.detach().cpu()
if pvt_size == "tiny":
expected_slice_logits = torch.tensor([-1.4192, -1.9158, -0.9702])
elif pvt_size == "small":
expected_slice_logits = torch.tensor([0.4353, -0.1960, -0.2373])
elif pvt_size == "medium":
expected_slice_logits = torch.tensor([-0.2914, -0.2231, 0.0321])
elif pvt_size == "large":
expected_slice_logits = torch.tensor([0.3740, -0.7739, -0.4214])
else:
raise ValueError(f"Available model's size: 'tiny', 'small', 'medium', 'large', but " f"'{pvt_size}' was given")
assert torch.allclose(logits[0, :3], expected_slice_logits, atol=1e-4)
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model pytorch_model.bin to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pvt_size",
default="tiny",
type=str,
help="Size of the PVT pretrained model you'd like to convert.",
)
parser.add_argument(
"--pvt_checkpoint",
default="pvt_tiny.pth",
type=str,
help="Checkpoint of the PVT pretrained model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
args = parser.parse_args()
convert_pvt_checkpoint(args.pvt_size, args.pvt_checkpoint, args.pytorch_dump_folder_path)
.\models\pvt\image_processing_pvt.py
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
"""
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
def __init__(
self,
do_resize: bool = True,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
size = get_size_dict(size)
self.do_resize = do_resize
self.do_rescale = do_rescale
self.do_normalize = do_normalize
self.size = size
self.resample = resample
self.rescale_factor = rescale_factor
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"return_tensors",
"data_format",
"input_data_format",
]
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
images: ImageInput,
do_resize: Optional[bool] = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
.\models\pvt\modeling_pvt.py
""" PyTorch PVT model."""
import collections
import math
from typing import Iterable, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_pvt import PvtConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "PvtConfig"
_CHECKPOINT_FOR_DOC = "Zetatech/pvt-tiny-224"
_EXPECTED_OUTPUT_SHAPE = [1, 50, 512]
_IMAGE_CLASS_CHECKPOINT = "Zetatech/pvt-tiny-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
PVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Zetatech/pvt-tiny-224"
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
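A quick check of the `drop_path` function above: in training mode each sample is either zeroed or scaled by `1 / keep_prob`, so the expected value of the output matches the input, and at inference time it is the identity.

```python
import torch

x = torch.ones(8, 3, 2)
out = drop_path(x, drop_prob=0.25, training=True)
print(out[:, 0, 0])                          # each sample is either 0.0 or 1 / 0.75 ≈ 1.3333
print(drop_path(x, 0.25, False).equal(x))    # True: identity at inference time
```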
class PvtDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class PvtPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(
self,
config: PvtConfig,
image_size: Union[int, Iterable[int]],
patch_size: Union[int, Iterable[int]],
stride: int,
num_channels: int,
hidden_size: int,
cls_token: bool = False,
):
super().__init__()
self.config = config
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.position_embeddings = nn.Parameter(
torch.randn(1, num_patches + 1 if cls_token else num_patches, hidden_size)
)
self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size)) if cls_token else None
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=stride, stride=patch_size)
self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
num_patches = height * width
if num_patches == self.config.image_size * self.config.image_size:
return self.position_embeddings
embeddings = embeddings.reshape(1, height, width, -1).permute(0, 3, 1, 2)
interpolated_embeddings = F.interpolate(embeddings, size=(height, width), mode="bilinear")
interpolated_embeddings = interpolated_embeddings.reshape(1, -1, height * width).permute(0, 2, 1)
return interpolated_embeddings
def forward(self, pixel_values: torch.Tensor) -> Tuple[torch.Tensor, int, int]:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
patch_embed = self.projection(pixel_values)
*_, height, width = patch_embed.shape
patch_embed = patch_embed.flatten(2).transpose(1, 2)
embeddings = self.layer_norm(patch_embed)
if self.cls_token is not None:
cls_token = self.cls_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_token, embeddings), dim=1)
position_embeddings = self.interpolate_pos_encoding(self.position_embeddings[:, 1:], height, width)
position_embeddings = torch.cat((self.position_embeddings[:, :1], position_embeddings), dim=1)
else:
position_embeddings = self.interpolate_pos_encoding(self.position_embeddings, height, width)
embeddings = self.dropout(embeddings + position_embeddings)
return embeddings, height, width
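A standalone shape sketch of the patch projection used above (stage-1 numbers from the default config; the snippet builds its own Conv2d rather than the module): a strided convolution turns `(batch, channels, H, W)` pixels into `(batch, num_patches, hidden_size)` tokens via flatten-and-transpose.

```python
import torch
from torch import nn

pixel_values = torch.randn(1, 3, 224, 224)
projection = nn.Conv2d(3, 64, kernel_size=4, stride=4)   # first PVT stage: patch/stride 4
patch_embed = projection(pixel_values)                    # (1, 64, 56, 56)
tokens = patch_embed.flatten(2).transpose(1, 2)           # (1, 3136, 64) = 56 * 56 patches
print(patch_embed.shape, tokens.shape)
```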
class PvtSelfOutput(nn.Module):
def __init__(self, config: PvtConfig, hidden_size: int):
super().__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class PvtEfficientSelfAttention(nn.Module):
"""Efficient self-attention mechanism with reduction of the sequence [PvT paper](https://arxiv.org/abs/2102.12122)."""
def __init__(
self, config: PvtConfig, hidden_size: int, num_attention_heads: int, sequences_reduction_ratio: float
):
super().__init__()
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
f"heads ({self.num_attention_heads})"
)
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.sequences_reduction_ratio = sequences_reduction_ratio
if sequences_reduction_ratio > 1:
self.sequence_reduction = nn.Conv2d(
hidden_size, hidden_size, kernel_size=sequences_reduction_ratio, stride=sequences_reduction_ratio
)
self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
def transpose_for_scores(self, hidden_states: int) -> torch.Tensor:
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
hidden_states = hidden_states.view(new_shape)
return hidden_states.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
height: int,
width: int,
output_attentions: bool = False,
) -> Tuple[torch.Tensor]:
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.sequences_reduction_ratio > 1:
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = self.sequence_reduction(hidden_states)
hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
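A standalone shape sketch of the sequence reduction applied above (stage-1 numbers from the default config): keys and values are computed on a spatially down-sampled copy of the hidden states, so the attention score matrix shrinks from N x N to N x (N / r^2) while queries keep full resolution.

```python
import torch
from torch import nn

batch, height, width, channels, ratio = 1, 56, 56, 64, 8
hidden_states = torch.randn(batch, height * width, channels)        # (1, 3136, 64)
reduction = nn.Conv2d(channels, channels, kernel_size=ratio, stride=ratio)
reduced = reduction(hidden_states.permute(0, 2, 1).reshape(batch, channels, height, width))
reduced = reduced.reshape(batch, channels, -1).permute(0, 2, 1)      # (1, 49, 64)
print(hidden_states.shape, reduced.shape)
```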
class PvtAttention(nn.Module):
def __init__(
self, config: PvtConfig, hidden_size: int, num_attention_heads: int, sequences_reduction_ratio: float
):
super().__init__()
self.self = PvtEfficientSelfAttention(
config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequences_reduction_ratio=sequences_reduction_ratio,
)
self.output = PvtSelfOutput(config, hidden_size=hidden_size)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False
) -> Tuple[torch.Tensor]:
self_outputs = self.self(hidden_states, height, width, output_attentions)
attention_output = self.output(self_outputs[0])
outputs = (attention_output,) + self_outputs[1:]
return outputs
class PvtFFN(nn.Module):
def __init__(
self,
config: PvtConfig,
in_features: int,
hidden_features: Optional[int] = None,
out_features: Optional[int] = None,
):
super().__init__()
out_features = out_features if out_features is not None else in_features
self.dense1 = nn.Linear(in_features, hidden_features)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dense2 = nn.Linear(hidden_features, out_features)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense1(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense2(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class PvtLayer(nn.Module):
def __init__(
self,
config: PvtConfig,
hidden_size: int,
num_attention_heads: int,
drop_path: float,
sequences_reduction_ratio: float,
mlp_ratio: float,
):
super().__init__()
self.layer_norm_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
self.attention = PvtAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
sequences_reduction_ratio=sequences_reduction_ratio,
)
self.drop_path = PvtDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.layer_norm_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = PvtFFN(config=config, in_features=hidden_size, hidden_features=mlp_hidden_size)
def forward(self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False):
self_attention_outputs = self.attention(
hidden_states=self.layer_norm_1(hidden_states),
height=height,
width=width,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states))
mlp_output = self.drop_path(mlp_output)
layer_output = hidden_states + mlp_output
outputs = (layer_output,) + outputs
return outputs
class PvtEncoder(nn.Module):
def __init__(self, config: PvtConfig):
super().__init__()
self.config = config
drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()
embeddings = []
for i in range(config.num_encoder_blocks):
embeddings.append(
PvtPatchEmbeddings(
config=config,
image_size=config.image_size if i == 0 else self.config.image_size // (2 ** (i + 1)),
patch_size=config.patch_sizes[i],
stride=config.strides[i],
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
hidden_size=config.hidden_sizes[i],
cls_token=i == config.num_encoder_blocks - 1,
)
)
self.patch_embeddings = nn.ModuleList(embeddings)
blocks = []
cur = 0
for i in range(config.num_encoder_blocks):
layers = []
if i != 0:
cur += config.depths[i - 1]
for j in range(config.depths[i]):
layers.append(
PvtLayer(
config=config,
hidden_size=config.hidden_sizes[i],
num_attention_heads=config.num_attention_heads[i],
drop_path=drop_path_decays[cur + j],
sequences_reduction_ratio=config.sequence_reduction_ratios[i],
mlp_ratio=config.mlp_ratios[i],
)
)
blocks.append(nn.ModuleList(layers))
self.block = nn.ModuleList(blocks)
self.layer_norm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = pixel_values.shape[0]
num_blocks = len(self.block)
hidden_states = pixel_values
for idx, (embedding_layer, block_layer) in enumerate(zip(self.patch_embeddings, self.block)):
hidden_states, height, width = embedding_layer(hidden_states)
for block in block_layer:
layer_outputs = block(hidden_states, height, width, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if idx != num_blocks - 1:
hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
hidden_states = self.layer_norm(hidden_states)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
PVT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~PvtConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
PVT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`PvtImageProcessor.__call__`]
for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Pvt encoder outputting raw hidden-states without any specific head on top.",
PVT_START_DOCSTRING,
)
class PvtModel(PvtPreTrainedModel):
def __init__(self, config: PvtConfig):
super().__init__(config)
self.config = config
self.encoder = PvtEncoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(PVT_INPUTS_DOCSTRING.format("(batch_size, channels, height, width)"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
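As a quick sanity check of the shapes produced by `PvtModel`, here is a minimal sketch (not part of the original file) that runs a randomly initialized model built from the default `PvtConfig`. The exact sequence length depends on the configured image size, the strides, and the CLS token prepended by the last stage's patch embedding.
```
import torch
from transformers import PvtConfig, PvtModel

# Randomly initialized model from the default configuration (no pretrained weights).
config = PvtConfig()
model = PvtModel(config).eval()

pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
with torch.no_grad():
    outputs = model(pixel_values)

# With a 224x224 input and strides [4, 2, 2, 2], the final stage is 7x7 = 49 patches
# plus one CLS token, so this should print roughly (1, 50, hidden_sizes[-1]).
print(outputs.last_hidden_state.shape)
```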
@add_start_docstrings(
"""
Pvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
PVT_START_DOCSTRING,
)
class PvtForImageClassification(PvtPreTrainedModel):
def __init__(self, config: PvtConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.pvt = PvtModel(config)
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(PVT_INPUTS_DOCSTRING.format("(batch_size, channels, height, width)"))
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor],
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.pvt(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output[:, 0, :])
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
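A brief, hedged usage sketch of the v1 classification head (not from the original file): the logits come from a single linear layer applied to the CLS token, i.e. `sequence_output[:, 0, :]`.
```
import torch
from transformers import PvtConfig, PvtForImageClassification

# Randomly initialized classifier with 10 labels (no pretrained weights); purely illustrative.
config = PvtConfig(num_labels=10)
model = PvtForImageClassification(config).eval()

pixel_values = torch.randn(2, 3, config.image_size, config.image_size)
with torch.no_grad():
    logits = model(pixel_values).logits

# The head is a linear layer on the CLS token, so logits are (batch_size, num_labels).
print(logits.shape)  # torch.Size([2, 10])
```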
.\models\pvt\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_pvt": ["PVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "PvtConfig", "PvtOnnxConfig"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_pvt"] = ["PvtImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_pvt"] = [
"PVT_PRETRAINED_MODEL_ARCHIVE_LIST",
"PvtForImageClassification",
"PvtModel",
"PvtPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_pvt import PVT_PRETRAINED_CONFIG_ARCHIVE_MAP, PvtConfig, PvtOnnxConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_pvt import PvtImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_pvt import (
PVT_PRETRAINED_MODEL_ARCHIVE_LIST,
PvtForImageClassification,
PvtModel,
PvtPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\pvt_v2\configuration_pvt_v2.py
"""Pvt V2 模型配置"""
from typing import Callable, List, Tuple, Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
PVT_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"pvt_v2_b0": "https://huggingface.co/OpenGVLab/pvt_v2_b0",
"pvt_v2_b1": "https://huggingface.co/OpenGVLab/pvt_v2_b1",
"pvt_v2_b2": "https://huggingface.co/OpenGVLab/pvt_v2_b2",
"pvt_v2_b2_linear": "https://huggingface.co/OpenGVLab/pvt_v2_b2_linear",
"pvt_v2_b3": "https://huggingface.co/OpenGVLab/pvt_v2_b3",
"pvt_v2_b4": "https://huggingface.co/OpenGVLab/pvt_v2_b4",
"pvt_v2_b5": "https://huggingface.co/OpenGVLab/pvt_v2_b5",
}
class PvtV2Config(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`PvtV2Model`]. It is used to instantiate a Pvt V2
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a configuration similar to that of the Pvt V2 B0
[OpenGVLab/pvt_v2_b0](https://huggingface.co/OpenGVLab/pvt_v2_b0) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import PvtV2Model, PvtV2Config
>>> # Initializing a pvt_v2_b0 style configuration
>>> configuration = PvtV2Config()
>>> # Initializing a model from the OpenGVLab/pvt_v2_b0 style configuration
>>> model = PvtV2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "pvt_v2"
def __init__(
self,
image_size: Union[int, Tuple[int, int]] = 224,
num_channels: int = 3,
num_encoder_blocks: int = 4,
depths: List[int] = [2, 2, 2, 2],
sr_ratios: List[int] = [8, 4, 2, 1],
hidden_sizes: List[int] = [32, 64, 160, 256],
patch_sizes: List[int] = [7, 3, 3, 3],
strides: List[int] = [4, 2, 2, 2],
num_attention_heads: List[int] = [1, 2, 5, 8],
mlp_ratios: List[int] = [8, 8, 4, 4],
hidden_act: Union[str, Callable] = "gelu",
hidden_dropout_prob: float = 0.0,
attention_probs_dropout_prob: float = 0.0,
initializer_range: float = 0.02,
drop_path_rate: float = 0.0,
layer_norm_eps: float = 1e-6,
qkv_bias: bool = True,
linear_attention: bool = False,
out_features=None,
out_indices=None,
**kwargs,
):
super().__init__(**kwargs)
image_size = (image_size, image_size) if isinstance(image_size, int) else image_size
self.image_size = image_size
self.num_channels = num_channels
self.num_encoder_blocks = num_encoder_blocks
self.depths = depths
self.sr_ratios = sr_ratios
self.hidden_sizes = hidden_sizes
self.patch_sizes = patch_sizes
self.strides = strides
self.mlp_ratios = mlp_ratios
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.drop_path_rate = drop_path_rate
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.linear_attention = linear_attention
self.stage_names = [f"stage{idx}" for idx in range(1, len(depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
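The `out_features`/`out_indices` handling above is what makes this config usable as a backbone config. A small sketch (assuming a transformers version that ships `PvtV2Config`) of how the two arguments are aligned against `stage_names`:
```
from transformers import PvtV2Config

# By default the backbone exposes only the last stage.
config = PvtV2Config()
print(config.stage_names)                    # ['stage1', 'stage2', 'stage3', 'stage4']
print(config.out_features, config.out_indices)

# Passing stage names fills in the matching indices (positions in config.stage_names), and vice versa.
config = PvtV2Config(out_features=["stage2", "stage4"])
print(config.out_features, config.out_indices)
```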
.\models\pvt_v2\convert_pvt_v2_to_pytorch.py
"""
Convert PvtV2 checkpoints from the original library.
"""
import argparse
from pathlib import Path
import requests
import torch
from PIL import Image
from transformers import PvtImageProcessor, PvtV2Config, PvtV2ForImageClassification
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def create_rename_keys(config):
rename_keys = []
rename_keys.extend(
[
("head.weight", "classifier.weight"),
("head.bias", "classifier.bias"),
]
)
return rename_keys
def read_in_k_v(state_dict, config):
for i in range(config.num_encoder_blocks):
for j in range(config.depths[i]):
kv_weight = state_dict.pop(f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.kv.weight")
kv_bias = state_dict.pop(f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.kv.bias")
state_dict[f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.key.weight"] = kv_weight[
: config.hidden_sizes[i], :
]
state_dict[f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.key.bias"] = kv_bias[: config.hidden_sizes[i]]
state_dict[f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.value.weight"] = kv_weight[
config.hidden_sizes[i] :, :
]
state_dict[f"pvt_v2.encoder.layers.{i}.blocks.{j}.attention.value.bias"] = kv_bias[
config.hidden_sizes[i] :
]
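`read_in_k_v` splits the original fused `kv` projection into the separate `key`/`value` linear layers that the HF implementation expects. Below is a self-contained sketch of that splitting logic on a dummy tensor (the names and sizes are made up for illustration):
```
import torch

hidden_size = 4  # toy size; the real script uses config.hidden_sizes[i]

# A fused kv projection stacks the key weights on top of the value weights: shape (2 * hidden, hidden).
kv_weight = torch.arange(2 * hidden_size * hidden_size, dtype=torch.float32).reshape(2 * hidden_size, hidden_size)
kv_bias = torch.arange(2 * hidden_size, dtype=torch.float32)

key_weight, value_weight = kv_weight[:hidden_size, :], kv_weight[hidden_size:, :]
key_bias, value_bias = kv_bias[:hidden_size], kv_bias[hidden_size:]

print(key_weight.shape, value_weight.shape)  # torch.Size([4, 4]) torch.Size([4, 4])
print(key_bias.shape, value_bias.shape)      # torch.Size([4]) torch.Size([4])
```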
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_pvt_v2_checkpoint(pvt_v2_size, pvt_v2_checkpoint, pytorch_dump_folder_path, verify_imagenet_weights=False):
"""
Copy/paste/tweak model's weights to our PVT structure.
"""
if pvt_v2_size == "b0":
config_path = "OpenGVLab/pvt_v2_b0"
elif pvt_v2_size == "b1":
config_path = "OpenGVLab/pvt_v2_b1"
elif pvt_v2_size == "b2":
config_path = "OpenGVLab/pvt_v2_b2"
elif pvt_v2_size == "b2-linear":
config_path = "OpenGVLab/pvt_v2_b2_linear"
elif pvt_v2_size == "b3":
config_path = "OpenGVLab/pvt_v2_b3"
elif pvt_v2_size == "b4":
config_path = "OpenGVLab/pvt_v2_b4"
elif pvt_v2_size == "b5":
config_path = "OpenGVLab/pvt_v2_b5"
else:
raise ValueError(
f"Available model sizes: 'b0', 'b1', 'b2', 'b2-linear', 'b3', 'b4', 'b5', but "
f"'{pvt_v2_size}' was given"
)
config = PvtV2Config.from_pretrained(config_path)
state_dict = torch.load(pvt_v2_checkpoint, map_location="cpu")
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_k_v(state_dict, config)
model = PvtV2ForImageClassification(config).eval()
model.load_state_dict(state_dict)
image_processor = PvtImageProcessor(size=config.image_size)
if verify_imagenet_weights:
print("Verifying conversion of pretrained ImageNet weights...")
encoding = image_processor(images=prepare_img(), return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values)
logits = outputs.logits.detach().cpu()
if pvt_v2_size == "b0":
expected_slice_logits = torch.tensor([-1.1939, -1.4547, -0.1076])
elif pvt_v2_size == "b1":
expected_slice_logits = torch.tensor([-0.4716, -0.7335, -0.4600])
elif pvt_v2_size == "b2":
expected_slice_logits = torch.tensor([0.0795, -0.3170, 0.2247])
elif pvt_v2_size == "b2-linear":
expected_slice_logits = torch.tensor([0.0968, 0.3937, -0.4252])
elif pvt_v2_size == "b3":
expected_slice_logits = torch.tensor([-0.4595, -0.2870, 0.0940])
elif pvt_v2_size == "b4":
expected_slice_logits = torch.tensor([-0.1769, -0.1747, -0.0143])
elif pvt_v2_size == "b5":
expected_slice_logits = torch.tensor([-0.2943, -0.1008, 0.6812])
else:
raise ValueError(
f"Available model sizes: 'b0', 'b1', 'b2', 'b2-linear', 'b3', 'b4', 'b5', but "
f"'{pvt_v2_size}' was given"
)
assert torch.allclose(
logits[0, :3], expected_slice_logits, atol=1e-4
), "ImageNet weights not converted successfully."
print("ImageNet weights verified, conversion successful.")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model pytorch_model.bin to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
image_processor.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--pvt_v2_size",
default="b0",
type=str,
help="Size of the PVTv2 pretrained model you'd like to convert.",
)
parser.add_argument(
"--pvt_v2_checkpoint",
default="pvt_v2_b0.pth",
type=str,
help="Checkpoint of the PVTv2 pretrained model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--verify-imagenet-weights",
action="store_true",
default=False,
help="Verifies the correct conversion of author-published pretrained ImageNet weights.",
)
args = parser.parse_args()
convert_pvt_v2_checkpoint(
pvt_v2_size=args.pvt_v2_size,
pvt_v2_checkpoint=args.pvt_v2_checkpoint,
pytorch_dump_folder_path=args.pytorch_dump_folder_path,
verify_imagenet_weights=args.verify_imagenet_weights,
)
.\models\pvt_v2\modeling_pvt_v2.py
"""PyTorch PVTv2 模型."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput, BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_pvt_v2 import PvtV2Config
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "PvtV2Config"
_CHECKPOINT_FOR_DOC = "OpenGVLab/pvt_v2_b0"
_EXPECTED_OUTPUT_SHAPE = [1, 256, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "OpenGVLab/pvt_v2_b0"
_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_281"
PVT_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"OpenGVLab/pvt_v2_b0",
"OpenGVLab/pvt_v2_b1",
"OpenGVLab/pvt_v2_b2",
"OpenGVLab/pvt_v2_b2_linear",
"OpenGVLab/pvt_v2_b3",
"OpenGVLab/pvt_v2_b4",
"OpenGVLab/pvt_v2_b5",
]
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
每个样本都会丢弃路径(随机深度),主要用于残差块的主路径中。
Ross Wightman 的评论:这与我为 EfficientNet 等网络创建的 DropConnect 实现相同,
然而,原始名称具有误导性,因为 'Drop Connect' 是另一篇论文中的不同形式的 dropout……
参见讨论:https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ……
我选择将层和参数名称更改为 'drop path',而不是将 DropConnect 作为层名称混合使用,并使用 'survival rate' 作为参数。
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class PvtV2DropPath(nn.Module):
"""每个样本的随机深度(Drop Path,应用于残差块的主路径)。"""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class PvtV2OverlapPatchEmbeddings(nn.Module):
"""将图像转换为补丁嵌入。"""
def __init__(self, config: PvtV2Config, layer_idx: int):
super().__init__()
patch_size = config.patch_sizes[layer_idx]
patch_size = (patch_size, patch_size) if isinstance(patch_size, int) else patch_size
stride = config.strides[layer_idx]
num_channels = config.num_channels if layer_idx == 0 else config.hidden_sizes[layer_idx - 1]
hidden_size = config.hidden_sizes[layer_idx]
self.patch_size = patch_size
self.proj = nn.Conv2d(
num_channels,
hidden_size,
kernel_size=patch_size,
stride=stride,
padding=(patch_size[0] // 2, patch_size[1] // 2),
)
self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
def forward(self, pixel_values):
embeddings = self.proj(pixel_values)
_, _, height, width = embeddings.shape
embeddings = embeddings.flatten(2).transpose(1, 2)
embeddings = self.layer_norm(embeddings)
return embeddings, height, width
class PvtV2DepthWiseConv(nn.Module):
"""
Depth-wise (DW) convolution with zero padding, used to infuse positional information.
The number of groups equals the number of input channels, i.e. one filter per input channel, which reduces
parameters and compute cost; the key purpose of this layer is position encoding.
"""
def __init__(self, config: PvtV2Config, dim: int = 768):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
def forward(self, hidden_states, height, width):
batch_size, seq_len, num_channels = hidden_states.shape
hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
hidden_states = self.dwconv(hidden_states)
hidden_states = hidden_states.flatten(2).transpose(1, 2)
return hidden_states
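The depth-wise convolution operates on the token sequence by round-tripping through the 2D layout: `(batch, seq, channels)` → `(batch, channels, height, width)` → conv → back. A minimal standalone sketch of that round trip, using a plain `nn.Conv2d` with `groups=dim` like the module above wraps:
```
import torch
from torch import nn

batch_size, height, width, dim = 2, 8, 8, 16
hidden_states = torch.randn(batch_size, height * width, dim)

# One 3x3 filter per channel (groups == channels); padding 1 keeps the spatial size.
dwconv = nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, groups=dim)

x = hidden_states.transpose(1, 2).view(batch_size, dim, height, width)
x = dwconv(x)
x = x.flatten(2).transpose(1, 2)

print(x.shape)  # torch.Size([2, 64, 16]) -- same (batch, seq_len, channels) layout as the input
```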
class PvtV2SelfAttention(nn.Module):
"""高效的自注意力机制。"""
def __init__(self, config: PvtV2Config, hidden_size: int, num_attention_heads: int, spatial_reduction_ratio: int):
super().__init__()
self.linear_attention = config.linear_attention
self.pruned_heads = set()
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
f"heads ({self.num_attention_heads})"
)
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.attn_drop = nn.Dropout(config.attention_probs_dropout_prob)
self.proj = nn.Linear(self.hidden_size, self.hidden_size)
self.proj_drop = nn.Dropout(config.hidden_dropout_prob)
self.spatial_reduction_ratio = spatial_reduction_ratio
if self.linear_attention:
self.pool = nn.AdaptiveAvgPool2d(7)
self.spatial_reduction = nn.Conv2d(self.hidden_size, self.hidden_size, kernel_size=1, stride=1)
self.layer_norm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps)
self.act = nn.GELU()
elif spatial_reduction_ratio > 1:
self.spatial_reduction = nn.Conv2d(
self.hidden_size, self.hidden_size, kernel_size=spatial_reduction_ratio, stride=spatial_reduction_ratio
)
self.layer_norm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps)
def transpose_for_scores(self, hidden_states) -> torch.Tensor:
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
hidden_states = hidden_states.view(new_shape)
return hidden_states.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
height: int,
width: int,
output_attentions: bool = False,
) -> Tuple[torch.Tensor]:
batch_size, seq_len, num_channels = hidden_states.shape
query_layer = self.transpose_for_scores(self.query(hidden_states))
if self.linear_attention:
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = (
self.spatial_reduction(self.pool(hidden_states)).reshape(batch_size, num_channels, -1).permute(0, 2, 1)
)
hidden_states = self.act(self.layer_norm(hidden_states))
elif self.spatial_reduction_ratio > 1:
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
hidden_states = (
self.spatial_reduction(hidden_states).reshape(batch_size, num_channels, -1).permute(0, 2, 1)
)
hidden_states = self.layer_norm(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.attn_drop(attention_probs)
context_layer = (attention_probs @ value_layer).transpose(1, 2).reshape(batch_size, seq_len, num_channels)
context_layer = self.proj(context_layer)
context_layer = self.proj_drop(context_layer)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
)
self.query = prune_linear_layer(self.query, index)
self.key = prune_linear_layer(self.key, index)
self.value = prune_linear_layer(self.value, index)
self.proj = prune_linear_layer(self.proj, index, dim=1)
self.num_attention_heads = self.num_attention_heads - len(heads)
self.all_head_size = self.attention_head_size * self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
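The point of the spatial reduction above is that queries keep the full `height * width` length while keys and values come from a shorter sequence: roughly `(H / sr) * (W / sr)` tokens for the strided-convolution branch, or a fixed `7 * 7 = 49` tokens when `linear_attention` pools to a 7x7 grid. A small sketch of the resulting key/value lengths (sizes chosen arbitrarily for illustration):
```
import torch
from torch import nn

height, width, hidden_size, sr = 56, 56, 64, 4
seq_len = height * width  # query length stays 3136

x = torch.randn(1, hidden_size, height, width)

# Strided-conv spatial reduction (the spatial_reduction_ratio > 1 branch).
spatial_reduction = nn.Conv2d(hidden_size, hidden_size, kernel_size=sr, stride=sr)
kv_strided = spatial_reduction(x).flatten(2).transpose(1, 2)

# Linear-attention branch: adaptive pooling to a fixed 7x7 grid, regardless of input size.
kv_pooled = nn.AdaptiveAvgPool2d(7)(x).flatten(2).transpose(1, 2)

print(seq_len, kv_strided.shape[1], kv_pooled.shape[1])  # 3136 196 49
```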
class PvtV2BlockLayer(nn.Module):
def __init__(self, config: PvtV2Config, layer_idx: int, drop_path: float = 0.0):
super().__init__()
hidden_size: int = config.hidden_sizes[layer_idx]
num_attention_heads: int = config.num_attention_heads[layer_idx]
spatial_reduction_ratio: int = config.sr_ratios[layer_idx]
mlp_ratio: float = config.mlp_ratios[layer_idx]
self.layer_norm_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
self.attention = PvtV2SelfAttention(
config=config,
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
spatial_reduction_ratio=spatial_reduction_ratio,
)
self.drop_path = PvtV2DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.layer_norm_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
mlp_hidden_size = int(hidden_size * mlp_ratio)
self.mlp = PvtV2ConvFeedForwardNetwork(config=config, in_features=hidden_size, hidden_features=mlp_hidden_size)
def forward(self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False):
self_attention_outputs = self.attention(
hidden_states=self.layer_norm_1(hidden_states),
height=height,
width=width,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
attention_output = self.drop_path(attention_output)
hidden_states = attention_output + hidden_states
mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
mlp_output = self.drop_path(mlp_output)
layer_output = hidden_states + mlp_output
outputs = (layer_output,) + outputs
return outputs
class PvtV2EncoderLayer(nn.Module):
def __init__(self, config: PvtV2Config, layer_idx: int):
super().__init__()
self.patch_embedding = PvtV2OverlapPatchEmbeddings(
config=config,
layer_idx=layer_idx,
)
drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()
block_layers = []
for block_idx in range(config.depths[layer_idx]):
block_layers.append(
PvtV2BlockLayer(
config=config,
layer_idx=layer_idx,
drop_path=drop_path_decays[sum(config.depths[:layer_idx]) + block_idx],
)
)
self.blocks = nn.ModuleList(block_layers)
self.layer_norm = nn.LayerNorm(config.hidden_sizes[layer_idx], eps=config.layer_norm_eps)
def forward(self, hidden_states, output_attentions):
all_self_attentions = () if output_attentions else None
hidden_states, height, width = self.patch_embedding(hidden_states)
for block in self.blocks:
layer_outputs = block(hidden_states, height, width, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions += (layer_outputs[1],)
hidden_states = self.layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (all_self_attentions,)
return outputs, height, width
class PvtV2Encoder(nn.Module):
def __init__(self, config: PvtV2Config):
super().__init__()
self.config = config
self.gradient_checkpointing = False
self.layers = nn.ModuleList([PvtV2EncoderLayer(config, i) for i in range(config.num_encoder_blocks)])
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
batch_size = pixel_values.shape[0]
hidden_states = pixel_values
for idx, layer in enumerate(self.layers):
if self.gradient_checkpointing and self.training:
layer_output = self._gradient_checkpointing_func(layer.__call__, hidden_states, output_attentions)
else:
layer_output = layer(hidden_states, output_attentions)
outputs, height, width = layer_output
hidden_states = outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (outputs[1],)
hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class PvtV2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = PvtV2Config
base_model_prefix = "pvt_v2"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv2d):
fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
fan_out //= module.groups
module.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if module.bias is not None:
module.bias.data.zero_()
PVT_V2_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~PvtV2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
PVT_V2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`PvtImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Pvt-v2 encoder outputting raw hidden-states without any specific head on top.",
PVT_V2_START_DOCSTRING,
)
class PvtV2Model(PvtV2PreTrainedModel):
def __init__(self, config: PvtV2Config):
super().__init__(config)
self.config = config
self.encoder = PvtV2Encoder(config)
self.post_init()
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(PVT_V2_INPUTS_DOCSTRING.format("(batch_size, channels, height, width)"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_outputs = self.encoder(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
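Unlike v1, `PvtV2Encoder` reshapes the output of every stage back to `(batch, channels, height, width)`, so `last_hidden_state` (and each entry of `hidden_states`) is a 4D feature map. A minimal shape sketch with a randomly initialized model built from the default, b0-sized config (assuming a transformers version that includes pvt_v2):
```
import torch
from transformers import PvtV2Config, PvtV2Model

config = PvtV2Config()  # b0-sized defaults, no pretrained weights
model = PvtV2Model(config).eval()

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)

# One 4D feature map per stage; spatial resolution drops by the configured strides [4, 2, 2, 2].
for hs in outputs.hidden_states:
    print(tuple(hs.shape))
print(tuple(outputs.last_hidden_state.shape))  # roughly (1, 256, 7, 7) for the default config
```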
@add_start_docstrings(
"""
Pvt-v2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
PVT_V2_START_DOCSTRING,
)
class PvtV2ForImageClassification(PvtV2PreTrainedModel):
def __init__(self, config: PvtV2Config) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.pvt_v2 = PvtV2Model(config)
self.classifier = (
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
)
self.post_init()
@add_start_docstrings_to_model_forward(PVT_V2_INPUTS_DOCSTRING.format("(batch_size, channels, height, width)"))
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor],
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.pvt_v2(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
batch_size = sequence_output.shape[0]
sequence_output = sequence_output.permute(0, 2, 3, 1)
sequence_output = sequence_output.reshape(batch_size, -1, self.config.hidden_sizes[-1])
sequence_output = sequence_output.mean(dim=1)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
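Note the difference from v1: because the v2 encoder emits a 4D feature map and has no CLS token, the classification head mean-pools over all spatial positions before the linear layer. A tiny sketch of that pooling step on a dummy feature map:
```
import torch

batch_size, channels, height, width = 2, 256, 7, 7
sequence_output = torch.randn(batch_size, channels, height, width)

# (B, C, H, W) -> (B, H*W, C) -> mean over the H*W positions -> (B, C)
pooled = sequence_output.permute(0, 2, 3, 1).reshape(batch_size, -1, channels).mean(dim=1)
print(pooled.shape)  # torch.Size([2, 256])

# Equivalent, more direct form:
print(torch.allclose(pooled, sequence_output.mean(dim=(2, 3))))  # True
```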
@add_start_docstrings(
"""
PVTv2 backbone, to be used with frameworks like DETR and MaskFormer.
""",
PVT_V2_START_DOCSTRING,
)
class PvtV2Backbone(PvtV2Model, BackboneMixin):
def __init__(self, config: PvtV2Config):
super().__init__(config)
super()._init_backbone(config)
self.num_features = config.hidden_sizes
"""
Initialization: takes a configuration object, initializes the PVTv2 backbone, and sets the model's
feature sizes to the hidden sizes from the configuration.
"""
@add_start_docstrings_to_model_forward(PVT_V2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> BackboneOutput:
"""
Forward pass: takes the input pixel values and a few optional flags and returns the backbone outputs.
Args:
pixel_values (torch.FloatTensor): Input pixel values.
output_attentions (Optional[bool], optional): Whether to return attention weights. Defaults to None.
output_hidden_states (Optional[bool], optional): Whether to return hidden states. Defaults to None.
return_dict (Optional[bool], optional): Whether to return a dict-style output. Defaults to None.
Returns:
BackboneOutput: Output object containing the feature maps, hidden states, and attention weights.
Examples:
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("OpenGVLab/pvt_v2_b0")
>>> model = AutoBackbone.from_pretrained(
... "OpenGVLab/pvt_v2_b0", out_features=["stage1", "stage2", "stage3", "stage4"]
... )
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 256, 7, 7]
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
"""
Fall back to the config default for `return_dict`.
Fall back to the config default for `output_hidden_states`.
"""
outputs = self.encoder(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=True,
return_dict=return_dict,
)
"""
Run the encoder forward pass with the pixel values and the other arguments.
Hidden states are always requested so that the feature maps can be extracted.
"""
hidden_states = outputs.hidden_states
feature_maps = ()
for idx, stage in enumerate(self.stage_names):
if stage in self.out_features:
feature_maps += (hidden_states[idx],)
"""
Select the feature maps of the requested stages (by name) from the hidden states.
"""
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
"""
When not returning a dict, build and return a tuple containing the feature maps and, optionally, the hidden states.
"""
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=None,
)
"""
Return a BackboneOutput object containing the feature maps, hidden states, and attention weights.
"""