Transformers 源码解析(十八)
.\models\big_bird\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_big_bird": ["BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP", "BigBirdConfig", "BigBirdOnnxConfig"],
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_big_bird"] = ["BigBirdTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_big_bird_fast"] = ["BigBirdTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_big_bird"] = [
"BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST",
"BigBirdForCausalLM",
"BigBirdForMaskedLM",
"BigBirdForMultipleChoice",
"BigBirdForPreTraining",
"BigBirdForQuestionAnswering",
"BigBirdForSequenceClassification",
"BigBirdForTokenClassification",
"BigBirdLayer",
"BigBirdModel",
"BigBirdPreTrainedModel",
"load_tf_weights_in_big_bird",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_big_bird"] = [
"FlaxBigBirdForCausalLM",
"FlaxBigBirdForMaskedLM",
"FlaxBigBirdForMultipleChoice",
"FlaxBigBirdForPreTraining",
"FlaxBigBirdForQuestionAnswering",
"FlaxBigBirdForSequenceClassification",
"FlaxBigBirdForTokenClassification",
"FlaxBigBirdModel",
"FlaxBigBirdPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_big_bird import BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP, BigBirdConfig, BigBirdOnnxConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_big_bird import BigBirdTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_big_bird_fast import BigBirdTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_big_bird import (
BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST,
BigBirdForCausalLM,
BigBirdForMaskedLM,
BigBirdForMultipleChoice,
BigBirdForPreTraining,
BigBirdForQuestionAnswering,
BigBirdForSequenceClassification,
BigBirdForTokenClassification,
BigBirdLayer,
BigBirdModel,
BigBirdPreTrainedModel,
load_tf_weights_in_big_bird,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_big_bird import (
FlaxBigBirdForCausalLM,
FlaxBigBirdForMaskedLM,
FlaxBigBirdForMultipleChoice,
FlaxBigBirdForPreTraining,
FlaxBigBirdForQuestionAnswering,
FlaxBigBirdForSequenceClassification,
FlaxBigBirdForTokenClassification,
FlaxBigBirdModel,
FlaxBigBirdPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\biogpt\configuration_biogpt.py
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/biogpt": "https://huggingface.co/microsoft/biogpt/resolve/main/config.json",
}
class BioGptConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BioGptModel`]. It is used to instantiate an
BioGPT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the BioGPT
[microsoft/biogpt](https://huggingface.co/microsoft/biogpt) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
class BioGptConfig:
def __init__(
self,
vocab_size=42384,
hidden_size=1024,
num_hidden_layers=24,
num_attention_heads=16,
intermediate_size=4096,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=1024,
initializer_range=0.02,
layer_norm_eps=1e-12,
scale_embedding=True,
use_cache=True,
layerdrop=0.0,
activation_dropout=0.0,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.scale_embedding = scale_embedding
self.use_cache = use_cache
self.layerdrop = layerdrop
self.activation_dropout = activation_dropout
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
configuration = BioGptConfig()
model = BioGptModel(configuration)
configuration = model.config
.\models\biogpt\convert_biogpt_original_pytorch_checkpoint_to_pytorch.py
import argparse
import json
import os
import re
import shutil
import torch
from transformers import BioGptConfig, BioGptForCausalLM
from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES
from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
from transformers.utils import WEIGHTS_NAME, logging
logging.set_verbosity_warning()
json_indent = 2
class Dictionary:
"""A mapping from symbols to consecutive integers"""
def __init__(
self,
*,
bos="<s>",
pad="<pad>",
eos="</s>",
unk="<unk>",
extra_special_symbols=None,
):
self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
self.symbols = []
self.count = []
self.indices = {}
self.bos_index = self.add_symbol(bos)
self.pad_index = self.add_symbol(pad)
self.eos_index = self.add_symbol(eos)
self.unk_index = self.add_symbol(unk)
if extra_special_symbols:
for s in extra_special_symbols:
self.add_symbol(s)
self.nspecial = len(self.symbols)
def __eq__(self, other):
return self.indices == other.indices
def __getitem__(self, idx):
if idx < len(self.symbols):
return self.symbols[idx]
return self.unk_word
def __len__(self):
"""Returns the number of symbols in the dictionary"""
return len(self.symbols)
def __contains__(self, sym):
return sym in self.indices
@classmethod
def load(cls, f):
"""Loads the dictionary from a text file with the format:
```
<symbol0> <count0>
<symbol1> <count1>
...
```
"""
d = cls()
d.add_from_file(f)
return d
def add_symbol(self, word, n=1, overwrite=False):
"""Adds a word to the dictionary"""
if word in self.indices and not overwrite:
idx = self.indices[word]
self.count[idx] = self.count[idx] + n
return idx
else:
idx = len(self.symbols)
self.indices[word] = idx
self.symbols.append(word)
self.count.append(n)
return idx
def _load_meta(self, lines):
return 0
def add_from_file(self, f):
"""
Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
"""
if isinstance(f, str):
try:
with open(f, "r", encoding="utf-8") as fd:
self.add_from_file(fd)
except FileNotFoundError as fnfe:
raise fnfe
except UnicodeError:
raise Exception("Incorrect encoding detected in {}, please rebuild the dataset".format(f))
return
lines = f.readlines()
indices_start_line = self._load_meta(lines)
for line in lines[indices_start_line:]:
try:
line, field = line.rstrip().rsplit(" ", 1)
if field == "#fairseq:overwrite":
overwrite = True
line, field = line.rsplit(" ", 1)
else:
overwrite = False
count = int(field)
word = line
if word in self and not overwrite:
raise RuntimeError(
"Duplicate word found when loading Dictionary: '{}'. "
"Duplicate words can overwrite earlier ones by adding the "
"#fairseq:overwrite flag at the end of the corresponding row "
"in the dictionary file. If using the Camembert model, please "
"download an updated copy of the model file.".format(word)
)
self.add_symbol(word, n=count, overwrite=overwrite)
except ValueError:
raise ValueError("Incorrect dictionary format, expected '<token> <cnt> [flags]'")
def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_folder_path):
if not os.path.exists(biogpt_checkpoint_path):
raise ValueError(f"path {biogpt_checkpoint_path} does not exist!")
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
print(f"Writing results to {pytorch_dump_folder_path}")
checkpoint_file = os.path.join(biogpt_checkpoint_path, "checkpoint.pt")
if not os.path.isfile(checkpoint_file):
raise ValueError(f"path to the file {checkpoint_file} does not exist!")
chkpt = torch.load(checkpoint_file, map_location="cpu")
args = chkpt["cfg"]["model"]
dict_file = os.path.join(biogpt_checkpoint_path, "dict.txt")
if not os.path.isfile(dict_file):
raise ValueError(f"path to the file {dict_file} does not exist!")
src_dict = Dictionary.load(dict_file)
src_vocab = rewrite_dict_keys(src_dict.indices)
src_vocab_size = len(src_vocab)
src_vocab_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["vocab_file"])
print(f"Generating {src_vocab_file} of {src_vocab_size} records")
with open(src_vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
bpecodes_file = os.path.join(biogpt_checkpoint_path, "bpecodes")
if not os.path.isfile(bpecodes_file):
raise ValueError(f"path to the file {bpecodes_file} does not exist!")
merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
shutil.copyfile(bpecodes_file, merges_file)
biogpt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")
model_conf = {
"activation_dropout": args["activation_dropout"],
"architectures": ["BioGptForCausalLM"],
"attention_probs_dropout_prob": args["attention_dropout"],
"bos_token_id": 0,
"eos_token_id": 2,
"hidden_act": args["activation_fn"],
"hidden_dropout_prob": args["dropout"],
"hidden_size": args["decoder_embed_dim"],
"initializer_range": 0.02,
"intermediate_size": args["decoder_ffn_embed_dim"],
"layer_norm_eps": 1e-12,
"layerdrop": args["decoder_layerdrop"],
"max_position_embeddings": args["max_target_positions"],
"model_type": "biogpt",
"num_attention_heads": args["decoder_attention_heads"],
"num_hidden_layers": args["decoder_layers"],
"pad_token_id": 1,
"scale_embedding": not args["no_scale_embedding"],
"tie_word_embeddings": args["share_decoder_input_output_embed"],
"vocab_size": src_vocab_size,
}
print(f"Generating {biogpt_model_config_file}")
with open(biogpt_model_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))
biogpt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)
tokenizer_conf = {
"bos_token": "<s>",
"eos_token": "</s>",
"model_max_length": 1024,
"pad_token": "<pad>",
"special_tokens_map_file": None,
"tokenizer_class": "BioGptTokenizer",
"unk_token": "<unk>",
}
print(f"Generating {biogpt_tokenizer_config_file}")
with open(biogpt_tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))
model_state_dict = chkpt["model"]
ignore_keys = [
"decoder.version",
]
for k in ignore_keys:
model_state_dict.pop(k, None)
layer_names = list(model_state_dict.keys())
for layer_name in layer_names:
if layer_name.endswith("output_projection.weight"):
model_state_dict[layer_name.replace("decoder.", "")] = model_state_dict.pop(layer_name)
else:
model_state_dict[layer_name.replace("decoder", "biogpt")] = model_state_dict.pop(layer_name)
config = BioGptConfig.from_pretrained(pytorch_dump_folder_path)
model_new = BioGptForCausalLM(config)
model_new.load_state_dict(model_state_dict)
pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
print(f"Generating {pytorch_weights_dump_path}")
torch.save(model_state_dict, pytorch_weights_dump_path)
print("Conversion is done!")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--biogpt_checkpoint_path",
default=None,
type=str,
required=True,
help=(
"Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts,"
" bpecodes, etc."
),
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
required=True,
help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_biogpt_checkpoint_to_pytorch(args.biogpt_checkpoint_path, args.pytorch_dump_folder_path)
.\models\biogpt\modeling_biogpt.py
""" PyTorch BioGPT model."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_biogpt import BioGptConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "microsoft/biogpt"
_CONFIG_FOR_DOC = "BioGptConfig"
BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/biogpt",
"microsoft/BioGPT-Large",
]
class BioGptLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
attention_mask = attention_mask.long()
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
positions = positions[:, past_key_values_length:]
return super().forward(positions + self.offset)
class BioGptAttention(nn.Module):
"""
Placeholder for the BioGPT Attention module.
This class will define the attention mechanism for BioGPT.
Actual implementation details will be filled in later.
"""
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[BioGptConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
class BioGptDecoderLayer(nn.Module):
def __init__(self, config: BioGptConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = BioGptAttention(
embed_dim=self.embed_dim,
num_heads=config.num_attention_heads,
dropout=config.attention_probs_dropout_prob,
is_decoder=True,
)
self.dropout = config.hidden_dropout_prob
self.activation_fn = ACT2FN[config.hidden_act]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class BioGptPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = BioGptConfig
base_model_prefix = "biogpt"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
BIOGPT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~BioGptConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BIOGPT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare BioGPT Model transformer outputting raw hidden-states without any specific head on top.",
BIOGPT_START_DOCSTRING,
)
class BioGptModel(BioGptPreTrainedModel):
def __init__(self, config: BioGptConfig):
super().__init__(config)
self.config = config
self.layerdrop = config.layerdrop
self.dropout = config.hidden_dropout_prob
self.embed_dim = config.hidden_size
self.padding_idx = config.pad_token_id
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
self.embed_tokens = nn.Embedding(config.vocab_size, self.embed_dim, self.padding_idx)
self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)
self.layers = nn.ModuleList([BioGptDecoderLayer(config) for _ in range(config.num_hidden_layers)])
self.layer_norm = nn.LayerNorm(self.embed_dim)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING
)
class BioGptForCausalLM(BioGptPreTrainedModel):
_tied_weights_keys = ["output_projection.weight"]
def __init__(self, config):
super().__init__(config)
self.biogpt = BioGptModel(config)
self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_output_embeddings(self):
return self.output_projection
def set_output_embeddings(self, new_embeddings):
self.output_projection = new_embeddings
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.biogpt(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.output_projection(sequence_output)
lm_loss = None
if labels is not None:
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
labels = labels[:, 1:].contiguous()
loss_fct = CrossEntropyLoss()
lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[1:]
return ((lm_loss,) + output) if lm_loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=lm_loss,
logits=prediction_scores,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
@add_start_docstrings(
"""
BioGPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
BIOGPT_START_DOCSTRING,
)
class BioGptForTokenClassification(BioGptPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.biogpt = BioGptModel(config)
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
else:
classifier_dropout = config.hidden_dropout_prob
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pass
这段代码定义了一个 `BioGptForTokenClassification` 类,它是在 `BioGptPreTrainedModel` 基础上构建的,用于处理标记分类任务,例如命名实体识别(NER)。类中的 `forward` 方法尚未具体实现前向传播逻辑,但通过装饰器和注释详细描述了输入参数和输出,以及一些样例和模型配置的文档。
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.biogpt(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)
active_labels = torch.where(
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings(
"""
The BioGpt Model transformer with a sequence classification head on top (linear layer).
[`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-2) do.
Since it does classification on the last token, it is required to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
BIOGPT_START_DOCSTRING,
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
def __init__(self, config: BioGptConfig):
super().__init__(config)
self.num_labels = config.num_labels
self.biogpt = BioGptModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
self.post_init()
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def get_input_embeddings(self):
return self.biogpt.embed_tokens
def set_input_embeddings(self, value):
self.biogpt.embed_tokens = value
.\models\biogpt\tokenization_biogpt.py
"""Tokenization classes for BioGPT."""
import json
import os
from typing import List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/biogpt": "https://huggingface.co/microsoft/biogpt/resolve/main/vocab.json",
},
"merges_file": {"microsoft/biogpt": "https://huggingface.co/microsoft/biogpt/resolve/main/merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/biogpt": 1024,
}
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class BioGptTokenizer(PreTrainedTokenizer):
"""
Construct an FAIRSEQ Transformer tokenizer. Moses tokenization followed by Byte-Pair Encoding.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
def __init__(
self,
vocab_file,
merges_file,
unk_token="<unk>",
eos_token="</s>",
pad_token="<pad>",
**kwargs
):
super().__init__(
unk_token=unk_token,
eos_token=eos_token,
pad_token=pad_token,
**kwargs
)
self.vocab_file = vocab_file
self.merges_file = merges_file
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens.
"""
if token_ids_1 is None:
return token_ids_0
else:
return token_ids_0 + token_ids_1
def get_vocab(self):
with open(self.vocab_file, "r", encoding="utf-8") as reader:
vocab = json.load(reader)
return vocab
def tokenize(self, text):
return text.split()
def convert_tokens_to_ids(self, tokens):
vocab = self.get_vocab()
return [vocab[token] for token in tokens]
def convert_ids_to_tokens(self, ids):
vocab = self.get_vocab()
return [list(vocab.keys())[list(vocab.values()).index(id)] for id in ids]
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
"""
# 确定词汇文件名的键名
vocab_files_names = VOCAB_FILES_NAMES
# 预训练模型的词汇文件映射表
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# 预训练位置嵌入的最大模型输入大小
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 模型输入名称列表
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
pad_token="<pad>",
**kwargs,
):
try:
import sacremoses # 尝试导入 sacremoses 库
except ImportError:
# 如果导入失败,抛出 ImportError 异常,提醒需要安装 sacremoses 库
raise ImportError(
"You need to install sacremoses to use BioGptTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.lang = "en"
self.sm = sacremoses # 将 sacremoses 赋值给 self.sm
# 缓存 sm.MosesTokenizer 实例
self.cache_moses_tokenizer = {}
self.cache_moses_detokenizer = {}
""" Initialisation"""
# 用 utf-8 编码打开 vocab_file 文件,并将其内容加载为 JSON 格式,赋值给 self.encoder
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
# 创建 self.decoder 字典,将 self.encoder 的键值对反转
self.decoder = {v: k for k, v in self.encoder.items()}
# 用 utf-8 编码打开 merges_file 文件,读取内容并按行分割,去除最后一个空行,赋值给 merges 列表
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
# 将 merges 列表中的每个元素按空格分割成元组,构成 merges 列表
merges = [tuple(merge.split()[:2]) for merge in merges]
# 创建 self.bpe_ranks 字典,将 merges 列表中的元素与其在列表中的索引号配对
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
# 调用父类的初始化方法,并传入指定参数
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
**kwargs,
)
@property
def vocab_size(self):
"""Returns vocab size"""
# 返回 self.encoder 的长度,即词汇表的大小
return len(self.encoder)
def get_vocab(self):
# 返回包含 self.encoder 和 self.added_tokens_encoder 的字典
return dict(self.encoder, **self.added_tokens_encoder)
def moses_tokenize(self, text, lang):
# 如果 lang 不在 self.cache_moses_tokenizer 中,则创建一个新的 sm.MosesTokenizer 实例并缓存
if lang not in self.cache_moses_tokenizer:
moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
self.cache_moses_tokenizer[lang] = moses_tokenizer
# 使用缓存的 moses_tokenizer 对象对文本进行分词处理并返回结果
return self.cache_moses_tokenizer[lang].tokenize(
text, aggressive_dash_splits=True, return_str=False, escape=True
)
def moses_detokenize(self, tokens, lang):
# 如果 lang 不在 self.cache_moses_detokenizer 中,则创建一个新的 sm.MosesDetokenizer 实例并缓存
if lang not in self.cache_moses_detokenizer:
moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
self.cache_moses_detokenizer[lang] = moses_detokenizer
# 使用缓存的 moses_detokenizer 对象对 tokens 进行反向处理并返回结果
return self.cache_moses_detokenizer[lang].detokenize(tokens)
def bpe(self, token):
# 将输入的token进行BPE处理,生成新的词形
word = tuple(token[:-1]) + (token[-1] + "</w>",)
# 如果token已经被处理过,直接返回缓存中的结果
if token in self.cache:
return self.cache[token]
# 获取token中的所有字符对
pairs = get_pairs(word)
# 如果没有字符对,直接返回token后加上结束符的形式
if not pairs:
return token + "</w>"
# 循环处理字符对,直到无法继续合并
while True:
# 找到优先级最低的字符对
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
# 如果这个字符对不在BPE词频表中,停止合并
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
# 合并找到的字符对
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
# 如果只剩一个词元,停止合并
if len(word) == 1:
break
else:
pairs = get_pairs(word)
# 将词元列表转换为字符串形式返回
word = " ".join(word)
# 处理特殊情况下的结束符
if word == "\n </w>":
word = "\n</w>"
# 将处理后的结果缓存起来
self.cache[token] = word
return word
def _tokenize(self, text, bypass_tokenizer=False):
"""Returns a tokenized string."""
# 根据bypass_tokenizer的值选择是否使用分词器进行处理文本
if bypass_tokenizer:
text = text.split()
else:
text = self.moses_tokenize(text, self.lang)
split_tokens = []
# 对文本中的每个token进行BPE处理并拆分结果
for token in text:
if token:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# 根据词汇表将token转换为对应的ID,未知token返回未知token的ID
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
# 根据词汇表将ID转换为对应的token,未知ID返回未知token
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
# 移除BPE处理过程中的空格和结束符,将tokens拼接成文本
tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
tokens = "".join(tokens).split()
# 使用Moses库的反分词函数,将tokens还原为文本
text = self.moses_detokenize(tokens, self.lang)
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
):
# 构建输入序列,添加特殊token以及可能的第二个序列的token
def build_model_inputs(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BioGPT sequence has the following format:
- single sequence: `</s> X `
- pair of sequences: `</s> A </s> B `
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary
"""
# If only one sequence provided, return with a single separator token added at the beginning
if token_ids_1 is None:
return [self.sep_token_id] + token_ids_0
# If two sequences provided, construct the input with a separator token between and at the ends of each sequence
sep = [self.sep_token_id]
return sep + token_ids_0 + sep + token_ids_1
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
# If tokens already have special tokens, delegate to superclass method
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# For sequences without special tokens, return a mask with 1s for special tokens and 0s for sequence tokens
# no bos used in fairseq
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
return [1] + ([0] * len(token_ids_0))
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
Transformer sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary
"""
sep = [self.sep_token_id]
# 如果没有传入第二个序列的 token_ids_1,则只返回第一个部分的 mask(全为 0)
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0]
# 返回一个由 token_ids_0 和 token_ids_1 组成的 mask,其中 token_ids_0 部分全为 0,token_ids_1 部分全为 1
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and merges files to the specified directory.
Args:
save_directory (str):
Directory path where the vocabulary files will be saved.
filename_prefix (str, optional):
Prefix to be added to the filenames of vocabulary and merges files.
Returns:
Tuple[str]: Tuple containing the paths to the saved vocabulary and merges files.
"""
# 如果保存目录不存在,则记录错误并返回
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# 构造保存的词汇表文件名和合并文件名的路径
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
# 将词汇表写入到 JSON 格式的文件中
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
# 将 BPE token 和其索引写入到合并文件中
with open(merge_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
# 返回保存的词汇表文件和合并文件的路径
return vocab_file, merge_file
def __getstate__(self):
"""
Get the state of the XLMTokenizer object for pickling.
"""
# 复制对象的状态字典,并设置 'sm' 为 None,以便序列化时忽略 'sm'
state = self.__dict__.copy()
state["sm"] = None
return state
def __setstate__(self, d):
"""
Set the state of the XLMTokenizer object from a dictionary.
"""
# 从字典中恢复对象的状态
self.__dict__ = d
# 检查是否安装了 sacremoses 库,如果没有则抛出 ImportError
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
# 将 sacremoses 模块引入 self.sm
self.sm = sacremoses
.\models\biogpt\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {
"configuration_biogpt": ["BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BioGptConfig"],
"tokenization_biogpt": ["BioGptTokenizer"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_biogpt"] = [
"BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"BioGptForCausalLM",
"BioGptForTokenClassification",
"BioGptForSequenceClassification",
"BioGptModel",
"BioGptPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_biogpt import BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, BioGptConfig
from .tokenization_biogpt import BioGptTokenizer
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_biogpt import (
BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
BioGptForCausalLM,
BioGptForSequenceClassification,
BioGptForTokenClassification,
BioGptModel,
BioGptPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\bit\configuration_bit.py
""" BiT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
BIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/bit-50": "https://huggingface.co/google/bit-50/resolve/main/config.json",
}
class BitConfig(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BitModel`]. It is used to instantiate an BiT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the BiT
[google/bit-50](https://huggingface.co/google/bit-50) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "bit"
layer_types = ["preactivation", "bottleneck"]
supported_padding = ["SAME", "VALID"]
def __init__(
self,
num_channels=3,
embedding_size=64,
hidden_sizes=[256, 512, 1024, 2048],
depths=[3, 4, 6, 3],
layer_type="preactivation",
hidden_act="relu",
global_padding=None,
num_groups=32,
drop_path_rate=0.0,
embedding_dynamic_padding=False,
output_stride=32,
width_factor=1,
out_features=None,
out_indices=None,
**kwargs,
):
super().__init__(**kwargs)
if layer_type not in self.layer_types:
raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}")
if global_padding is not None:
if global_padding.upper() in self.supported_padding:
global_padding = global_padding.upper()
else:
raise ValueError(f"Padding strategy {global_padding} not supported")
self.num_channels = num_channels
self.embedding_size = embedding_size
self.hidden_sizes = hidden_sizes
self.depths = depths
self.layer_type = layer_type
self.hidden_act = hidden_act
self.global_padding = global_padding
self.num_groups = num_groups
self.drop_path_rate = drop_path_rate
self.embedding_dynamic_padding = embedding_dynamic_padding
self.output_stride = output_stride
self.width_factor = width_factor
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
.\models\bit\convert_bit_to_pytorch.py
"""Convert BiT checkpoints from the timm library."""
import argparse
import json
from pathlib import Path
import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from timm import create_model
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from transformers import BitConfig, BitForImageClassification, BitImageProcessor
from transformers.image_utils import PILImageResampling
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_config(model_name):
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
conv_layer = "std_conv" if "bit" in model_name else False
config = BitConfig(
conv_layer=conv_layer,
num_labels=1000,
id2label=id2label,
label2id=label2id,
)
return config
def rename_key(name):
if "stem.conv" in name:
name = name.replace("stem.conv", "bit.embedder.convolution")
if "blocks" in name:
name = name.replace("blocks", "layers")
if "head.fc" in name:
name = name.replace("head.fc", "classifier.1")
if name.startswith("norm"):
name = "bit." + name
if "bit" not in name and "classifier" not in name:
name = "bit.encoder." + name
return name
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our BiT structure.
"""
config = get_config(model_name)
timm_model = create_model(model_name, pretrained=True)
timm_model.eval()
state_dict = timm_model.state_dict()
for key in state_dict.copy().keys():
val = state_dict.pop(key)
state_dict[rename_key(key)] = val.squeeze() if "head" in key else val
model = BitForImageClassification(config)
model.eval()
model.load_state_dict(state_dict)
transform = create_transform(**resolve_data_config({}, model=timm_model))
timm_transforms = transform.transforms
pillow_resamplings = {
"bilinear": PILImageResampling.BILINEAR,
"bicubic": PILImageResampling.BICUBIC,
"nearest": PILImageResampling.NEAREST,
}
processor = BitImageProcessor(
do_resize=True,
size={"shortest_edge": timm_transforms[0].size},
resample=pillow_resamplings[timm_transforms[0].interpolation.value],
do_center_crop=True,
crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]},
do_normalize=True,
image_mean=timm_transforms[-1].mean.tolist(),
image_std=timm_transforms[-1].std.tolist(),
)
image = prepare_img()
timm_pixel_values = transform(image).unsqueeze(0)
pixel_values = processor(image, return_tensors="pt").pixel_values
assert torch.allclose(timm_pixel_values, pixel_values)
with torch.no_grad():
outputs = model(pixel_values)
logits = outputs.logits
print("Logits:", logits[0, :3])
print("Predicted class:", model.config.id2label[logits.argmax(-1).item()])
timm_logits = timm_model(pixel_values)
assert timm_logits.shape == outputs.logits.shape
assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)
print("Looks ok!")
if pytorch_dump_folder_path is not None:
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
print(f"Saving model {model_name} and processor to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
print(f"Pushing model {model_name} and processor to the hub")
model.push_to_hub(f"ybelkada/{model_name}")
processor.push_to_hub(f"ybelkada/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default="resnetv2_50x1_bitm",
type=str,
help="Name of the BiT timm model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model to the hub.",
)
args = parser.parse_args()
convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
.\models\bit\image_processing_bit.py
from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
convert_to_rgb,
get_resize_output_image_size,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
logger = logging.get_logger(__name__)
if is_vision_available():
import PIL
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
`preprocess` method.
crop_size (`Dict[str, int]` *optional*, defaults to 224):
Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
do_normalize:
Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
"""
# 模型输入的名称,这里指定了一个名称为 "pixel_values" 的输入
model_input_names = ["pixel_values"]
# 初始化函数,设置各种预处理参数的默认值
def __init__(
self,
do_resize: bool = True, # 是否进行调整大小
size: Dict[str, int] = None, # 图像大小字典
resample: PILImageResampling = PILImageResampling.BICUBIC, # 重采样方法
do_center_crop: bool = True, # 是否进行中心裁剪
crop_size: Dict[str, int] = None, # 裁剪大小字典
do_rescale: bool = True, # 是否进行重新缩放
rescale_factor: Union[int, float] = 1 / 255, # 重新缩放因子
do_normalize: bool = True, # 是否进行标准化
image_mean: Optional[Union[float, List[float]]] = None, # 图像均值
image_std: Optional[Union[float, List[float]]] = None, # 图像标准差
do_convert_rgb: bool = True, # 是否进行 RGB 转换
**kwargs,
) -> None:
# 调用父类初始化函数
super().__init__(**kwargs)
# 如果未指定图像大小,则设置默认值
size = size if size is not None else {"shortest_edge": 224}
# 获取图像大小字典
size = get_size_dict(size, default_to_square=False)
# 如果未指定裁剪大小,则设置默认值
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
# 获取裁剪大小字典
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
# 设置各参数的值
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.do_convert_rgb = do_convert_rgb
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"do_convert_rgb",
"return_tensors",
"data_format",
"input_data_format",
]
# 从 transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize 方法复制而来
def resize(
self,
image: np.ndarray, # 输入的图像数组
size: Dict[str, int], # 图像大小的字典
resample: PILImageResampling = PILImageResampling.BICUBIC, # 重采样方法
data_format: Optional[Union[str, ChannelDimension]] = None, # 数据格式
input_data_format: Optional[Union[str, ChannelDimension]] = None, # 输入数据格式
**kwargs,
) -> np.ndarray:
"""
Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
resized to keep the input aspect ratio.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# 默认使用正方形调整大小
default_to_square = True
# 如果 `size` 字典中包含 'shortest_edge' 键
if "shortest_edge" in size:
# 将 `size` 设为 `size["shortest_edge"]`,即最短边的长度
size = size["shortest_edge"]
# 关闭默认正方形调整大小选项
default_to_square = False
# 如果 `size` 字典中包含 'height' 和 'width' 键
elif "height" in size and "width" in size:
# 将 `size` 设为包含高度和宽度的元组
size = (size["height"], size["width"])
else:
# 如果 `size` 字典中既没有 'shortest_edge' 也没有同时包含 'height' 和 'width' 键,则抛出异常
raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
# 获取调整大小后的输出图像尺寸
output_size = get_resize_output_image_size(
image,
size=size,
default_to_square=default_to_square,
input_data_format=input_data_format,
)
# 调用 resize 函数进行图像调整大小操作,并返回调整大小后的图像
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: int = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
.\models\bit\modeling_bit.py
""" PyTorch BiT model. Also supports backbone for ViT hybrid."""
import collections
import math
from typing import Optional, Tuple
import numpy as np
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BackboneOutput,
BaseModelOutputWithNoAttention,
BaseModelOutputWithPoolingAndNoAttention,
ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_bit import BitConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "BitConfig"
_CHECKPOINT_FOR_DOC = "google/bit-50"
_EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "google/bit-50"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"
BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/bit-50",
]
def get_padding_value(padding=None, kernel_size=7, stride=1, dilation=1) -> Tuple[Tuple, bool]:
r"""
Utility function to get the tuple padding value given the kernel_size and padding.
Args:
padding (Union[`str`, `int`], *optional*):
Padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from
PyTorch is used.
kernel_size (`int`, *optional*, defaults to 7):
Kernel size of the convolution layers.
stride (`int`, *optional*, defaults to 1):
Stride value of the convolution layers.
dilation (`int`, *optional*, defaults to 1):
Dilation value of the convolution layers.
"""
dynamic = False
if padding is None:
padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
return padding, dynamic
if isinstance(padding, str):
padding = padding.lower()
if padding == "same":
if stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0:
padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
else:
padding = 0
dynamic = True
elif padding == "valid":
padding = 0
else:
padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
return padding, dynamic
class WeightStandardizedConv2d(nn.Conv2d):
"""Conv2d with Weight Standardization. Includes TensorFlow compatible SAME padding. Used for ViT Hybrid model.
Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight
Standardization](https://arxiv.org/abs/1903.10520v2)
"""
def __init__(
self,
in_channel,
out_channels,
kernel_size,
stride=1,
padding="SAME",
dilation=1,
groups=1,
bias=False,
eps=1e-6,
):
padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation)
super().__init__(
in_channel,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
if is_dynamic:
self.pad = DynamicPad2d(kernel_size, stride, dilation)
else:
self.pad = None
self.eps = eps
def forward(self, hidden_state):
if self.pad is not None:
hidden_state = self.pad(hidden_state)
weight = nn.functional.batch_norm(
self.weight.reshape(1, self.out_channels, -1), None, None, training=True, momentum=0.0, eps=self.eps
).reshape_as(self.weight)
hidden_state = nn.functional.conv2d(
hidden_state, weight, self.bias, self.stride, self.padding, self.dilation, self.groups
)
return hidden_state
class BitGroupNormActivation(nn.GroupNorm):
r"""
A module that combines group normalization with an activation function.
"""
def __init__(self, config, num_channels, eps=1e-5, affine=True, apply_activation=True):
super(BitGroupNormActivation, self).__init__(config.num_groups, num_channels, eps=eps, affine=affine)
if apply_activation:
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = nn.Identity()
def forward(self, hidden_state):
hidden_state = nn.functional.group_norm(hidden_state, self.num_groups, self.weight, self.bias, self.eps)
hidden_state = self.activation(hidden_state)
return hidden_state
class DynamicPad2d(nn.Module):
r"""
A module that wraps dynamic padding of any input, given the parameters of the convolutional layer and the input
hidden states.
"""
def __init__(self, kernel_size, stride, dilation, value=0):
super().__init__()
if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
if isinstance(stride, int):
stride = (stride, stride)
if isinstance(dilation, int):
dilation = (dilation, dilation)
self.kernel_size = kernel_size
self.stride = stride
self.dilation = dilation
self.value = value
def compute_padding(x, kernel_size, stride, dilation):
return max((math.ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0)
self.compute_padding = compute_padding
def __call__(self, input):
input_height, input_width = input.size()[-2:]
padding_height = self.compute_padding(input_height, self.kernel_size[0], self.stride[0], self.dilation[0])
padding_width = self.compute_padding(input_width, self.kernel_size[1], self.stride[1], self.dilation[1])
if padding_height > 0 or padding_width > 0:
input = nn.functional.pad(
input,
[
padding_width // 2,
padding_width - padding_width // 2,
padding_height // 2,
padding_height - padding_height // 2,
],
value=self.value,
)
return input
class BitMaxPool2d(nn.MaxPool2d):
"""Tensorflow like 'SAME' wrapper for 2D max pooling"""
def __init__(
self,
kernel_size: int,
stride=None,
dilation=1,
ceil_mode=False,
padding=(0, 0),
padding_value=0,
use_dynamic_padding=True,
):
kernel_size = kernel_size if isinstance(kernel_size, collections.abc.Iterable) else (kernel_size, kernel_size)
stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride)
dilation = dilation if isinstance(dilation, collections.abc.Iterable) else (dilation, dilation)
super().__init__(kernel_size, stride, padding, dilation, ceil_mode)
if use_dynamic_padding:
self.pad = DynamicPad2d(kernel_size, stride, dilation, padding_value)
else:
self.pad = nn.Identity()
def forward(self, hidden_states):
hidden_states = self.pad(hidden_states)
return nn.functional.max_pool2d(
hidden_states, self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode
)
class BitEmbeddings(nn.Module):
"""
BiT Embeddings (stem) composed of a single aggressive convolution.
"""
def __init__(self, config: BitConfig):
super().__init__()
self.convolution = WeightStandardizedConv2d(
config.num_channels,
config.embedding_size,
kernel_size=7,
stride=2,
eps=1e-8,
padding=config.global_padding,
)
self.pooler = BitMaxPool2d(kernel_size=3, stride=2, use_dynamic_padding=config.embedding_dynamic_padding)
if config.global_padding is not None and config.global_padding.upper() == "SAME":
self.pad = nn.Identity()
else:
self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0)
if not config.layer_type == "preactivation":
self.norm = BitGroupNormActivation(config, num_channels=config.embedding_size)
else:
self.norm = nn.Identity()
self.num_channels = config.num_channels
def forward(self, pixel_values: Tensor) -> Tensor:
num_channels = pixel_values.shape[1]
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embedding = self.convolution(pixel_values)
embedding = self.pad(embedding)
embedding = self.norm(embedding)
embedding = self.pooler(embedding)
return embedding
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class BitDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
def make_div(value, divisor=8):
min_value = divisor
new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
if new_value < 0.9 * value:
new_value += divisor
return new_value
class BitPreActivationBottleneckLayer(nn.Module):
"""Pre-activation (v2) bottleneck block.
Follows the implementation of "Identity Mappings in Deep Residual Networks":
https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua
Except it puts the stride on 3x3 conv when available.
"""
def __init__(
self,
config,
in_channels,
out_channels=None,
bottle_ratio=0.25,
stride=1,
dilation=1,
first_dilation=None,
groups=1,
drop_path_rate=0.0,
is_first_layer=False,
):
super().__init__()
first_dilation = first_dilation or dilation
out_channels = out_channels or in_channels
mid_channels = make_div(out_channels * bottle_ratio)
if is_first_layer:
self.downsample = BitDownsampleConv(
config,
in_channels,
out_channels,
stride=stride,
preact=True,
)
else:
self.downsample = None
self.norm1 = BitGroupNormActivation(config, in_channels)
self.conv1 = WeightStandardizedConv2d(in_channels, mid_channels, 1, eps=1e-8, padding=config.global_padding)
self.norm2 = BitGroupNormActivation(config, num_channels=mid_channels)
self.conv2 = WeightStandardizedConv2d(
mid_channels, mid_channels, 3, stride=stride, groups=groups, eps=1e-8, padding=config.global_padding
)
self.norm3 = BitGroupNormActivation(config, mid_channels)
self.conv3 = WeightStandardizedConv2d(mid_channels, out_channels, 1, eps=1e-8, padding=config.global_padding)
self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
def forward(self, hidden_states):
hidden_states_preact = self.norm1(hidden_states)
shortcut = hidden_states
if self.downsample is not None:
shortcut = self.downsample(hidden_states_preact)
hidden_states = self.conv1(hidden_states_preact)
hidden_states = self.conv2(self.norm2(hidden_states))
hidden_states = self.conv3(self.norm3(hidden_states))
hidden_states = self.drop_path(hidden_states)
return hidden_states + shortcut
class BitBottleneckLayer(nn.Module):
"""Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT Hybrid."""
def __init__(
self,
config,
in_channels,
out_channels=None,
bottle_ratio=0.25,
stride=1,
dilation=1,
first_dilation=None,
groups=1,
drop_path_rate=0.0,
is_first_layer=False,
):
super().__init__()
first_dilation = first_dilation or dilation
out_channels = out_channels or in_channels
mid_chs = make_div(out_channels * bottle_ratio)
if is_first_layer:
self.downsample = BitDownsampleConv(
config,
in_channels,
out_channels,
stride=stride,
preact=False,
)
else:
self.downsample = None
self.conv1 = WeightStandardizedConv2d(in_channels, mid_chs, 1, eps=1e-8, padding=config.global_padding)
self.norm1 = BitGroupNormActivation(config, num_channels=mid_chs)
self.conv2 = WeightStandardizedConv2d(
mid_chs,
mid_chs,
3,
stride=stride,
dilation=first_dilation,
groups=groups,
eps=1e-8,
padding=config.global_padding,
)
self.norm2 = BitGroupNormActivation(config, num_channels=mid_chs)
self.conv3 = WeightStandardizedConv2d(mid_chs, out_channels, 1, eps=1e-8, padding=config.global_padding)
self.norm3 = BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False)
self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
self.activation = ACT2FN[config.hidden_act]
def forward(self, hidden_states):
shortcut = hidden_states
if self.downsample is not None:
shortcut = self.downsample(hidden_states)
hidden_states = self.conv1(hidden_states)
hidden_states = self.norm1(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = self.norm2(hidden_states)
hidden_states = self.conv3(hidden_states)
hidden_states = self.norm3(hidden_states)
hidden_states = self.drop_path(hidden_states)
hidden_states = self.activation(hidden_states + shortcut)
return hidden_states
class BitDownsampleConv(nn.Module):
def __init__(
self,
config,
in_channels,
out_channels,
stride=1,
preact=True,
):
super().__init__()
self.conv = WeightStandardizedConv2d(
in_channels, out_channels, 1, stride=stride, eps=1e-8, padding=config.global_padding
)
self.norm = (
nn.Identity()
if preact
else BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False)
)
def forward(self, x):
conv_output = self.conv(x)
normalized_output = self.norm(conv_output)
return normalized_output
class BitStage(nn.Module):
"""
A ResNet v2 stage composed by stacked layers.
"""
def __init__(
self,
config,
in_channels,
out_channels,
stride,
dilation,
depth,
bottle_ratio=0.25,
layer_dropout=None,
):
super().__init__()
first_dilation = 1 if dilation in (1, 2) else 2
if config.layer_type == "bottleneck":
layer_cls = BitBottleneckLayer
else:
layer_cls = BitPreActivationBottleneckLayer
prev_chs = in_channels
self.layers = nn.Sequential()
for layer_idx in range(depth):
stride, drop_path_rate, is_first_layer = self._get_updated_hyperparameters(
layer_idx, stride, layer_dropout
)
self.layers.add_module(
str(layer_idx),
layer_cls(
config,
prev_chs,
out_channels,
stride=stride,
dilation=dilation,
bottle_ratio=bottle_ratio,
first_dilation=first_dilation,
drop_path_rate=drop_path_rate,
is_first_layer=is_first_layer,
),
)
prev_chs = out_channels
first_dilation = dilation
def _get_updated_hyperparameters(self, layer_idx, stride, layer_dropout):
"""
Get the new hyper-parameters with respect to the previous ones and the index of the current layer.
"""
if layer_dropout:
drop_path_rate = layer_dropout[layer_idx]
else:
drop_path_rate = 0.0
if layer_idx != 0:
stride = 1
is_first_layer = layer_idx == 0
return stride, drop_path_rate, is_first_layer
def forward(self, input: Tensor) -> Tensor:
hidden_state = input
for _, layer in enumerate(self.layers):
hidden_state = layer(hidden_state)
return hidden_state
def __init__(self, config: BitConfig):
super().__init__()
self.stages = nn.ModuleList([])
prev_chs = config.embedding_size
current_stride = 4
dilation = 1
layer_dropouts = [
x.tolist()
for x in torch.Tensor(np.linspace(0, config.drop_path_rate, sum(config.depths))).split(config.depths)
]
for stage_idx, (current_depth, current_hidden_size, layer_dropout) in enumerate(
zip(config.depths, config.hidden_sizes, layer_dropouts)
):
out_channels, stride, dilation = self._get_updated_hyperparameters(
stage_idx, current_stride, current_hidden_size, dilation, config
)
stage = BitStage(
config,
prev_chs,
out_channels,
stride=stride,
dilation=dilation,
depth=current_depth,
layer_dropout=layer_dropout,
)
prev_chs = out_channels
current_stride *= stride
self.stages.add_module(str(stage_idx), stage)
def _get_updated_hyperparameters(self, stage_idx, current_stride, current_hidden_size, dilation, config):
out_channels = make_div(current_hidden_size * config.width_factor)
stride = 1 if stage_idx == 0 else 2
if current_stride >= config.output_stride:
dilation *= stride
stride = 1
return out_channels, stride, dilation
def forward(
self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
) -> BaseModelOutputWithNoAttention:
hidden_states = () if output_hidden_states else None
for stage_module in self.stages:
if output_hidden_states:
hidden_states = hidden_states + (hidden_state,)
hidden_state = stage_module(hidden_state)
if output_hidden_states:
hidden_states = hidden_states + (hidden_state,)
if not return_dict:
return tuple(v for v in [hidden_state, hidden_states] if v is not None)
return BaseModelOutputWithNoAttention(
last_hidden_state=hidden_state,
hidden_states=hidden_states,
)
@add_start_docstrings(
"The bare BiT model outputting raw features without any specific head on top.",
BIT_START_DOCSTRING,
)
class BitModel(BitPreTrainedModel):
"""
BiT 模型的抽象类,负责权重初始化、预训练模型的下载和加载接口。
"""
def __init__(self, config):
"""
初始化函数,设置模型结构及参数。
Args:
config (BitConfig): 模型的配置类,包含模型的所有参数。
Attributes:
embedder (BitEmbeddings): BiT 模型的嵌入层。
encoder (BitEncoder): BiT 模型的编码器。
norm (nn.Module): 规范化层,根据配置决定是分组规范化还是身份映射。
pooler (nn.Module): 自适应平均池化层,用于汇总特征。
"""
super().__init__(config)
self.config = config
self.embedder = BitEmbeddings(config)
self.encoder = BitEncoder(config)
self.norm = (
BitGroupNormActivation(config, num_channels=config.hidden_sizes[-1])
if config.layer_type == "preactivation"
else nn.Identity()
)
self.pooler = nn.AdaptiveAvgPool2d((1, 1))
self.post_init()
@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
) -> Union[ModelOutput, Tuple[Tensor]]:
"""
BiT 模型的前向传播函数。
Args:
pixel_values (Tensor): 输入的像素值张量,形状为(batch_size, num_channels, height, width)。
output_hidden_states (bool, optional): 是否返回所有层的隐藏状态。
return_dict (bool, optional): 是否返回一个 ModelOutput 而不是普通元组。
Returns:
Union[ModelOutput, Tuple[Tensor]]: 根据 return_dict 参数返回不同形式的输出。
Raises:
NotImplementedError: 如果未指定返回格式,则抛出错误。
"""
raise NotImplementedError
) -> BaseModelOutputWithPoolingAndNoAttention:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
embedding_output = self.embedder(pixel_values)
encoder_outputs = self.encoder(
embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.norm(last_hidden_state)
pooled_output = self.pooler(last_hidden_state)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
@add_start_docstrings(
"""
BiT Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
""",
BIT_START_DOCSTRING,
)
class BitForImageClassification(BitPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.bit = BitModel(config)
self.classifier = nn.Sequential(
nn.Flatten(),
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity(),
)
self.post_init()
@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> ImageClassifierOutputWithNoAttention:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return (loss,) + output if loss is not None else output
return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
@add_start_docstrings(
"""
BiT backbone, to be used with frameworks like DETR and MaskFormer.
""",
BIT_START_DOCSTRING,
)
class BitBackbone(BitPreTrainedModel, BackboneMixin):
def __init__(self, config):
super().__init__(config)
super()._init_backbone(config)
self.bit = BitModel(config)
self.num_features = [config.embedding_size] + config.hidden_sizes
self.post_init()
@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
) -> BackboneOutput:
"""
Returns:
Examples:
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("google/resnetnv2-50")
>>> model = AutoBackbone.from_pretrained("google/resnetnv2-50")
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
outputs = self.bit(pixel_values, output_hidden_states=True, return_dict=True)
hidden_states = outputs.hidden_states
feature_maps = ()
for idx, stage in enumerate(self.stage_names):
if stage in self.out_features:
feature_maps += (hidden_states[idx],)
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=None,
)
.\models\bit\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {"configuration_bit": ["BIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BitConfig", "BitOnnxConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_bit"] = [
"BIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"BitForImageClassification",
"BitModel",
"BitPreTrainedModel",
"BitBackbone",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_bit"] = ["BitImageProcessor"]
if TYPE_CHECKING:
from .configuration_bit import BIT_PRETRAINED_CONFIG_ARCHIVE_MAP, BitConfig, BitOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_bit import (
BIT_PRETRAINED_MODEL_ARCHIVE_LIST,
BitBackbone,
BitForImageClassification,
BitModel,
BitPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_bit import BitImageProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\blenderbot\configuration_blenderbot.py
"""
Blenderbot model configuration
"""
from collections import OrderedDict
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig
from ...file_utils import TensorType, is_torch_available
from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
from ...onnx.utils import compute_effective_axis_dimension
from ...utils import logging
logger = logging.get_logger(__name__)
BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/config.json",
}
class BlenderbotConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BlenderbotModel`]. It is used to instantiate an
Blenderbot model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Blenderbot
[facebook/blenderbot-3B](https://huggingface.co/facebook/blenderbot-3B) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import BlenderbotConfig, BlenderbotModel
>>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration
>>> configuration = BlenderbotConfig()
>>> # Initializing a model (with random weights) from the facebook/blenderbot-3B style configuration
>>> model = BlenderbotModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "blenderbot"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=8008,
max_position_embeddings=128,
encoder_layers=2,
encoder_ffn_dim=10240,
encoder_attention_heads=32,
decoder_layers=24,
decoder_ffn_dim=10240,
decoder_attention_heads=32,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
use_cache=True,
is_encoder_decoder=True,
activation_function="gelu",
d_model=2560,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
decoder_start_token_id=1,
scale_embedding=False,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
encoder_no_repeat_ngram_size=3,
forced_eos_token_id=2,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
class BlenderbotOnnxConfig(OnnxSeq2SeqConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["default", "seq2seq-lm"]:
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
if self.use_past:
common_inputs["decoder_input_ids"] = {0: "batch"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
else:
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
elif self.task == "causal-lm":
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
if self.use_past:
_, num_decoder_layers = self.num_layers
for i in range(num_decoder_layers):
common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
else:
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
]
)
return common_inputs
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["default", "seq2seq-lm"]:
common_outputs = super().outputs
else:
common_outputs = super(OnnxConfigWithPast, self).outputs
if self.use_past:
num_encoder_layers, _ = self.num_layers
for i in range(num_encoder_layers):
common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
return common_outputs
def _generate_dummy_inputs_for_default_and_seq2seq_lm(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
decoder_seq_length = seq_length if not self.use_past else 1
decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, decoder_seq_length, is_pair, framework
)
decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
common_inputs = dict(**encoder_inputs, **decoder_inputs)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, encoder_seq_length = common_inputs["input_ids"].shape
decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
encoder_shape = (
batch,
num_encoder_attention_heads,
encoder_seq_length,
self._config.hidden_size // num_encoder_attention_heads,
)
decoder_past_length = decoder_seq_length
decoder_shape = (
batch,
num_decoder_attention_heads,
decoder_past_length,
self._config.hidden_size // num_decoder_attention_heads,
)
common_inputs["decoder_attention_mask"] = torch.cat(
[common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
)
common_inputs["past_key_values"] = []
_, num_decoder_layers = self.num_layers
for _ in range(num_decoder_layers):
common_inputs["past_key_values"].append(
(
torch.zeros(decoder_shape),
torch.zeros(decoder_shape),
torch.zeros(encoder_shape),
torch.zeros(encoder_shape),
)
)
return common_inputs
) -> Mapping[str, Any]:
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen
_, num_decoder_layers = self.num_layers
num_encoder_attention_heads, _ = self.num_attention_heads
past_shape = (
batch,
num_encoder_attention_heads,
past_key_values_length,
self._config.hidden_size // num_encoder_attention_heads,
)
mask_dtype = common_inputs["attention_mask"].dtype
common_inputs["attention_mask"] = torch.cat(
[common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
common_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_decoder_layers)
]
return common_inputs
def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
batch_size = compute_effective_axis_dimension(
batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
)
token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
seq_length = compute_effective_axis_dimension(
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
)
dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
return common_inputs
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
if self.task in ["default", "seq2seq-lm"]:
common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
elif self.task == "causal-lm":
common_inputs = self._generate_dummy_inputs_for_causal_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
else:
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
return common_inputs
def _flatten_past_key_values_(self, flattened_output, name, idx, t):
if self.task in ["default", "seq2seq-lm"]:
flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
else:
flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
flattened_output, name, idx, t
)
def fill_with_past_key_values_(self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str):
if direction not in ["inputs", "outputs"]:
raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given')
name = "past_key_values" if direction == "inputs" else "present"
_, num_decoder_layers = self.num_layers
encoder_sequence = "past_encoder_sequence"
decoder_sequence = "past_decoder_sequence" if direction == "inputs" else "past_decoder_sequence + sequence"
for i in range(num_decoder_layers):
inputs_or_outputs[f"{name}.{i}.decoder.key"] = {0: "batch", 2: decoder_sequence}
inputs_or_outputs[f"{name}.{i}.decoder.value"] = {0: "batch", 2: decoder_sequence}
inputs_or_outputs[f"{name}.{i}.encoder.key"] = {0: "batch", 2: encoder_sequence}
inputs_or_outputs[f"{name}.{i}.encoder.value"] = {0: "batch", 2: encoder_sequence}
.\models\blenderbot\convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py
"""Convert Blenderbot checkpoint."""
import argparse
import torch
from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
PATTERNS = [
["attention", "attn"],
["encoder_attention", "encoder_attn"],
["q_lin", "q_proj"],
["k_lin", "k_proj"],
["v_lin", "v_proj"],
["out_lin", "out_proj"],
["norm_embeddings", "layernorm_embedding"],
["position_embeddings", "embed_positions"],
["embeddings", "embed_tokens"],
["ffn.lin", "fc"],
]
def rename_state_dict_key(k):
if k == "embeddings.weight":
return "shared.weight"
for parlai_name, hf_name in PATTERNS:
k = k.replace(parlai_name, hf_name)
if k.startswith("encoder"):
k = k.replace(".attn", ".self_attn")
k = k.replace("norm1", "self_attn_layer_norm")
k = k.replace("norm2", "final_layer_norm")
elif k.startswith("decoder"):
k = k.replace("norm1", "self_attn_layer_norm")
k = k.replace("norm2", "encoder_attn_layer_norm")
k = k.replace("norm3", "final_layer_norm")
return k
def rename_layernorm_keys(sd):
keys = [
"model.encoder.layernorm_embedding.weight",
"model.encoder.layernorm_embedding.bias",
"model.decoder.layernorm_embedding.weight",
"model.decoder.layernorm_embedding.bias",
]
for k in keys:
v = sd.pop(k)
new_k = k.replace("layernorm_embedding", "layer_norm")
assert new_k not in sd
sd[new_k] = v
IGNORE_KEYS = ["START"]
@torch.no_grad()
def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path):
"""
Copy/paste/tweak model's weights to our BERT structure.
"""
model = torch.load(checkpoint_path, map_location="cpu")
sd = model["model"]
cfg = BlenderbotConfig.from_json_file(config_json_path)
m = BlenderbotForConditionalGeneration(cfg)
valid_keys = m.model.state_dict().keys()
failures = []
mapping = {}
for k, v in sd.items():
if k in IGNORE_KEYS:
continue
new_k = rename_state_dict_key(k)
if new_k not in valid_keys:
failures.append([k, new_k])
else:
mapping[new_k] = v
if cfg.normalize_before:
rename_layernorm_keys(sd)
m.model.load_state_dict(mapping, strict=True)
m.half()
m.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--src_path", type=str, help="like blenderbot-model.bin")
parser.add_argument("--save_dir", default="hf_blenderbot", type=str, help="Where to save converted model.")
parser.add_argument(
"--hf_config_json", default="blenderbot-3b-config.json", type=str, help="Path to config to use"
)
args = parser.parse_args()
convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json)
.\models\blenderbot\modeling_blenderbot.py
""" PyTorch Blenderbot model."""
import copy
import math
import os
import warnings
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ..blenderbot_small import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel
from .configuration_blenderbot import BlenderbotConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "BlenderbotConfig"
_CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill"
BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/blenderbot-3B",
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
class BlenderbotLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
super().__init__(num_embeddings, embedding_dim)
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
"""
`input_ids_shape` is expected to be [bsz x seqlen].
"""
bsz, seq_len = input_ids_shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
)
return super().forward(positions)
class BlenderbotAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[BlenderbotConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
BLENDERBOT_ATTENTION_CLASSES = {"eager": BlenderbotAttention}
pass
class BlenderbotEncoderLayer(nn.Module):
def __init__(self, config: BlenderbotConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
layer_head_mask: torch.Tensor,
output_attentions: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class BlenderbotDecoderLayer(nn.Module):
def __init__(self, config: BlenderbotConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
"""
BLENDERBOT_GENERATION_EXAMPLE = r"""
Conversation example:
```
>>> from transformers import AutoTokenizer, BlenderbotForConditionalGeneration
>>> mname = "facebook/blenderbot-400M-distill"
>>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
>>> tokenizer = AutoTokenizer.from_pretrained(mname)
>>> UTTERANCE = "My friends are cool but they eat too many carbs."
>>> print("Human: ", UTTERANCE)
Human: My friends are cool but they eat too many carbs.
>>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
>>> reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
Bot: That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?
>>> REPLY = "I'm not sure"
>>> print("Human: ", REPLY)
Human: I'm not sure
>>> NEXT_UTTERANCE = (
... "My friends are cool but they eat too many carbs.</s> <s>That's unfortunate. "
... "Are they trying to lose weight or are they just trying to be healthier?</s> "
... "<s> I'm not sure."
... )
>>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="pt")
>>> next_reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
Bot: I see. Well, it's good that they're trying to change their eating habits.
```
"""
BLENDERBOT_INPUTS_DOCSTRING = r"""
"""
class BlenderbotEncoder(BlenderbotPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`BlenderbotEncoderLayer`].
Args:
config: BlenderbotConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
self.embed_positions = BlenderbotLearnedPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
)
self.layers = nn.ModuleList([BlenderbotEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def forward(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BlenderbotDecoderLayer`]
Args:
config: BlenderbotConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = BlenderbotLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
self.layers = nn.ModuleList([BlenderbotDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
if pretrained_model_name_or_path == "facebook/blenderbot-90M":
warnings.warn(
"The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical"
" checkpoint `facebook/small_blenderbot-90M` with"
" `BlenderbotSmallModel.from_pretrained('facebook/small_blenderbot-90M')` instead.",
FutureWarning,
)
return BlenderbotSmallModel.from_pretrained(pretrained_model_name_or_path)
return super(BlenderbotModel, cls).from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"The Blenderbot Model with a language modeling head. Can be used for summarization.",
BLENDERBOT_START_DOCSTRING
)
class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: BlenderbotConfig):
super().__init__(config)
self.model = BlenderbotModel(config)
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
self.post_init()
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
if pretrained_model_name_or_path == "facebook/blenderbot-90M":
warnings.warn(
"The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical"
" checkpoint `facebook/small_blenderbot-90M` with"
" `BlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')` instead.",
FutureWarning,
)
return BlenderbotSmallForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)
return super(BlenderbotForConditionalGeneration, cls).from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
return new_embeddings
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
self.register_buffer("final_logits_bias", new_bias)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(BLENDERBOT_GENERATION_EXAMPLE)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
A tuple containing either masked language modeling loss and model outputs or just model outputs.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if use_cache:
logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
use_cache = False
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel):
"""
这个包装器类是一个辅助类,用于在将因果语言模型与 [`EncoderDecoderModel`] 框架结合使用时正确加载预训练检查点。
"""
def __init__(self, config):
super().__init__(config)
self.decoder = BlenderbotDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = BlenderbotDecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
...
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
.\models\blenderbot\modeling_flax_blenderbot.py
""" Flax Blenderbot model."""
import math
import random
from functools import partial
from typing import Callable, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from jax.random import PRNGKey
from ...modeling_flax_outputs import (
FlaxBaseModelOutput,
FlaxBaseModelOutputWithPastAndCrossAttentions,
FlaxCausalLMOutputWithCrossAttentions,
FlaxSeq2SeqLMOutput,
FlaxSeq2SeqModelOutput,
)
from ...modeling_flax_utils import (
ACT2FN,
FlaxPreTrainedModel,
append_call_sample_docstring,
append_replace_return_docstrings,
overwrite_call_docstring,
)
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_blenderbot import BlenderbotConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "BlenderbotConfig"
_CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill"
BLENDERBOT_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
"""
Parameters:
config ([`BlenderbotConfig`]): Model configuration class with all the parameters of the model.
使用 BlenderbotConfig 类作为参数,这个类包含了模型的所有参数。
初始化时,仅加载配置文件,并不加载与模型相关的权重。
若要加载模型的权重,请参考 [`~FlaxPreTrainedModel.from_pretrained`] 方法。
"""
BLENDERBOT_INPUTS_DOCSTRING = r"""
"""
BLENDERBOT_ENCODE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BLENDERBOT_DECODE_INPUTS_DOCSTRING = r"""
"""
# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right
def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
"""
Shift input ids one token to the right.
Args:
input_ids (jnp.ndarray): Array of input token indices.
pad_token_id (int): Index of the padding token in the vocabulary.
decoder_start_token_id (int): Index of the start token for decoder input.
Returns:
jnp.ndarray: Shifted input token indices.
This function shifts the input token indices to the right by one position,
inserting the decoder start token at the beginning and handling padding tokens.
"""
shifted_input_ids = jnp.zeros_like(input_ids) # Initialize an array of zeros with the same shape as input_ids
shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1]) # Shift input_ids to the right
shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id) # Set the start token
shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids) # Handle padding tokens
return shifted_input_ids
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->Blenderbot
class FlaxBlenderbotAttention(nn.Module):
"""
Implementation of the attention mechanism in the Blenderbot model.
Attributes:
config (BlenderbotConfig): The configuration object for the Blenderbot model.
embed_dim (int): Dimensionality of the token embeddings.
num_heads (int): Number of attention heads.
dropout (float): Dropout probability.
causal (bool): Whether the attention is causal (for decoding).
bias (bool): Whether to include bias in the attention computation.
dtype (jnp.dtype): Data type of the computation (default is jnp.float32).
"""
config: BlenderbotConfig
embed_dim: int
num_heads: int
dropout: float = 0.0
causal: bool = False
bias: bool = True
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self) -> None:
# 将头维度设置为嵌入维度除以头数
self.head_dim = self.embed_dim // self.num_heads
# 检查嵌入维度是否能被头数整除,否则抛出数值错误
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {self.num_heads})."
)
# 定义一个偏函数,用于创建具有固定参数的全连接层
dense = partial(
nn.Dense,
self.embed_dim,
use_bias=self.bias,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
# 创建查询、键、值和输出的全连接层
self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
self.out_proj = dense()
# 创建一个dropout层,用于随机失活输入
self.dropout_layer = nn.Dropout(rate=self.dropout)
# 如果启用因果注意力,创建一个因果掩码
if self.causal:
self.causal_mask = make_causal_mask(
jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
)
def _split_heads(self, hidden_states):
# 将隐藏状态张量重新形状为多头注意力的形状
return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
def _merge_heads(self, hidden_states):
# 将多头注意力的张量重新形状为原始形状
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
@nn.compact
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slighly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py
"""
# 检测是否通过检查"cache"中的变量"cached_key"存在来初始化
is_initialized = self.has_variable("cache", "cached_key")
# 从缓存中初始化或创建一个变量,用于存储缓存关键字
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
# 从缓存中初始化或创建一个变量,用于存储缓存数值
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
# 从缓存中初始化或创建一个变量,用于存储缓存索引
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
# 如果已经初始化
if is_initialized:
# 获取缓存关键字的维度信息
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# 使用新的1d空间切片更新关键字、数值缓存
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
# 更新缓存关键字的值
cached_key.value = key
# 更新缓存数值的值
cached_value.value = value
# 更新缓存索引值
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# 用于缓存的decoder自注意力的因果掩膜:我们的单个查询位置应只关注已经生成和缓存的那些关键字位置,而不是剩余的零元素。
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
attention_mask = combine_masks(pad_mask, attention_mask)
return key, value, attention_mask
def __call__(
self,
hidden_states: jnp.ndarray,
key_value_states: Optional[jnp.ndarray] = None,
attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
# 从transformers.models.mbart.modeling_flax_mbart.FlaxMBartEncoderLayer复制代码,并将MBart->Blenderbot
class FlaxBlenderbotEncoderLayer(nn.Module):
# 使用BlenderbotConfig配置类
config: BlenderbotConfig
# 计算中使用的数据类型为jnp.float32
dtype: jnp.dtype = jnp.float32
# 设置函数,初始化编码器层
def setup(self) -> None:
# 设定嵌入维度为配置中的d_model值
self.embed_dim = self.config.d_model
# 使用FlaxBlenderbotAttention创建自注意力层
self.self_attn = FlaxBlenderbotAttention(
config=self.config,
embed_dim=self.embed_dim,
num_heads=self.config.encoder_attention_heads,
dropout=self.config.attention_dropout,
dtype=self.dtype,
)
# 对自注意力输出进行层归一化
self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# dropout层,以配置中的dropout率丢弃部分神经元
self.dropout_layer = nn.Dropout(rate=self.config.dropout)
# 激活函数使用配置中的激活函数类型的对应函数
self.activation_fn = ACT2FN[self.config.activation_function]
# 使用激活函数对应的dropout层,以配置中的激活dropout率丢弃部分神经元
self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
# 第一个全连接层,输出维度为配置中的encoder_ffn_dim
self.fc1 = nn.Dense(
self.config.encoder_ffn_dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
# 第二个全连接层,输出维度与嵌入维度相同
self.fc2 = nn.Dense(
self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
)
# 最终的层归一化
self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# 调用函数,执行编码器层的前向传播
def __call__(
self,
hidden_states: jnp.ndarray,
attention_mask: jnp.ndarray,
output_attentions: bool = True,
deterministic: bool = True,
) -> Tuple[jnp.ndarray]:
# 记录残差连接
residual = hidden_states
# 对输入的隐藏状态进行自注意力层的归一化
hidden_states = self.self_attn_layer_norm(hidden_states)
# 执行自注意力机制计算
hidden_states, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask)
# 使用dropout层,丢弃部分计算结果
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 残差连接
hidden_states = residual + hidden_states
# 记录残差连接
residual = hidden_states
# 最终层归一化
hidden_states = self.final_layer_norm(hidden_states)
# 使用激活函数激活第一个全连接层的输出
hidden_states = self.activation_fn(self.fc1(hidden_states))
# 使用激活dropout层,丢弃部分计算结果
hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
# 第二个全连接层的计算
hidden_states = self.fc2(hidden_states)
# 使用dropout层,丢弃部分计算结果
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 残差连接
hidden_states = residual + hidden_states
# 输出结果只包含隐藏状态
outputs = (hidden_states,)
# 如果需要输出注意力权重,则将注意力权重也加入输出
if output_attentions:
outputs += (attn_weights,)
return outputs
# 从transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayerCollection复制代码,并将Bart->Blenderbot
class FlaxBlenderbotEncoderLayerCollection(nn.Module):
# 使用BlenderbotConfig配置类
config: BlenderbotConfig
# 计算中使用的数据类型为jnp.float32
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
# 设置函数,初始化编码器层集合
def setup(self):
# 创建编码器层列表,包括多个FlaxBlenderbotEncoderLayer实例
self.layers = [
FlaxBlenderbotEncoderLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.encoder_layers)
]
# 编码器层的层丢弃率,从配置中获取
self.layerdrop = self.config.encoder_layerdrop
def __call__(
self,
hidden_states,
attention_mask,
deterministic: bool = True,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# 如果需要输出注意力权重,则初始化空元组;否则设为None
all_attentions = () if output_attentions else None
# 如果需要输出隐藏状态,则初始化空元组;否则设为None
all_hidden_states = () if output_hidden_states else None
# 遍历每个编码器层
for encoder_layer in self.layers:
# 如果需要输出隐藏状态,则将当前隐藏状态加入元组中
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 添加 LayerDrop 功能(参见 https://arxiv.org/abs/1909.11556 )
dropout_probability = random.uniform(0, 1)
# 如果非确定性计算且随机数小于层丢弃概率,则跳过当前层
if not deterministic and (dropout_probability < self.layerdrop):
layer_outputs = (None, None)
else:
# 否则,调用当前编码器层的前向传播方法
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
output_attentions,
deterministic,
)
# 更新隐藏状态为当前层的输出的第一个元素
hidden_states = layer_outputs[0]
# 如果需要输出注意力权重,则将当前层的注意力权重加入元组中
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# 如果需要输出隐藏状态,则将最终的隐藏状态加入元组中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 构建输出元组,包括最终的隐藏状态、所有隐藏状态和所有注意力权重
outputs = (hidden_states, all_hidden_states, all_attentions)
# 如果不需要返回字典,则将输出中的None值过滤掉后返回
if not return_dict:
return tuple(v for v in outputs if v is not None)
# 否则,将输出作为 FlaxBaseModelOutput 类的实例返回
return FlaxBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
# 从transformers.models.mbart.modeling_flax_mbart.FlaxMBartDecoderLayer复制代码,并将MBart->Blenderbot
class FlaxBlenderbotDecoderLayer(nn.Module):
# 定义配置为BlenderbotConfig
config: BlenderbotConfig
# 定义数据类型为jnp.float32
dtype: jnp.dtype = jnp.float32
# 初始化函数,设置各层和模块
def setup(self) -> None:
# 设置嵌入维度为配置中的d_model
self.embed_dim = self.config.d_model
# 创建BlenderbotAttention自注意力机制实例
self.self_attn = FlaxBlenderbotAttention(
config=self.config,
embed_dim=self.embed_dim,
num_heads=self.config.decoder_attention_heads,
dropout=self.config.attention_dropout,
causal=True,
dtype=self.dtype,
)
# Dropout层,使用配置中的dropout率
self.dropout_layer = nn.Dropout(rate=self.config.dropout)
# 激活函数,使用配置中指定的激活函数
self.activation_fn = ACT2FN[self.config.activation_function]
# 激活函数的Dropout层,使用配置中的activation_dropout率
self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
# Layer normalization层,用于自注意力机制
self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# 创建BlenderbotAttention编码器注意力机制实例
self.encoder_attn = FlaxBlenderbotAttention(
config=self.config,
embed_dim=self.embed_dim,
num_heads=self.config.decoder_attention_heads,
dropout=self.config.attention_dropout,
dtype=self.dtype,
)
# 编码器注意力机制的Layer normalization层
self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# 第一个全连接层,输出维度为配置中的decoder_ffn_dim
self.fc1 = nn.Dense(
self.config.decoder_ffn_dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
# 第二个全连接层,输出维度与嵌入维度相同
self.fc2 = nn.Dense(
self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
)
# 最终的Layer normalization层
self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# 调用函数,定义模型的前向计算过程
def __call__(
self,
hidden_states: jnp.ndarray,
attention_mask: jnp.ndarray,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
output_attentions: bool = True,
deterministic: bool = True,
# 继续定义参数
) -> Tuple[jnp.ndarray]:
# 将输入的隐藏状态保存为残差连接的一部分
residual = hidden_states
# 对当前隐藏状态进行 Layer Normalization
hidden_states = self.self_attn_layer_norm(hidden_states)
# 自注意力机制
# 调用 self_attn 方法进行自注意力计算,同时返回注意力权重
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache
)
# 应用 dropout 层,以防止过拟合
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 将残差连接添加到当前隐藏状态中
hidden_states = residual + hidden_states
# 交叉注意力块
cross_attn_weights = None
# 如果存在编码器的隐藏状态,则执行以下操作
if encoder_hidden_states is not None:
# 将输入的隐藏状态保存为残差连接的一部分
residual = hidden_states
# 对当前隐藏状态进行 Layer Normalization
hidden_states = self.encoder_attn_layer_norm(hidden_states)
# 调用 encoder_attn 方法进行交叉注意力计算,同时返回注意力权重
hidden_states, cross_attn_weights = self.encoder_attn(
hidden_states=hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
)
# 应用 dropout 层,以防止过拟合
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 将残差连接添加到当前隐藏状态中
hidden_states = residual + hidden_states
# 全连接层
# 将输入的隐藏状态保存为残差连接的一部分
residual = hidden_states
# 对当前隐藏状态进行 Layer Normalization
hidden_states = self.final_layer_norm(hidden_states)
# 应用激活函数到全连接层的第一层
hidden_states = self.activation_fn(self.fc1(hidden_states))
# 应用 dropout 层,以防止过拟合
hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
# 应用全连接层的第二层
hidden_states = self.fc2(hidden_states)
# 应用 dropout 层,以防止过拟合
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 将残差连接添加到当前隐藏状态中
hidden_states = residual + hidden_states
# 返回输出结果
outputs = (hidden_states,)
# 如果需要输出注意力权重,则将自注意力和交叉注意力的权重添加到输出中
if output_attentions:
outputs += (self_attn_weights, cross_attn_weights)
return outputs
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayerCollection with Bart->Blenderbot
class FlaxBlenderbotDecoderLayerCollection(nn.Module):
config: BlenderbotConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self):
# Initialize a list of Blenderbot decoder layers based on the provided configuration
self.layers = [
FlaxBlenderbotDecoderLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.decoder_layers)
]
# Set the layer dropout probability from the configuration
self.layerdrop = self.config.decoder_layerdrop
def __call__(
self,
hidden_states,
attention_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Initialize containers for storing outputs
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
# Iterate through each decoder layer
for decoder_layer in self.layers:
if output_hidden_states:
# Store hidden states for potential use in output
all_hidden_states += (hidden_states,)
# Implement LayerDrop regularization during training
# (see https://arxiv.org/abs/1909.11556 for details)
dropout_probability = random.uniform(0, 1)
if not deterministic and (dropout_probability < self.layerdrop):
# Skip computation of the layer outputs based on LayerDrop probability
layer_outputs = (None, None, None)
else:
# Compute outputs of the current decoder layer
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
output_attentions=output_attentions,
deterministic=deterministic,
)
# Update hidden states with the outputs of the current layer
hidden_states = layer_outputs[0]
if output_attentions:
# Store self-attention outputs if specified
all_self_attns += (layer_outputs[1],)
if encoder_hidden_states is not None:
# Store cross-attention outputs if specified and encoder_hidden_states is provided
all_cross_attentions += (layer_outputs[2],)
# Store hidden states from the last decoder layer if output_hidden_states is enabled
if output_hidden_states:
all_hidden_states += (hidden_states,)
# Prepare outputs based on return_dict flag
outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions]
if not return_dict:
# Return outputs as a tuple omitting None values
return tuple(v for v in outputs if v is not None)
# Return outputs as a FlaxBaseModelOutputWithPastAndCrossAttentions named tuple
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attns,
cross_attentions=all_cross_attentions,
)
class FlaxBlenderbotEncoder(nn.Module):
config: BlenderbotConfig
embed_tokens: nn.Embed
# 定义数据类型为 jnp.float32,用于计算的数据类型
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
# 执行初始化设置操作
def setup(self):
# 初始化一个丢弃层,根据配置中的丢弃率
self.dropout_layer = nn.Dropout(rate=self.config.dropout)
# 设置嵌入维度为模型配置中的 d_model
embed_dim = self.config.d_model
# 设置填充索引为配置中的 pad_token_id
self.padding_idx = self.config.pad_token_id
# 设置最大源序列位置为配置中的 max_position_embeddings
self.max_source_positions = self.config.max_position_embeddings
# 如果配置中设置了 scale_embedding,则设置嵌入缩放因子为 embed_dim 的平方根,否则为 1.0
self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0
# 初始化位置嵌入层,使用正态分布初始化方法,标准差为配置中的 init_std
self.embed_positions = nn.Embed(
self.config.max_position_embeddings,
embed_dim,
embedding_init=jax.nn.initializers.normal(self.config.init_std),
)
# 初始化编码器层集合
self.layers = FlaxBlenderbotEncoderLayerCollection(self.config, self.dtype)
# 初始化层归一化层,数据类型为 self.dtype,设置 epsilon 为 1e-05
self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# 定义对象调用方法
def __call__(
self,
input_ids,
attention_mask,
position_ids,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
):
# 获取输入张量的形状
input_shape = input_ids.shape
# 将输入张量展平为二维张量,保留最后一个维度
input_ids = input_ids.reshape(-1, input_shape[-1])
# 对输入 token 嵌入进行缩放
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
# 获取位置嵌入
embed_pos = self.embed_positions(position_ids)
# 将嵌入的 token 和位置加和得到隐藏状态
hidden_states = inputs_embeds + embed_pos
# 对隐藏状态应用丢弃层,根据 deterministic 参数确定是否确定性丢弃
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
# 对隐藏状态应用编码器层
outputs = self.layers(
hidden_states,
attention_mask,
deterministic=deterministic,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取编码器层输出的最后一个隐藏状态
last_hidden_states = outputs[0]
# 对最后一个隐藏状态应用层归一化
last_hidden_states = self.layer_norm(last_hidden_states)
# 如果需要输出隐藏状态,则更新隐藏状态列表中的最后一个元素
hidden_states = None
if output_hidden_states:
hidden_states = outputs[1]
hidden_states = hidden_states[:-1] + (last_hidden_states,)
# 如果不需要以字典形式返回结果,则将输出元组化并返回
if not return_dict:
outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
return tuple(v for v in outputs if v is not None)
# 以 FlaxBaseModelOutput 类的形式返回结果,包括最后的隐藏状态、隐藏状态和注意力权重(如果有)
return FlaxBaseModelOutput(
last_hidden_state=last_hidden_states,
hidden_states=hidden_states,
attentions=outputs.attentions,
)
class FlaxBlenderbotDecoder(nn.Module):
config: BlenderbotConfig # 定义配置对象的类型为 BlenderbotConfig
embed_tokens: nn.Embed # 定义嵌入层对象的类型为 nn.Embed
dtype: jnp.dtype = jnp.float32 # 定义计算过程中的数据类型为 jnp.float32
def setup(self):
self.dropout_layer = nn.Dropout(rate=self.config.dropout) # 初始化丢弃层对象,并设定丢弃率为配置中的 dropout
embed_dim = self.config.d_model # 从配置中获取嵌入维度
self.padding_idx = self.config.pad_token_id # 获取填充标记的索引
self.max_target_positions = self.config.max_position_embeddings # 获取目标位置的最大数
self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0 # 根据配置计算嵌入比例
self.embed_positions = nn.Embed(
self.config.max_position_embeddings,
embed_dim,
embedding_init=jax.nn.initializers.normal(self.config.init_std),
) # 初始化位置嵌入层对象,设定位置数量、嵌入维度和初始化方式
self.layers = FlaxBlenderbotDecoderLayerCollection(self.config, self.dtype) # 初始化解码器层集合对象
self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) # 初始化层归一化对象,设定数据类型和 epsilon 值
def __call__(
self,
input_ids,
attention_mask,
position_ids,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
):
input_shape = input_ids.shape # 获取输入数据形状
input_ids = input_ids.reshape(-1, input_shape[-1]) # 重塑输入数据的形状为二维矩阵
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale # 将输入数据嵌入到嵌入层中,并按比例缩放
# 嵌入位置信息
positions = self.embed_positions(position_ids)
hidden_states = inputs_embeds + positions # 将输入嵌入向量与位置嵌入相加得到隐藏状态
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) # 应用丢弃层
outputs = self.layers(
hidden_states,
attention_mask,
encoder_hidden_states,
encoder_attention_mask,
deterministic=deterministic,
init_cache=init_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
) # 将隐藏状态传递给解码器层集合对象进行解码
last_hidden_states = outputs[0] # 获取最后一个隐藏状态
last_hidden_states = self.layer_norm(last_hidden_states) # 应用层归一化到最后一个隐藏状态
# 更新 `hidden_states` 中应用 `layernorm` 后的最后一个元素
hidden_states = None
if output_hidden_states:
hidden_states = outputs[1] # 获取所有隐藏状态
hidden_states = hidden_states[:-1] + (last_hidden_states,) # 将最后一个隐藏状态添加到列表中
if not return_dict:
outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
return tuple(v for v in outputs if v is not None)
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=last_hidden_states,
hidden_states=hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
) # 返回模型输出对象,包含最后的隐藏状态、所有隐藏状态、注意力分数和交叉注意力分数
# 定义一个自定义的 Flax 模型,继承自 nn.Module
class FlaxBlenderbotModule(nn.Module):
# 声明类变量 config,类型为 BlenderbotConfig,用于配置模型
config: BlenderbotConfig
# 声明类变量 dtype,表示计算时的数据类型,默认为 jnp.float32
dtype: jnp.dtype = jnp.float32 # 计算时的数据类型
# 模型的初始化方法
def setup(self):
# 初始化一个共享的嵌入层,vocab_size 和 d_model 来自于 config,初始化方式为正态分布
self.shared = nn.Embed(
self.config.vocab_size,
self.config.d_model,
embedding_init=jax.nn.initializers.normal(self.config.init_std),
dtype=self.dtype,
)
# 初始化编码器,使用自定义的 FlaxBlenderbotEncoder 类
self.encoder = FlaxBlenderbotEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared)
# 初始化解码器,使用自定义的 FlaxBlenderbotDecoder 类
self.decoder = FlaxBlenderbotDecoder(self.config, dtype=self.dtype, embed_tokens=self.shared)
# 获取编码器模块的方法
def _get_encoder_module(self):
return self.encoder
# 获取解码器模块的方法
def _get_decoder_module(self):
return self.decoder
# 定义模型的调用方法,实现模型的前向传播
def __call__(
self,
input_ids,
attention_mask,
decoder_input_ids,
decoder_attention_mask,
position_ids,
decoder_position_ids,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
):
# 调用编码器进行前向传播
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
# 调用解码器进行前向传播,其中传入编码器的输出作为解码器的输入
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
position_ids=decoder_position_ids,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
# 如果 return_dict 为 False,则返回解码器和编码器的输出
if not return_dict:
return decoder_outputs + encoder_outputs
# 如果 return_dict 为 True,则将解码器和编码器的输出整合成 FlaxSeq2SeqModelOutput 类型的结果并返回
return FlaxSeq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
# 继承自 FlaxPreTrainedModel 的预训练模型基类
class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
# 配置类为 BlenderbotConfig
config_class = BlenderbotConfig
# 模型前缀为 "model"
base_model_prefix: str = "model"
# 模块类为 nn.Module,在子类中定义
module_class: nn.Module = None
# 模型初始化方法
def __init__(
self,
config: BlenderbotConfig,
input_shape: Tuple[int] = (1, 1),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# 使用给定的配置、数据类型和其他关键字参数初始化模块对象
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 调用父类的构造方法初始化对象
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化输入张量
input_ids = jnp.zeros(input_shape, dtype="i4")
# 确保初始化过程适用于 FlaxBlenderbotForSequenceClassificationModule
input_ids = input_ids.at[(..., -1)].set(self.config.eos_token_id)
attention_mask = jnp.ones_like(input_ids)
decoder_input_ids = input_ids
decoder_attention_mask = jnp.ones_like(input_ids)
batch_size, sequence_length = input_ids.shape
# 创建位置编码矩阵,形状与输入张量相同
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 划分随机数生成器为参数和dropout使用的两部分
params_rng, dropout_rng = jax.random.split(rng)
rngs = {"params": params_rng, "dropout": dropout_rng}
# 使用模块的初始化方法初始化模型参数
random_params = self.module.init(
rngs,
input_ids,
attention_mask,
decoder_input_ids,
decoder_attention_mask,
position_ids,
decoder_position_ids,
)["params"]
if params is not None:
# 如果提供了参数,则将随机生成的参数与已有参数进行融合
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
# 如果未提供参数,则返回随机生成的参数
return random_params
@add_start_docstrings(BLENDERBOT_ENCODE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=BlenderbotConfig)
def encode(
self,
input_ids: jnp.ndarray,
attention_mask: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
"""
Encodes input sequences into hidden states using the Blenderbot model.
Args:
input_ids (`jnp.ndarray`):
The input token IDs. Shape `(batch_size, sequence_length)`.
attention_mask (`Optional[jnp.ndarray]`, optional):
Mask to avoid performing attention on padding token indices. Shape `(batch_size, sequence_length)`.
position_ids (`Optional[jnp.ndarray]`, optional):
Indices of positions for each input token in the sequence. Shape `(batch_size, sequence_length)`.
output_attentions (`Optional[bool]`, optional):
Whether to output attentions weights. Default is `None`.
output_hidden_states (`Optional[bool]`, optional):
Whether to output hidden states of all layers. Default is `None`.
return_dict (`Optional[bool]`, optional):
Whether to return a dictionary instead of a tuple of outputs. Default is `None`.
train (`bool`, optional):
Whether the model is in training mode. Default is `False`.
params (`dict`, optional):
Optional parameters for model inference.
dropout_rng (`PRNGKey`, optional):
Dropout random number generator key for reproducibility.
Returns:
`FlaxBaseModelOutput`: A namedtuple with the following fields:
- last_hidden_state (`jnp.ndarray`):
Sequence of hidden states at the output of the last layer of the model. Shape
`(batch_size, sequence_length, hidden_size)`.
- hidden_states (`Optional[Tuple[jnp.ndarray]]`, optional):
Sequence of hidden states for all layers. Only returned when `output_hidden_states=True`.
- attentions (`Optional[Tuple[jnp.ndarray]]`, optional):
Attention weights for each layer. Only returned when `output_attentions=True`.
"""
# Combine the input arguments into a dictionary for model initialization
input_args = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"position_ids": position_ids,
"output_attentions": output_attentions,
"output_hidden_states": output_hidden_states,
"return_dict": return_dict,
"train": train,
"params": params,
"dropout_rng": dropout_rng,
}
# Initialize the model using its `init` method with specified input arguments
initialized_model = self.module.init(
jax.random.PRNGKey(0), # Initialize with a fixed PRNGKey for reproducibility
**input_args,
)
# Return the initialized cache from the model's initialization results
return unfreeze(initialized_model["cache"])
):
r"""
Returns:
Example:
```
>>> from transformers import AutoTokenizer, FlaxBlenderbotForConditionalGeneration
>>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
>>> encoder_outputs = model.encode(**inputs)
```"""
# 初始化输出注意力的设置,如果没有传入则使用模型配置中的默认设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 初始化输出隐藏状态的设置,如果没有传入则使用模型配置中的默认设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 初始化返回字典的设置,如果没有传入则使用模型配置中的默认设置
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 如果注意力掩码为None,则创建一个全1的掩码数组,形状与输入ids相同
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
# 如果位置ids为None,则根据输入ids的形状广播创建位置ids
if position_ids is None:
batch_size, sequence_length = input_ids.shape
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 处理任何需要的伪随机数生成器
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
# 定义_encoder_forward函数,用于对编码模块进行前向传播
def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
encode_module = module._get_encoder_module()
return encode_module(input_ids, attention_mask, position_ids, **kwargs)
# 调用self.module.apply进行模型的前向传播
return self.module.apply(
{"params": params or self.params}, # 参数字典,使用传入的参数或者模型自身的参数
input_ids=jnp.array(input_ids, dtype="i4"), # 输入的ids数组,转换为JAX支持的整数数组
attention_mask=jnp.array(attention_mask, dtype="i4"), # 注意力掩码数组,转换为JAX支持的整数数组
position_ids=jnp.array(position_ids, dtype="i4"), # 位置ids数组,转换为JAX支持的整数数组
output_attentions=output_attentions, # 是否输出注意力权重
output_hidden_states=output_hidden_states, # 是否输出隐藏状态
return_dict=return_dict, # 是否返回字典格式的输出
deterministic=not train, # 是否确定性计算,如果非训练状态则确定性计算
rngs=rngs, # 伪随机数生成器字典
method=_encoder_forward, # 调用的方法,这里为_encoder_forward函数
)
@add_start_docstrings(BLENDERBOT_DECODE_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=BlenderbotConfig
)
# 定义decode函数,用于解码
def decode(
self,
decoder_input_ids, # 解码器输入的ids
encoder_outputs, # 编码器的输出
encoder_attention_mask: Optional[jnp.ndarray] = None, # 编码器的注意力掩码
decoder_attention_mask: Optional[jnp.ndarray] = None, # 解码器的注意力掩码
decoder_position_ids: Optional[jnp.ndarray] = None, # 解码器的位置ids
past_key_values: dict = None, # 过去的键值对
output_attentions: Optional[bool] = None, # 是否输出注意力权重
output_hidden_states: Optional[bool] = None, # 是否输出隐藏状态
return_dict: Optional[bool] = None, # 是否返回字典格式的输出
train: bool = False, # 是否为训练模式
params: dict = None, # 参数字典
dropout_rng: PRNGKey = None, # 伪随机数生成器
@add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING)
def __call__(
self,
input_ids: jnp.ndarray,
attention_mask: Optional[jnp.ndarray] = None,
decoder_input_ids: Optional[jnp.ndarray] = None,
decoder_attention_mask: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
decoder_position_ids: Optional[jnp.ndarray] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
# 确定是否输出注意力权重,默认从模型配置获取
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 确定是否输出隐藏状态,默认从模型配置获取
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 确定是否返回字典格式,默认从模型配置获取
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 准备编码器的输入
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
if position_ids is None:
batch_size, sequence_length = input_ids.shape
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# 准备解码器的输入
if decoder_input_ids is None:
# 将输入右移一位,用于生成解码器的输入序列
decoder_input_ids = shift_tokens_right(
input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id
)
if decoder_attention_mask is None:
decoder_attention_mask = jnp.ones_like(decoder_input_ids)
if decoder_position_ids is None:
batch_size, sequence_length = decoder_input_ids.shape
decoder_position_ids = jnp.broadcast_to(
jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
)
# 如果需要处理随机数生成器(PRNG)
rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
# 应用模型的前向传播
return self.module.apply(
{"params": params or self.params},
input_ids=jnp.array(input_ids, dtype="i4"),
attention_mask=jnp.array(attention_mask, dtype="i4"),
position_ids=jnp.array(position_ids, dtype="i4"),
decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=not train, # 确定是否使用确定性推理,根据训练标志
rngs=rngs,
)
# 给 FlaxBlenderbotModel 类添加文档字符串,描述其作为不带特定头部的裸 MBart 模型变换器的输出
@add_start_docstrings(
"The bare MBart Model transformer outputting raw hidden-states without any specific head on top.",
BLENDERBOT_START_DOCSTRING,
)
class FlaxBlenderbotModel(FlaxBlenderbotPreTrainedModel):
# 引入 BlenderbotConfig 配置
config: BlenderbotConfig
# 计算中使用的数据类型,默认为 jnp.float32
dtype: jnp.dtype = jnp.float32
# 指定模块类为 FlaxBlenderbotModule
module_class = FlaxBlenderbotModule
# 向 FlaxBlenderbotModel 类附加示例调用文档字符串
append_call_sample_docstring(FlaxBlenderbotModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)
# 从 transformers.models.bart.modeling_flax_bart.FlaxBartForConditionalGenerationModule 复制代码,将 Bart 替换为 Blenderbot
class FlaxBlenderbotForConditionalGenerationModule(nn.Module):
# 引入 BlenderbotConfig 配置
config: BlenderbotConfig
# 计算中使用的数据类型,默认为 jnp.float32
dtype: jnp.dtype = jnp.float32
# 偏置初始化器,用于初始化偏置参数
bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros
def setup(self):
# 创建 Blenderbot 模块,并使用给定的配置和数据类型初始化
self.model = FlaxBlenderbotModule(config=self.config, dtype=self.dtype)
# 创建语言模型头部,使用全连接层,参数根据模型共享的词汇表大小初始化
self.lm_head = nn.Dense(
self.model.shared.num_embeddings,
use_bias=False,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
# 创建最终预测 logits 的偏置项,初始化为零向量
self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.model.shared.num_embeddings))
# 获取编码器模块的方法
def _get_encoder_module(self):
return self.model.encoder
# 获取解码器模块的方法
def _get_decoder_module(self):
return self.model.decoder
# 定义对象调用方法,接收多个输入和控制参数,用于模型的正向传播
def __call__(
self,
input_ids,
attention_mask,
decoder_input_ids,
decoder_attention_mask,
position_ids,
decoder_position_ids,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
# 后续还有更多参数...
):
# 使用模型生成输出结果,传入输入张量及相关参数
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
position_ids=position_ids,
decoder_position_ids=decoder_position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
# 从模型输出中获取隐藏状态张量
hidden_states = outputs[0]
# 如果配置了词嵌入共享,使用共享的嵌入层进行计算
if self.config.tie_word_embeddings:
shared_embedding = self.model.variables["params"]["shared"]["embedding"]
lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
else:
# 否则直接使用语言模型头部计算逻辑回归
lm_logits = self.lm_head(hidden_states)
# 添加最终逻辑偏置
lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))
# 如果不要求返回字典形式的输出,则返回元组
if not return_dict:
output = (lm_logits,) + outputs[1:]
return output
# 返回 FlaxSeq2SeqLMOutput 类型的输出,包含逻辑回归、解码器隐藏状态和注意力权重等
return FlaxSeq2SeqLMOutput(
logits=lm_logits,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
@add_start_docstrings(
"The Blenderbot Model with a language modeling head. Can be used for summarization.", BLENDERBOT_START_DOCSTRING
)
class FlaxBlenderbotForConditionalGeneration(FlaxBlenderbotPreTrainedModel):
module_class = FlaxBlenderbotForConditionalGenerationModule
dtype: jnp.dtype = jnp.float32
@add_start_docstrings(BLENDERBOT_DECODE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=BlenderbotConfig)
def decode(
self,
decoder_input_ids,
encoder_outputs,
encoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_position_ids: Optional[jnp.ndarray] = None,
past_key_values: dict = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
"""
Decode function for generation tasks.
Args:
- decoder_input_ids: Input tensor of decoder input token IDs.
- encoder_outputs: Output from the encoder.
- encoder_attention_mask: Optional tensor specifying which tokens in the encoder output should not be attended to.
- decoder_attention_mask: Optional tensor specifying which tokens in the decoder input should not be attended to.
- decoder_position_ids: Optional tensor specifying the position IDs for the decoder input tokens.
- past_key_values: Dictionary containing cached key-value states from previous decoding steps.
- output_attentions: Whether to output attention weights.
- output_hidden_states: Whether to output hidden states.
- return_dict: Whether to return a dictionary of outputs.
- train: Whether the model is in training mode.
- params: Additional parameters for decoding.
- dropout_rng: Dropout random number generator key.
Returns:
- FlaxCausalLMOutputWithCrossAttentions: Output object containing generated logits and optional cross-attention weights.
"""
# initializing the cache
batch_size, seq_length = decoder_input_ids.shape
past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
# Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
# But since the decoder uses a causal mask, those positions are masked anyways.
# Thus we can create a single static attention_mask here, which is more efficient for compilation
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if decoder_attention_mask is not None:
position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
else:
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
return {
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"encoder_attention_mask": attention_mask,
"decoder_attention_mask": extended_attention_mask,
"decoder_position_ids": position_ids,
}
def prepare_inputs_for_generation(
self,
decoder_input_ids,
max_length,
attention_mask: Optional[jax.Array] = None,
decoder_attention_mask: Optional[jax.Array] = None,
encoder_outputs=None,
**kwargs,
):
"""
Prepares inputs for generation by setting up attention masks and position IDs.
Args:
- decoder_input_ids: Tensor of input token IDs for the decoder.
- max_length: Maximum length of the generated sequence.
- attention_mask: Optional tensor specifying which tokens in the encoder output should not be attended to.
- decoder_attention_mask: Optional tensor specifying which tokens in the decoder input should not be attended to.
- encoder_outputs: Output from the encoder.
- **kwargs: Additional keyword arguments.
Returns:
- dict: Dictionary containing prepared inputs for the generation process.
"""
# initializing the cache
batch_size, seq_length = decoder_input_ids.shape
past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
# Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
# But since the decoder uses a causal mask, those positions are masked anyways.
# Thus we can create a single static attention_mask here, which is more efficient for compilation
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if decoder_attention_mask is not None:
position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
else:
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
return {
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"encoder_attention_mask": attention_mask,
"decoder_attention_mask": extended_attention_mask,
"decoder_position_ids": position_ids,
}
def update_inputs_for_generation(self, model_outputs, model_kwargs):
"""
Updates inputs for the generation process by adjusting position IDs and past key values.
Args:
- model_outputs: Output from the model.
- model_kwargs: Keyword arguments for the model.
Returns:
- dict: Updated model keyword arguments for generation.
"""
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
return model_kwargs
FLAX_BLENDERBOT_CONDITIONAL_GENERATION_DOCSTRING = r"""
Returns:
Conversation example::
```
>>> from transformers import AutoTokenizer, FlaxBlenderbotForConditionalGeneration
导入所需的库:从transformers库中导入AutoTokenizer和FlaxBlenderbotForConditionalGeneration类
>>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")
使用预训练模型"facebook/blenderbot-400M-distill"初始化生成模型对象model
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
使用预训练的tokenizer模型"facebook/blenderbot-400M-distill"初始化分词器对象tokenizer
>>> UTTERANCE = "My friends are cool but they eat too many carbs."
定义待生成回复的文本UTTERANCE
>>> inputs = tokenizer([UTTERANCE], max_length=1024, return_tensors="np")
使用tokenizer对UTTERANCE进行分词和编码,生成模型输入inputs,设置最大长度为1024,返回类型为numpy数组
>>>
生成回复的注释标记
>>> reply_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5, early_stopping=True).sequences
调用生成模型生成回复,使用输入的input_ids,设置生成束搜索数为4,最大生成长度为5,启用提前停止策略,并获取生成的序列reply_ids
>>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in reply_ids])
打印生成的回复,对每个生成的序列进行反向分词和解码,并去除特殊标记和清理分词空格
"""
给指定类 `FlaxBlenderbotForConditionalGeneration` 覆盖方法 `call` 的文档字符串,
合并 `BLENDERBOT_INPUTS_DOCSTRING` 和 `FLAX_BLENDERBOT_CONDITIONAL_GENERATION_DOCSTRING`。
"""
overwrite_call_docstring(
FlaxBlenderbotForConditionalGeneration,
BLENDERBOT_INPUTS_DOCSTRING + FLAX_BLENDERBOT_CONDITIONAL_GENERATION_DOCSTRING,
)
"""
给指定类 `FlaxBlenderbotForConditionalGeneration` 添加或替换方法 `append_replace_return_docstrings` 的文档字符串,
设置输出类型为 `FlaxSeq2SeqLMOutput`,配置类为 `_CONFIG_FOR_DOC`。
"""
append_replace_return_docstrings(
FlaxBlenderbotForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC
)