Transformers 源码解析(三十二)
.\models\data2vec\modeling_data2vec_text.py
"""PyTorch Data2VecText model."""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_data2vec_text import Data2VecTextConfig
logger = logging.get_logger(__name__)
_HIDDEN_STATES_START_POSITION = 2
_CHECKPOINT_FOR_DOC = "facebook/data2vec-text-base"
_CONFIG_FOR_DOC = "Data2VecTextConfig"
DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/data2vec-text-base",
]
class Data2VecTextForTextEmbeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
):
如果没有提供位置 id:
如果提供了输入 token id:
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
否则:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
如果提供了输入 token id:
input_shape = input_ids.size()
否则:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
如果 token_type_ids 为空:
如果 self 中有 "token_type_ids" 属性:
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
否则:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
如果 inputs_embeds 为空:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
如果 self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
直接提供嵌入张量,无法推断哪些是填充的,因此生成顺序的位置 id。
Args:
inputs_embeds: torch.Tensor
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
class Data2VecTextSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
class Data2VecTextSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class Data2VecTextAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self = Data2VecTextSelfAttention(config, position_embedding_type=position_embedding_type)
self.output = Data2VecTextSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class Data2VecTextIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class Data2VecTextOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class Data2VecTextLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = Data2VecTextAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = Data2VecTextAttention(config, position_embedding_type="absolute")
self.intermediate = Data2VecTextIntermediate(config)
self.output = Data2VecTextOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class Data2VecTextEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([Data2VecTextLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class Data2VecTextPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class Data2VecTextPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Data2VecTextConfig
base_model_prefix = "data2vec_text"
supports_gradient_checkpointing = True
_no_split_modules = ["Data2VecTextForTextEmbeddings", "Data2VecTextLayer"]
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
if hasattr(module, "bias") and module.bias is not None:
module.bias.data.zero_()
if hasattr(module, "weight") and module.weight is not None:
module.weight.data.fill_(1.0)
DATA2VECTEXT_START_DOCSTRING = r"""
Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and
Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and
Michael Auli.
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
"""
Parameters:
config ([`Data2VecTextConfig`]): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
定义了一个多行字符串常量,用于文档字符串的输入参数说明。
"""
@add_start_docstrings(
"The bare Data2VecText Model for text transformer outputting raw hidden-states without any specific head on top.",
DATA2VECTEXT_START_DOCSTRING,
)
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in *Attention is
all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
Kaiser and Illia Polosukhin.
To behave as a decoder, the model needs to be initialized with the `is_decoder` argument of the configuration set
to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
.. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
"""
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = Data2VecTextForTextEmbeddings(config)
self.encoder = Data2VecTextEncoder(config)
self.pooler = Data2VecTextPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING
)
class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `Data2VecTextLMHeadModel` as a standalone, add `is_decoder=True.`")
self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
self.lm_head = Data2VecTextLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
@add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VECTEXT_START_DOCSTRING)
class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `Data2VecTextForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
self.lm_head = Data2VecTextLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.lm_head.decoder
def set_output_embeddings(self, new_embeddings):
self.lm_head.decoder = new_embeddings
@add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="<mask>",
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.data2vec_text(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
labels = labels.to(prediction_scores.device)
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class Data2VecTextLMHead(nn.Module):
"""Data2VecText Head for masked language modeling."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, features, **kwargs):
x = self.dense(features)
x = gelu(x)
x = self.layer_norm(x)
x = self.decoder(x)
return x
def _tie_weights(self):
if self.decoder.bias.device.type == "meta":
self.decoder.bias = self.bias
else:
self.bias = self.decoder.bias
@add_start_docstrings(
"""
Data2VecText模型变换器,顶部带有序列分类/回归头(汇总输出的线性层),例如用于GLUE任务。
""",
DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
self.classifier = Data2VecTextClassificationHead(config)
self.post_init()
@add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.data2vec_text(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Data2VecText Model with a multiple choice classification head on top (a linear layer on top of the pooled output
and a softmax) e.g. for RocStories/SWAG tasks.
""",
DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.data2vec_text = Data2VecTextModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.post_init()
@add_start_docstrings_to_model_forward(
DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
) -> Union[Tuple, MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.data2vec_text(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
labels = labels.to(reshaped_logits.device)
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Data2VecText Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForTokenClassification(Data2VecTextPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
Perform a forward pass through the model with optional inputs and outputs.
Args:
input_ids (Optional[torch.LongTensor]): The input tensor of token indices.
attention_mask (Optional[torch.FloatTensor]): The attention mask tensor.
token_type_ids (Optional[torch.LongTensor]): The token type IDs tensor.
position_ids (Optional[torch.LongTensor]): The position IDs tensor.
head_mask (Optional[torch.FloatTensor]): The head mask tensor.
inputs_embeds (Optional[torch.FloatTensor]): The embedded input tensors.
labels (Optional[torch.LongTensor]): The tensor of labels for classification.
output_attentions (Optional[bool]): Whether to output attentions.
output_hidden_states (Optional[bool]): Whether to output hidden states.
return_dict (Optional[bool]): Whether to return outputs as a dictionary.
Returns:
TokenClassifierOutput: Output object with logits and optional additional outputs.
"""
pass
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.data2vec_text(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
labels = labels.to(logits.device)
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class Data2VecTextClassificationHead(nn.Module):
"""用于句子级分类任务的头部模块。"""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
@add_start_docstrings(
"""
Data2VecText模型的问题回答任务头部,用于像SQuAD这样的抽取式问答任务(在隐藏状态输出之上使用线性层来计算“起始位置logits”和“结束位置logits”)。
""",
DATA2VECTEXT_START_DOCSTRING,
)
class Data2VecTextForQuestionAnswering(Data2VecTextPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.data2vec_text(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
input_ids: torch.Tensor, input tensor containing token IDs
padding_idx: int, index of padding token
past_key_values_length: int, optional, length of past key values
Returns:
torch.Tensor, tensor of position IDs corresponding to input_ids
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
.\models\data2vec\modeling_data2vec_vision.py
""" PyTorch Data2VecVision 模型。"""
import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
ImageClassifierOutput,
SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
find_pruneable_heads_and_indices,
meshgrid,
prune_linear_layer,
)
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_data2vec_vision import Data2VecVisionConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Data2VecVisionConfig"
_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
_IMAGE_CLASS_CHECKPOINT = "facebook/data2vec-vision-base-ft1k"
_IMAGE_CLASS_EXPECTED_OUTPUT = "remote control, remote"
DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/data2vec-vision-base-ft1k",
]
@dataclass
class Data2VecVisionModelOutputWithPooling(BaseModelOutputWithPooling):
"""
[`Data2VecVisionModel`] 的输出类。
"""
pass
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
will be returned.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()
output = input.div(keep_prob) * random_tensor
return output
class Data2VecVisionDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class Data2VecVisionEmbeddings(nn.Module):
"""
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
"""
def __init__(self, config: Data2VecVisionConfig) -> None:
super().__init__()
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
if config.use_mask_token:
self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
else:
self.mask_token = None
self.patch_embeddings = Data2VecVisionPatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
if config.use_absolute_position_embeddings:
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
else:
self.position_embeddings = None
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor:
embeddings, (patch_height, patch_width) = self.patch_embeddings(
pixel_values, self.position_embeddings[:, 1:, :] if self.position_embeddings is not None else None
)
batch_size, seq_len, _ = embeddings.size()
if bool_masked_pos is not None:
mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
embeddings = embeddings * (1 - w) + mask_tokens * w
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
if self.position_embeddings is not None:
cls_tokens = cls_tokens + self.position_embeddings[:, :1, :]
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
embeddings = self.dropout(embeddings)
return embeddings, (patch_height, patch_width)
class Data2VecVisionPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.patch_shape = patch_shape
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor, position_embedding: Optional[torch.Tensor] = None) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.projection(pixel_values)
patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
if position_embedding is not None:
position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(
0, 3, 1, 2
)
position_embedding = nn.functional.interpolate(
position_embedding, size=(patch_height, patch_width), mode="bicubic"
)
embeddings = embeddings + position_embedding
embeddings = embeddings.flatten(2).transpose(1, 2)
return embeddings, (patch_height, patch_width)
class Data2VecVisionSelfAttention(nn.Module):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
if window_size:
self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)
else:
self.relative_position_bias = None
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if self.relative_position_bias is not None:
attention_scores = attention_scores + self.relative_position_bias().unsqueeze(0)
if relative_position_bias is not None:
attention_scores = attention_scores + relative_position_bias
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class Data2VecVisionSelfOutput(nn.Module):
"""
在 Data2VecVisionLayer 中定义了残差连接,而不是在这里(像其他模型一样),这是因为在每个块之前应用了 layernorm。
"""
def __init__(self, config: Data2VecVisionConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class Data2VecVisionAttention(nn.Module):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
super().__init__()
self.attention = Data2VecVisionSelfAttention(config, window_size=window_size)
self.output = Data2VecVisionSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
self_outputs = self.attention(hidden_states, head_mask, output_attentions, relative_position_bias)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]
return outputs
class Data2VecVisionIntermediate(nn.Module):
def __init__(self, config: Data2VecVisionConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class Data2VecVisionOutput(nn.Module):
def __init__(self, config: Data2VecVisionConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class Data2VecVisionLayer(nn.Module):
"""This corresponds to the Block class in the timm implementation."""
def __init__(
self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0
) -> None:
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = Data2VecVisionAttention(config, window_size=window_size)
self.intermediate = Data2VecVisionIntermediate(config)
self.output = Data2VecVisionOutput(config)
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.drop_path = Data2VecVisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
init_values = config.layer_scale_init_value
if init_values > 0:
self.lambda_1 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
self.lambda_2 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
else:
self.lambda_1, self.lambda_2 = None, None
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states),
head_mask,
output_attentions=output_attentions,
relative_position_bias=relative_position_bias,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
if self.lambda_1 is not None:
attention_output = self.lambda_1 * attention_output
hidden_states = self.drop_path(attention_output) + hidden_states
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
layer_output = self.output(layer_output)
if self.lambda_2 is not None:
layer_output = self.lambda_2 * layer_output
layer_output = self.drop_path(layer_output) + hidden_states
outputs = (layer_output,) + outputs
return outputs
class Data2VecVisionRelativePositionBias(nn.Module):
def __init__(self, config: Data2VecVisionConfig, window_size: tuple) -> None:
super().__init__()
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
self.relative_position_bias_table = nn.Parameter(
torch.zeros(self.num_relative_distance, config.num_attention_heads)
)
coords_h = torch.arange(window_size[0])
coords_w = torch.arange(window_size[1])
coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
coords_flatten = torch.flatten(coords, 1)
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
relative_coords = relative_coords.permute(1, 2, 0).contiguous()
relative_coords[:, :, 0] += window_size[0] - 1
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = torch.zeros(
size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype
)
relative_position_index[1:, 1:] = relative_coords.sum(-1)
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer("relative_position_index", relative_position_index, persistent=False)
def forward(self) -> torch.Tensor:
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1
)
return relative_position_bias.permute(2, 0, 1).contiguous()
class Data2VecVisionEncoder(nn.Module):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
super().__init__()
self.config = config
if config.use_shared_relative_position_bias:
self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)
else:
self.relative_position_bias = None
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
self.layer = nn.ModuleList(
[
Data2VecVisionLayer(
config,
window_size=window_size if config.use_relative_position_bias else None,
drop_path_rate=dpr[i],
)
for i in range(config.num_hidden_layers)
]
)
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, BaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
layer_head_mask,
output_attentions,
)
else:
relative_position_bias = (
self.relative_position_bias() if self.relative_position_bias is not None else None
)
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, relative_position_bias)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class Data2VecVisionPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Data2VecVisionConfig
base_model_prefix = "data2vec_vision"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
DATA2VEC_VISION_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`Data2VecVisionConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DATA2VEC_VISION_INPUTS_DOCSTRING = r"""
# 定义函数签名和参数说明
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`BeitImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Data2VecVision Model transformer outputting raw hidden-states without any specific head on top.",
DATA2VEC_VISION_START_DOCSTRING,
)
class Data2VecVisionModel(Data2VecVisionPreTrainedModel):
def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = False) -> None:
super().__init__(config)
self.config = config
self.embeddings = Data2VecVisionEmbeddings(config)
self.encoder = Data2VecVisionEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)
self.layernorm = (
nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
)
self.pooler = Data2VecVisionPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Data2VecVisionModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, Data2VecVisionModelOutputWithPooling]:
r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
embedding_output, (patch_height, patch_width) = self.embeddings(pixel_values, bool_masked_pos)
encoder_outputs = self.encoder(
embedding_output,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
return head_outputs + encoder_outputs[1:]
return Data2VecVisionModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class Data2VecVisionPooler(nn.Module):
def __init__(self, config: Data2VecVisionConfig) -> None:
super().__init__()
self.layernorm = (
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
if self.layernorm is not None:
patch_tokens = hidden_states[:, 1:, :]
pooled_output = self.layernorm(patch_tokens.mean(1))
else:
pooled_output = hidden_states[:, 0]
return pooled_output
@add_start_docstrings(
"""
Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
the final hidden states of the patch tokens) e.g. for ImageNet.
""",
DATA2VEC_VISION_START_DOCSTRING,
)
class Data2VecVisionForImageClassification(Data2VecVisionPreTrainedModel):
def __init__(self, config: Data2VecVisionConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=True)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
self.post_init()
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.data2vec_vision(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class Data2VecVisionConvModule(nn.Module):
"""
A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, int]],
padding: Union[int, Tuple[int, int], str] = 0,
bias: bool = False,
dilation: Union[int, Tuple[int, int]] = 1,
) -> None:
super().__init__()
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
padding=padding,
bias=bias,
dilation=dilation,
)
self.bn = nn.BatchNorm2d(out_channels)
self.activation = nn.ReLU()
def forward(self, input: torch.Tensor) -> torch.Tensor:
output = self.conv(input)
output = self.bn(output)
output = self.activation(output)
return output
class Data2VecVisionPyramidPoolingBlock(nn.Module):
def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
super().__init__()
self.layers = [
nn.AdaptiveAvgPool2d(pool_scale),
Data2VecVisionConvModule(in_channels, channels, kernel_size=1),
]
for i, layer in enumerate(self.layers):
self.add_module(str(i), layer)
def forward(self, input: torch.Tensor) -> torch.Tensor:
hidden_state = input
for layer in self.layers:
hidden_state = layer(hidden_state)
return hidden_state
class Data2VecVisionPyramidPoolingModule(nn.Module):
"""
Pyramid Pooling Module (PPM) used in PSPNet.
Args:
pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
Module.
in_channels (int): Input channels.
channels (int): Channels after modules, before conv_seg.
align_corners (bool): align_corners argument of F.interpolate.
Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
"""
pass
def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
super().__init__()
self.pool_scales = pool_scales
self.align_corners = align_corners
self.in_channels = in_channels
self.channels = channels
self.blocks = []
for i, pool_scale in enumerate(pool_scales):
block = Data2VecVisionPyramidPoolingBlock(
pool_scale=pool_scale, in_channels=in_channels, channels=channels
)
self.blocks.append(block)
self.add_module(str(i), block)
def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
ppm_outs = []
for ppm in self.blocks:
ppm_out = ppm(x)
upsampled_ppm_out = nn.functional.interpolate(
ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
)
ppm_outs.append(upsampled_ppm_out)
return ppm_outs
class Data2VecVisionUperHead(nn.Module):
"""
Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
[UPerNet](https://arxiv.org/abs/1807.10221).
Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
"""
def __init__(self, config: Data2VecVisionConfig) -> None:
super().__init__()
self.pool_scales = config.pool_scales
self.in_channels = [config.hidden_size] * 4
self.channels = config.hidden_size
self.align_corners = False
self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)
self.psp_modules = Data2VecVisionPyramidPoolingModule(
self.pool_scales,
self.in_channels[-1],
self.channels,
align_corners=self.align_corners,
)
self.bottleneck = Data2VecVisionConvModule(
self.in_channels[-1] + len(self.pool_scales) * self.channels,
self.channels,
kernel_size=3,
padding=1,
)
self.lateral_convs = nn.ModuleList()
self.fpn_convs = nn.ModuleList()
for in_channels in self.in_channels[:-1]:
l_conv = Data2VecVisionConvModule(in_channels, self.channels, kernel_size=1)
fpn_conv = Data2VecVisionConvModule(self.channels, self.channels, kernel_size=3, padding=1)
self.lateral_convs.append(l_conv)
self.fpn_convs.append(fpn_conv)
self.fpn_bottleneck = Data2VecVisionConvModule(
len(self.in_channels) * self.channels,
self.channels,
kernel_size=3,
padding=1,
)
def psp_forward(self, inputs):
x = inputs[-1]
psp_outs = [x]
psp_outs.extend(self.psp_modules(x))
psp_outs = torch.cat(psp_outs, dim=1)
output = self.bottleneck(psp_outs)
return output
def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]
laterals.append(self.psp_forward(encoder_hidden_states))
used_backbone_levels = len(laterals)
for i in range(used_backbone_levels - 1, 0, -1):
prev_shape = laterals[i - 1].shape[2:]
laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
)
fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
fpn_outs.append(laterals[-1])
for i in range(used_backbone_levels - 1, 0, -1):
fpn_outs[i] = nn.functional.interpolate(
fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
)
fpn_outs = torch.cat(fpn_outs, dim=1)
output = self.fpn_bottleneck(fpn_outs)
output = self.classifier(output)
return output
class Data2VecVisionFCNHead(nn.Module):
"""
基于 Fully Convolution Networks 的语义分割头部。此头部的实现基于 FCNNet。
Args:
config (Data2VecVisionConfig): 配置参数。
in_channels: 输入通道数。
kernel_size (int): 头部卷积层的内核大小。默认为 3。
dilation (int): 头部卷积层的扩张率。默认为 1。
基于 OpenMMLab 实现,详情请见 https://github.com/open-mmlab/mmsegmentation。
"""
def __init__(
self,
config: Data2VecVisionConfig,
in_index: int = 2,
kernel_size: int = 3,
dilation: Union[int, Tuple[int, int]] = 1,
) -> None:
super().__init__()
self.in_channels = config.hidden_size
self.channels = config.auxiliary_channels
self.num_convs = config.auxiliary_num_convs
self.concat_input = config.auxiliary_concat_input
self.in_index = in_index
conv_padding = (kernel_size // 2) * dilation
convs = []
convs.append(
Data2VecVisionConvModule(
self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
)
)
for i in range(self.num_convs - 1):
convs.append(
Data2VecVisionConvModule(
self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
)
)
if self.num_convs == 0:
self.convs = nn.Identity()
else:
self.convs = nn.Sequential(*convs)
if self.concat_input:
self.conv_cat = Data2VecVisionConvModule(
self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
)
self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)
def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = encoder_hidden_states[self.in_index]
output = self.convs(hidden_states)
if self.concat_input:
output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
output = self.classifier(output)
return output
@add_start_docstrings(
"""
带有语义分割头部的 Data2VecVision 模型变压器,例如用于 ADE20k、CityScapes 等数据集。
""",
DATA2VEC_VISION_START_DOCSTRING,
)
class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
def __init__(self, config: Data2VecVisionConfig) -> None:
super().__init__(config)
self.num_labels = config.num_labels
self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=False)
if len(self.config.out_indices) != 4:
raise ValueError(
"Data2VecVisionForSemanticSegmentation requires config.out_indices to be a list of 4 integers, "
"specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
"a base-sized architecture."
)
self.fpn1 = nn.Sequential(
nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
nn.BatchNorm2d(config.hidden_size),
nn.GELU(),
nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
)
self.fpn2 = nn.Sequential(
nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
)
self.fpn3 = nn.Identity()
self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
self.decode_head = Data2VecVisionUperHead(config)
self.auxiliary_head = Data2VecVisionFCNHead(config) if config.use_auxiliary_head else None
self.post_init()
def compute_loss(self, logits, auxiliary_logits, labels):
upsampled_logits = nn.functional.interpolate(
logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
)
if auxiliary_logits is not None:
upsampled_auxiliary_logits = nn.functional.interpolate(
auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
)
loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
main_loss = loss_fct(upsampled_logits, labels)
loss = main_loss
if auxiliary_logits is not None:
auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
loss += self.config.auxiliary_loss_weight * auxiliary_loss
return loss
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
.\models\data2vec\modeling_tf_data2vec_vision.py
""" TF 2.0 Data2Vec Vision model."""
from __future__ import annotations
import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling,
TFSemanticSegmenterOutput,
TFSequenceClassifierOutput,
)
from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_data2vec_vision import Data2VecVisionConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "Data2VecVisionConfig"
_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
_IMAGE_CLASS_CHECKPOINT = "facebook/data2vec-vision-base-ft1k"
_IMAGE_CLASS_EXPECTED_OUTPUT = "remote control, remote"
TF_DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/data2vec-vision-base-ft1k",
]
@dataclass
class TFData2VecVisionModelOutputWithPooling(TFBaseModelOutputWithPooling):
"""
[`TFData2VecVisionModel`] 的输出类。
"""
pass
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的隐藏状态序列。
pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
如果 *config.use_mean_pooling* 设置为 True,则为所有补丁令牌的最后一层隐藏状态的平均值(不包括 *[CLS]* 令牌)。
如果设置为 False,则返回 *[CLS]* 令牌的最终隐藏状态。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
一个元组,包含模型每一层的隐藏状态的 `tf.Tensor`(包括初始嵌入输出)。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
一个元组,包含注意力权重的 `tf.Tensor`(每层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
这些权重是注意力 softmax 后的结果,用于计算自注意力头中的加权平均值。
"""
# 初始化函数参数默认值为 None
last_hidden_state: tf.Tensor = None
pooler_output: tf.Tensor = None
hidden_states: Tuple[tf.Tensor] | None = None
attentions: Tuple[tf.Tensor] | None = None
class TFData2VecVisionDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
References:
(1) github.com:rwightman/pytorch-image-models
"""
def __init__(self, drop_path, **kwargs):
super().__init__(**kwargs)
self.drop_path = drop_path # 初始化方法,设置了一个名为 drop_path 的属性
def call(self, x, training=None):
if training:
keep_prob = 1 - self.drop_path # 如果处于训练模式,计算保留概率
shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) # 计算随机张量的形状
random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) # 生成随机张量
random_tensor = tf.floor(random_tensor) # 取下界,得到二元随机张量
return (x / keep_prob) * random_tensor # 应用随机张量的 drop path 操作
return x # 如果不处于训练模式,直接返回输入张量
class TFData2VecVisionEmbeddings(keras.layers.Layer):
"""
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
"""
def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs)
self.config = config # 初始化方法,设置了一个名为 config 的属性
self.patch_embeddings = TFData2VecVisionPatchEmbeddings(config, name="patch_embeddings") # 创建 TFData2VecVisionPatchEmbeddings 对象
self.num_patches = self.patch_embeddings.num_patches # 获取 patch embeddings 的数量
self.config = config # 再次设置 config 属性,这可能是重复的操作
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) # 创建一个 dropout 层
def build(self, input_shape=None):
self.cls_token = self.add_weight(
shape=(1, 1, self.config.hidden_size),
initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
trainable=True,
name="cls_token",
) # 构建 CLS token 的权重
if self.config.use_mask_token:
self.mask_token = self.add_weight(
shape=(1, 1, self.config.hidden_size),
initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
trainable=True,
name="mask_token",
) # 如果配置中使用 mask token,则构建 mask token 的权重
else:
self.mask_token = None # 否则,设置 mask token 为 None
if self.config.use_absolute_position_embeddings:
self.position_embeddings = self.add_weight(
shape=(1, self.num_patches + 1, self.config.hidden_size),
initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
trainable=True,
name="position_embeddings",
) # 如果配置中使用绝对位置 embeddings,则构建位置 embeddings 的权重
else:
self.position_embeddings = None # 否则,设置位置 embeddings 为 None
if self.built:
return
self.built = True # 标记为已构建状态
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None) # 构建 patch embeddings
# 定义一个方法`call`,接受两个参数`pixel_values`和`bool_masked_pos`,返回一个张量
def call(self, pixel_values: tf.Tensor, bool_masked_pos: tf.Tensor | None = None) -> tf.Tensor:
# 使用`patch_embeddings`方法将输入的像素值转换成嵌入向量
embeddings = self.patch_embeddings(pixel_values)
# 获取嵌入向量的形状信息:批大小、序列长度和投影维度
batch_size, seq_len, projection_dim = shape_list(embeddings)
# 创建一个形状为(batch_size, 1, 1)的张量,其中每个元素都是`cls_token`
cls_tokens = tf.tile(self.cls_token, (batch_size, 1, 1))
# 如果`bool_masked_pos`不为`None`,则执行以下操作
if bool_masked_pos is not None:
# 创建一个形状与`embeddings`相同的张量,每个元素都是`self.mask_token`
mask_tokens = tf.broadcast_to(self.mask_token, (batch_size, seq_len, projection_dim))
# 将被掩盖的视觉标记替换为`mask_tokens`
w = bool_masked_pos[..., None]
# 将`w`转换为与`mask_tokens`相同的数据类型
w = tf.cast(w, mask_tokens.dtype)
# 由于TF不支持即时张量赋值,使用加法和乘法来实现掩盖操作
embeddings = embeddings * (1 - w) + mask_tokens * w
# 将`cls_tokens`和`embeddings`沿着序列长度的方向连接起来
embeddings = tf.concat([cls_tokens, embeddings], axis=1)
# 如果存在`position_embeddings`,将其加到`embeddings`上
if self.position_embeddings is not None:
embeddings = embeddings + self.position_embeddings
# 对`embeddings`应用dropout操作
embeddings = self.dropout(embeddings)
# 返回处理后的`embeddings`张量作为方法的输出
return embeddings
class TFData2VecVisionPatchEmbeddings(keras.layers.Layer):
"""
Image to Patch Embedding.
"""
def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
# 从配置中获取图像大小和补丁大小
image_size, patch_size = config.image_size, config.patch_size
# 获取通道数和隐藏层大小
num_channels, hidden_size = config.num_channels, config.hidden_size
# 将图像大小和补丁大小转换为迭代对象(如果它们不是),确保它们是元组
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# 计算图像中的补丁数
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
# 计算补丁的形状
patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
# 设置对象的属性
self.image_size = image_size
self.patch_size = patch_size
self.num_patches = num_patches
self.patch_shape = patch_shape
self.num_channels = num_channels
# 创建卷积层,用于将像素值投影到隐藏空间
self.projection = keras.layers.Conv2D(
filters=hidden_size,
kernel_size=patch_size,
strides=patch_size,
padding="valid",
data_format="channels_last",
kernel_initializer="glorot_uniform", # 使用glorot_uniform初始化权重,类似于torch.nn.Linear
bias_initializer="zeros",
name="projection",
)
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
# 获取像素值张量的形状信息
batch_size, num_channels, height, width = shape_list(pixel_values)
# 在动态执行模式下,验证像素值的通道数是否与配置中设置的通道数匹配
if tf.executing_eagerly():
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the"
" configuration."
)
# 验证输入图像的高度和宽度是否与配置中设置的图像大小匹配
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model"
f" ({self.image_size[0]}*{self.image_size[1]})."
)
# 当在CPU上运行时,`keras.layers.Conv2D`不支持`NCHW`格式,所以将输入格式从`NCHW`转换为`NHWC`
# shape = (batch_size, in_height, in_width, in_channels=num_channels)
pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
# 将像素值投影到隐藏空间
projection = self.projection(pixel_values)
# 将2D空间维度变换为单个时间维度,即将投影结果reshape成(batch_size, num_patches, -1)
num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0])
return tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1))
# 定义一个方法 `build`,用于构建神经网络层的参数
def build(self, input_shape=None):
# 如果已经构建过,则直接返回,不再重复构建
if self.built:
return
# 标记为已经构建
self.built = True
# 如果存在投影层 `projection`,则构建该投影层
if getattr(self, "projection", None) is not None:
# 在 TensorFlow 中创建一个命名空间 `self.projection.name`
with tf.name_scope(self.projection.name):
# 构建投影层,指定输入形状为 [None, None, None, self.num_channels]
self.projection.build([None, None, None, self.num_channels])
class TFData2VecVisionSelfAttention(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
super().__init__(**kwargs)
# 检查隐藏大小是否是注意力头数的整数倍
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
# 创建用于查询的全连接层
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
# 创建用于键的全连接层,不使用偏置项
self.key = keras.layers.Dense(
units=self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name="key",
use_bias=False,
)
# 创建用于值的全连接层
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
# Dropout 层,用于注意力概率的丢弃
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
# 如果给定了窗口大小,则创建相对位置偏置层
if window_size:
self.relative_position_bias = TFData2VecVisionRelativePositionBias(
config, window_size=window_size, name="relative_position_bias"
)
else:
self.relative_position_bias = None
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
# 将张量从 [batch_size, seq_length, all_head_size] 重塑为 [batch_size, seq_length, num_attention_heads, attention_head_size]
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
# 将张量从 [batch_size, seq_length, num_attention_heads, attention_head_size] 转置为 [batch_size, num_attention_heads, seq_length, attention_head_size]
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
training: bool = False,
) -> Tuple[tf.Tensor]:
# 获取隐藏状态的批量大小
batch_size = shape_list(hidden_states)[0]
# 通过self.query对隐藏状态进行查询操作
mixed_query_layer = self.query(inputs=hidden_states)
# 通过self.key对隐藏状态进行键操作
mixed_key_layer = self.key(inputs=hidden_states)
# 通过self.value对隐藏状态进行值操作
mixed_value_layer = self.value(inputs=hidden_states)
# 将混合的查询层转置以便进行注意力计算
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
# 将混合的键层转置以便进行注意力计算
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
# 将混合的值层转置以便进行注意力计算
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
# 对"查询"和"键"进行点积操作以获得原始注意力分数
# 结果形状为(batch size, num_heads, seq_len_q, seq_len_k)
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
# 对注意力分数进行除以平方根(num_heads)的缩放操作
attention_scores = attention_scores / self.sqrt_att_head_size
# 如果存在相对位置偏置,则添加到注意力分数中
if self.relative_position_bias is not None:
# 传递0.0给relative_position_bias()层,因为在这种情况下,该输入不会参与任何计算
attention_scores = attention_scores + self.relative_position_bias(0.0)[None, ...]
# 如果提供了共享的相对位置偏置,则添加到注意力分数中
if relative_position_bias is not None:
attention_scores = attention_scores + relative_position_bias
# 将注意力分数归一化为概率
attention_probs = stable_softmax(logits=attention_scores, axis=-1)
# 使用dropout随机丢弃整个注意力概率矩阵中的元素,这在Transformer中是标准做法
attention_probs = self.dropout(inputs=attention_probs, training=training)
# 如果有头部掩码(head_mask),则应用头部掩码到注意力概率中
if head_mask is not None:
attention_probs = tf.multiply(attention_probs, head_mask)
# 计算注意力输出,将注意力概率乘以值层
attention_output = tf.matmul(attention_probs, value_layer)
# 调整注意力输出的维度顺序
attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
# 将注意力输出重塑为(batch_size, seq_len_q, all_head_size)
attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
# 如果需要输出注意力矩阵,则将注意力概率包含在输出中
outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
return outputs
# 构建方法用于初始化模型结构,如果已经构建过则直接返回
def build(self, input_shape=None):
if self.built:
return
# 将标志设置为已构建
self.built = True
# 如果存在查询(query)属性,则构建查询的神经网络层
if getattr(self, "query", None) is not None:
# 在 TensorFlow 中使用名称作用域来管理操作,这里是为查询层创建名称作用域
with tf.name_scope(self.query.name):
# 构建查询层,输入形状为 [None, None, self.config.hidden_size]
self.query.build([None, None, self.config.hidden_size])
# 如果存在键(key)属性,则构建键的神经网络层
if getattr(self, "key", None) is not None:
# 在 TensorFlow 中使用名称作用域来管理操作,这里是为键层创建名称作用域
with tf.name_scope(self.key.name):
# 构建键层,输入形状为 [None, None, self.config.hidden_size]
self.key.build([None, None, self.config.hidden_size])
# 如果存在值(value)属性,则构建值的神经网络层
if getattr(self, "value", None) is not None:
# 在 TensorFlow 中使用名称作用域来管理操作,这里是为值层创建名称作用域
with tf.name_scope(self.value.name):
# 构建值层,输入形状为 [None, None, self.config.hidden_size]
self.value.build([None, None, self.config.hidden_size])
# 如果存在相对位置偏置(relative_position_bias)属性,则构建该偏置
if getattr(self, "relative_position_bias", None) is not None:
# 在 TensorFlow 中使用名称作用域来管理操作,这里是为相对位置偏置层创建名称作用域
with tf.name_scope(self.relative_position_bias.name):
# 构建相对位置偏置层,输入形状为 None(形状由数据决定)
self.relative_position_bias.build(None)
class TFData2VecVisionSelfOutput(keras.layers.Layer):
"""
The residual connection is defined in TFData2VecVisionLayer instead of here (as is the case with other models), due
to the layernorm applied before each block.
"""
def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs)
# 初始化一个全连接层,用于变换隐藏状态到指定大小
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 初始化一个dropout层,用于在训练时随机丢弃部分神经元,防止过拟合
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, training: bool = False) -> tf.Tensor:
# 使用全连接层变换隐藏状态
hidden_states = self.dense(inputs=hidden_states)
# 在训练时应用dropout层
hidden_states = self.dropout(inputs=hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果已经构建,直接返回
if getattr(self, "dense", None) is not None:
# 在名为dense的作用域内构建dense层
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFData2VecVisionAttention(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
super().__init__(**kwargs)
# 初始化自注意力层,使用Data2VecVisionSelfAttention
self.attention = TFData2VecVisionSelfAttention(config, window_size=window_size, name="attention")
# 初始化输出层,使用TFData2VecVisionSelfOutput
self.dense_output = TFData2VecVisionSelfOutput(config, name="output")
def prune_heads(self, heads):
# 留空,抛出未实现错误,暂不实现头部修剪功能
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
training: bool = False,
) -> Tuple[tf.Tensor]:
# 调用自注意力层处理输入张量
self_outputs = self.attention(
hidden_states=input_tensor,
head_mask=head_mask,
output_attentions=output_attentions,
relative_position_bias=relative_position_bias,
training=training,
)
# 使用输出层处理自注意力层的输出结果
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
)
# 将处理后的结果打包成元组输出,如果需要输出注意力权重,则附加到输出元组中
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果未构建,构建自注意力层
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
# 如果未构建,构建输出层
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
# Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->Data2VecVision
class TFData2VecVisionIntermediate(keras.layers.Layer):
# 初始化函数,用于创建一个新的Data2VecVisionConfig对象,并设置网络层的一些参数
def __init__(self, config: Data2VecVisionConfig, **kwargs):
# 调用父类的初始化方法,传递额外的关键字参数
super().__init__(**kwargs)
# 创建一个全连接层,设置单元数为config.intermediate_size,使用指定的初始化器初始化权重
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 如果config.hidden_act是字符串类型,将其转换为对应的激活函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
# 否则直接使用配置中指定的激活函数
self.intermediate_act_fn = config.hidden_act
# 保存配置对象到当前实例中
self.config = config
# 调用函数,实现对输入隐藏状态的前向传播
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
# 将输入隐藏状态传递给全连接层,并获取输出
hidden_states = self.dense(inputs=hidden_states)
# 将全连接层的输出应用中间激活函数
hidden_states = self.intermediate_act_fn(hidden_states)
# 返回处理后的隐藏状态作为输出
return hidden_states
# 构建函数,用于在第一次调用call函数时构建网络层
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
# 设置标志位表示已经构建过网络层
self.built = True
# 如果存在全连接层dense,则开始构建dense层,指定输入形状为[None, None, self.config.hidden_size]
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFData2VecVisionOutput(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs)
# 创建一个全连接层,用于处理隐藏状态
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
# 创建一个dropout层,用于在训练时随机断开连接以防止过拟合
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
# 输入隐藏状态到全连接层进行处理
hidden_states = self.dense(inputs=hidden_states)
# 根据训练状态应用dropout操作
hidden_states = self.dropout(inputs=hidden_states, training=training)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
# 根据配置构建全连接层,输入形状为[None, None, self.config.intermediate_size]
self.dense.build([None, None, self.config.intermediate_size])
class TFData2VecVisionLayer(keras.layers.Layer):
"""This corresponds to the Block class in the timm implementation."""
def __init__(
self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0, **kwargs
):
super().__init__(**kwargs)
self.config = config
# 创建注意力层对象,用于处理视觉特征
self.attention = TFData2VecVisionAttention(config, window_size=window_size, name="attention")
# 创建中间层对象,用于进一步处理注意力层的输出
self.intermediate = TFData2VecVisionIntermediate(config, name="intermediate")
# 创建数据2向量输出层对象,用于最终的输出
self.data2vec_output = TFData2VecVisionOutput(config, name="output")
# 创建LayerNormalization层,用于在每个子层之前应用
self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
# 创建LayerNormalization层,用于在每个子层之后应用
self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
# 根据drop_path_rate值选择不同的激活层对象
self.drop_path = (
TFData2VecVisionDropPath(drop_path_rate, name="drop_path")
if drop_path_rate > 0.0
else keras.layers.Activation("linear", name="drop_path")
)
# 初始化层标度的初始值
self.init_values = config.layer_scale_init_value
# 定义模型的 build 方法,用于构建模型结构
def build(self, input_shape: tf.TensorShape = None):
# 如果指定了初始化值,则创建 lambda_1 和 lambda_2 权重,并赋予初始值
if self.init_values > 0:
# 创建 lambda_1 权重,形状为隐藏层大小,初始化为全1,可训练
self.lambda_1 = self.add_weight(
shape=(self.config.hidden_size),
initializer="ones",
trainable=True,
name="lambda_1",
)
# 创建 lambda_2 权重,形状为隐藏层大小,初始化为全1,可训练
self.lambda_2 = self.add_weight(
shape=(self.config.hidden_size),
initializer="ones",
trainable=True,
name="lambda_2",
)
# 使用初始化值乘以全1向量,赋值给 lambda_1 和 lambda_2
self.lambda_1.assign(self.init_values * tf.ones((self.config.hidden_size)))
self.lambda_2.assign(self.init_values * tf.ones((self.config.hidden_size)))
else:
# 如果没有指定初始化值,则 lambda_1 和 lambda_2 设为 None
self.lambda_1, self.lambda_2 = None, None
# 如果模型已经构建完成,则直接返回
if self.built:
return
self.built = True
# 如果定义了 attention 属性,则构建 attention 模块
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
# 如果定义了 intermediate 属性,则构建 intermediate 模块
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
# 如果定义了 data2vec_output 属性,则构建 data2vec_output 模块
if getattr(self, "data2vec_output", None) is not None:
with tf.name_scope(self.data2vec_output.name):
self.data2vec_output.build(None)
# 如果定义了 layernorm_before 属性,则构建 layernorm_before 模块
if getattr(self, "layernorm_before", None) is not None:
with tf.name_scope(self.layernorm_before.name):
self.layernorm_before.build([None, None, self.config.hidden_size])
# 如果定义了 layernorm_after 属性,则构建 layernorm_after 模块
if getattr(self, "layernorm_after", None) is not None:
with tf.name_scope(self.layernorm_after.name):
self.layernorm_after.build([None, None, self.config.hidden_size])
# 如果定义了 drop_path 属性,则构建 drop_path 模块
if getattr(self, "drop_path", None) is not None:
with tf.name_scope(self.drop_path.name):
self.drop_path.build(None)
# 定义模型的 call 方法,用于执行前向传播
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor,
output_attentions: bool,
relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
training: bool = False,
) -> Tuple[tf.Tensor]: # 定义函数返回类型为包含一个 TensorFlow 张量的元组
self_attention_outputs = self.attention(
# 在 Data2VecVision 中,在自注意力之前应用层归一化
input_tensor=self.layernorm_before(inputs=hidden_states), # 对输入张量进行层归一化处理
head_mask=head_mask, # 头部遮罩,用于指定屏蔽哪些注意力头
output_attentions=output_attentions, # 是否输出注意力权重
relative_position_bias=relative_position_bias, # 相对位置偏置,用于自注意力中的位置编码
training=training, # 是否在训练模式下
)
attention_output = self_attention_outputs[0] # 获取自注意力模块的输出张量
outputs = self_attention_outputs[1:] # 如果需要输出注意力权重,则包含在输出中
# 如果存在 lambda_1,则应用到注意力输出上
if self.lambda_1 is not None:
attention_output = self.lambda_1 * attention_output
# 第一个残差连接
hidden_states = self.drop_path(attention_output) + hidden_states # 使用 drop_path 函数进行残差连接
# 在 Data2VecVision 中,还会在自注意力之后应用层归一化
layer_output = self.layernorm_after(hidden_states) # 对残差连接后的张量进行层归一化处理
layer_output = self.intermediate(layer_output) # 中间层转换
layer_output = self.data2vec_output(layer_output) # Data2Vec 输出层
# 如果存在 lambda_2,则应用到最终输出层上
if self.lambda_2 is not None:
layer_output = self.lambda_2 * layer_output
# 第二个残差连接
layer_output = self.drop_path(layer_output) + hidden_states # 使用 drop_path 函数进行残差连接到原始隐藏状态上
outputs = (layer_output,) + outputs # 构建最终的输出元组
return outputs # 返回输出元组
# Taken and modified from here:
# https://github.com/leondgarse/keras_cv_attention_models/blob/main/keras_cv_attention_models/beit/beit.py#L28
# 定义一个自定义的Keras层,用于处理数据2Vec视觉任务的相对位置偏置
class TFData2VecVisionRelativePositionBias(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, window_size: tuple, **kwargs) -> None:
super().__init__(**kwargs)
self.config = config
self.window_size = window_size
# 计算相对距离的数量,加上3用于处理特殊标记(cls_token_pos_len)
# window_size可以是类似于(14, 14)的元组
self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
# 创建相对位置索引
self.relative_position_index = self.get_position_index()
def build(self, input_shape):
# 添加一个可训练的权重矩阵,用于存储相对位置偏置表
self.relative_position_bias_table = self.add_weight(
shape=(self.num_relative_distance, self.config.num_attention_heads),
initializer="zeros",
trainable=True,
name="relative_position_bias_table",
) # [2*Wh-1 * 2*Ww-1, nH]
# cls to token & token 2 cls & cls to cls
super().build(input_shape)
def get_position_index(self):
# 获取窗口内每个标记的成对相对位置索引
xx, yy = tf.meshgrid(range(self.window_size[0]), range(self.window_size[1]))
coords = tf.stack([yy, xx], axis=0) # [2, Wh, Ww]
coords_flatten = tf.reshape(coords, [2, -1]) # [2, Wh*Ww]
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # [2, Wh*Ww, Wh*Ww]
relative_coords = tf.transpose(relative_coords, perm=[1, 2, 0]) # [Wh*Ww, Wh*Ww, 2]
xx = (relative_coords[:, :, 0] + self.window_size[0] - 1) * (2 * self.window_size[1] - 1)
yy = relative_coords[:, :, 1] + self.window_size[1] - 1
relative_coords = tf.stack([xx, yy], axis=-1)
relative_position_index = tf.reduce_sum(relative_coords, axis=-1) # [Wh*Ww, Wh*Ww]
# 添加特殊标记,表示cls到token、token到cls、cls到cls的相对位置
top = tf.ones((1, relative_position_index.shape[1]), dtype=relative_position_index.dtype) * (
self.num_relative_distance - 3
)
left = tf.ones((relative_position_index.shape[0], 1), dtype=relative_position_index.dtype) * (
self.num_relative_distance - 2
)
corner = tf.ones((1, 1), dtype=relative_position_index.dtype) * (self.num_relative_distance - 1)
left_corner = tf.concat([corner, left], axis=0)
relative_position_index = tf.concat([top, relative_position_index], axis=0)
relative_position_index = tf.concat([left_corner, relative_position_index], axis=1) # [Wh*Ww + 1, Wh*Ww + 1]
return relative_position_index
def call(self, inputs=None) -> tf.Tensor:
# 根据相对位置索引从相对位置偏置表中获取相对位置偏置
relative_position_bias = tf.gather(self.relative_position_bias_table, self.relative_position_index, axis=0)
return tf.transpose(relative_position_bias, [2, 0, 1])
def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs):
super().__init__(**kwargs)
self.config = config # 初始化对象的配置信息
# 根据配置决定是否创建相对位置偏置对象
if config.use_shared_relative_position_bias:
self.relative_position_bias = TFData2VecVisionRelativePositionBias(
config, window_size=window_size, name="relative_position_bias"
)
else:
self.relative_position_bias = None
# 根据层的数量创建 TFData2VecVisionLayer 对象的列表
# 每层具有不同的 drop path rate,并且根据配置选择是否使用相对位置偏置
dpr = list(tf.linspace(0.0, config.drop_path_rate, config.num_hidden_layers))
self.layer = [
TFData2VecVisionLayer(
config,
window_size=window_size if config.use_relative_position_bias else None,
drop_path_rate=dpr[i],
name=f"layer_._{i}",
)
for i in range(config.num_hidden_layers)
]
def call(
self,
hidden_states: tf.Tensor,
head_mask: tf.Tensor | None = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> Union[tuple, TFBaseModelOutput]:
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
# 遍历所有层,对每一层进行前向传播
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 获取当前层的头部掩码
layer_head_mask = head_mask[i] if head_mask is not None else None
# 获取相对位置偏置对象,如果没有则为 None
relative_position_bias = (
self.relative_position_bias(0.0) if self.relative_position_bias is not None else None
)
# 调用当前层的前向传播方法
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, relative_position_bias)
# 更新隐藏状态为当前层的输出
hidden_states = layer_outputs[0]
# 如果需要输出注意力权重,记录当前层的注意力权重输出
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
# 如果需要输出所有隐藏状态,记录最终的隐藏状态
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# 根据 return_dict 决定返回值的格式
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# 返回 TFBaseModelOutput 对象,包含最终的隐藏状态、所有隐藏状态和注意力权重
return TFBaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
# 定义 build 方法,用于构建神经网络层
def build(self, input_shape=None):
# 如果已经构建过,直接返回,避免重复构建
if self.built:
return
# 设置标志位,表示已经构建
self.built = True
# 检查是否存在相对位置偏置项,如果有,则构建其相应的层
if getattr(self, "relative_position_bias", None) is not None:
# 使用相对位置偏置项的名称作为命名空间
with tf.name_scope(self.relative_position_bias.name):
# 调用该项的 build 方法构建
self.relative_position_bias.build(None)
# 检查是否存在层列表,如果有,则逐层构建
if getattr(self, "layer", None) is not None:
# 遍历层列表
for layer in self.layer:
# 使用层的名称作为命名空间
with tf.name_scope(layer.name):
# 调用层的 build 方法构建
layer.build(None)
# 声明一个自定义层 TFData2VecVisionMainLayer,使用 keras_serializable 装饰器标记为可序列化
@keras_serializable
class TFData2VecVisionMainLayer(keras.layers.Layer):
# 设置类属性 config_class 为 Data2VecVisionConfig,指定配置类
config_class = Data2VecVisionConfig
# 初始化方法,接受 Data2VecVisionConfig 实例和一个布尔型参数 add_pooling_layer
def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = True, **kwargs):
# 调用父类的初始化方法
super().__init__(**kwargs)
# 设置实例属性 config 为传入的 config 参数
self.config = config
# 设置实例属性 add_pooling_layer 为传入的 add_pooling_layer 参数
self.add_pooling_layer = add_pooling_layer
# 创建 TFData2VecVisionEmbeddings 实例,命名为 embeddings
self.embeddings = TFData2VecVisionEmbeddings(config, name="embeddings")
# 创建 TFData2VecVisionEncoder 实例,命名为 encoder,传入 window_size 参数
self.encoder = TFData2VecVisionEncoder(
config, window_size=self.embeddings.patch_embeddings.patch_shape, name="encoder"
)
# 根据配置中的 use_mean_pooling 属性选择性地初始化 layernorm 层
self.layernorm = (
tf.identity # 如果 use_mean_pooling 为 True,使用 tf.identity
if config.use_mean_pooling
else keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
# 如果 use_mean_pooling 为 False,使用 LayerNormalization,并指定 epsilon 和名字
)
# 如果 add_pooling_layer 为 True,则创建 TFData2VecVisionPooler 实例,命名为 pooler
# 否则设为 None
self.pooler = TFData2VecVisionPooler(config, name="pooler") if add_pooling_layer else None
# 返回嵌入层的输入 embeddings.patch_embeddings
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings.patch_embeddings
# 未实现的方法,用于剪枝模型中的注意力头
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
# 使用 unpack_inputs 装饰器处理输入参数,定义模型的调用方法
def call(
self,
pixel_values: tf.Tensor | None = None,
bool_masked_pos: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs
):
# 这里未完整显示,继续下面的函数参数和逻辑
) -> Union[tuple, TFData2VecVisionModelOutputWithPooling]:
# 设置输出注意力权重,默认为模型配置中的设定
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 设置输出隐藏状态,默认为模型配置中的设定
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 设置是否返回字典格式的输出,默认为模型配置中的设定
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
# 如果未提供像素值,抛出数值错误
raise ValueError("You have to specify pixel_values")
# 如果需要,准备头部掩码
# head_mask 中 1.0 表示保留该头部
# attention_probs 的形状为 bsz x n_heads x N x N
# 输入的 head_mask 形状为 [num_heads] 或 [num_hidden_layers x num_heads]
# head_mask 被转换为 [num_hidden_layers x batch x num_heads x seq_length x seq_length] 的形状
if head_mask is not None:
# 如果有头部掩码,则抛出未实现错误
raise NotImplementedError
else:
# 否则,使用 None 初始化 head_mask 列表,长度为模型中的隐藏层数
head_mask = [None] * self.config.num_hidden_layers
# 使用 embeddings 方法生成嵌入输出
embedding_output = self.embeddings(pixel_values, bool_masked_pos, training=training)
# 使用 encoder 方法对嵌入输出进行编码
encoder_outputs = self.encoder(
embedding_output,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取编码器的序列输出
sequence_output = encoder_outputs[0]
# 应用 layernorm 层
sequence_output = self.layernorm(sequence_output)
# 如果存在池化层,对序列输出进行池化
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
if not return_dict:
# 如果不要求返回字典格式的输出,则返回元组形式的输出
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
return head_outputs + encoder_outputs[1:]
# 如果要求返回字典格式的输出,则构建 TFData2VecVisionModelOutputWithPooling 对象返回
return TFData2VecVisionModelOutputWithPooling(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
def build(self, input_shape=None):
# 如果已经构建过模型,则直接返回
if self.built:
return
self.built = True
# 如果存在 embeddings 属性,则构建 embeddings 层
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
# 如果存在 encoder 属性,则构建 encoder 层
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# 如果存在 layernorm 属性,则构建 layernorm 层
if getattr(self, "layernorm", None) is not None:
if hasattr(self.layernorm, "name"):
with tf.name_scope(self.layernorm.name):
self.layernorm.build((None, self.config.hidden_size))
# 如果存在 pooler 属性,则构建 pooler 层
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFData2VecVisionPooler(keras.layers.Layer):
def __init__(self, config: Data2VecVisionConfig, **kwargs):
super().__init__(**kwargs)
# 如果配置要求使用均值池化,则初始化 LayerNormalization 层
self.layernorm = (
keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
if config.use_mean_pooling
else None
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
if self.layernorm is not None:
# 对补丁令牌的最终隐藏状态进行均值池化
patch_tokens = hidden_states[:, 1:, :]
pooled_output = self.layernorm(tf.reduce_mean(patch_tokens, axis=1))
else:
# 通过仅获取 [CLS] 令牌的最终隐藏状态进行池化
pooled_output = hidden_states[:, 0]
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
# 如果存在 layernorm 层,构建它并指定其输入形状
if getattr(self, "layernorm", None) is not None:
if hasattr(self.layernorm, "name"):
with tf.name_scope(self.layernorm.name):
self.layernorm.build((None, self.config.hidden_size))
class TFData2VecVisionPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Data2VecVisionConfig
base_model_prefix = "data2vec_vision"
main_input_name = "pixel_values"
_keys_to_ignore_on_load_unexpected = [r"relative_position_index"]
DATA2VEC_VISION_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.).
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)`
仅传入 `pixel_values` 张量,没有其他参数,用法示例为 `model(pixel_values)`。
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
传入一个长度可变的列表,按照文档字符串中给定的顺序包含一个或多个输入张量,例如 `model([pixel_values, attention_mask])` 或 `model([pixel_values, attention_mask, token_type_ids])`。
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`
传入一个字典,其中键是文档字符串中指定的输入名称,对应的值是相应的输入张量,例如 `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`。
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`Data2VecVisionConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
DATA2VEC_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`BeitImageProcessor.__call__`] for details.
head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
in eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False``):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
"""
@add_start_docstrings(
"The bare Data2VecVision Model transformer outputting raw hidden-states without any specific head on top.",
DATA2VEC_VISION_START_DOCSTRING,
)
class TFData2VecVisionModel(TFData2VecVisionPreTrainedModel):
def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = False, *inputs, **kwargs):
# 调用父类构造函数初始化模型
super().__init__(config, *inputs, **kwargs)
# 将传入的配置参数保存在实例变量中
self.config = config
# 创建 Data2VecVisionMainLayer 的实例作为模型的核心层
self.data2vec_vision = TFData2VecVisionMainLayer(
config, add_pooling_layer=add_pooling_layer, name="data2vec_vision"
)
# 获取输入嵌入的方法
def get_input_embeddings(self):
return self.data2vec_vision.get_input_embeddings()
# 模型的调用方法,接受多个输入参数并返回模型输出
@unpack_inputs
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFData2VecVisionModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def call(
self,
pixel_values: TFModelInputType | None = None,
bool_masked_pos: tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
# 剩余未列出的参数由装饰器处理
) -> Union[tuple, TFData2VecVisionModelOutputWithPooling]:
r"""
bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""
# 调用 self.data2vec_vision 方法,传入参数并获取输出
outputs = self.data2vec_vision(
pixel_values=pixel_values,
bool_masked_pos=bool_masked_pos,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 返回 self.data2vec_vision 方法的输出作为结果
return outputs
def build(self, input_shape=None):
# 如果已经构建过模型,则直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果 self.data2vec_vision 方法存在
if getattr(self, "data2vec_vision", None) is not None:
# 在命名空间 self.data2vec_vision.name 下构建模型
with tf.name_scope(self.data2vec_vision.name):
self.data2vec_vision.build(None)
@add_start_docstrings(
"""
Data2VecVision Model transformer with an image classification head on top (a linear layer`
@add_start_docstrings(
"""
Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
the final hidden states of the patch tokens) e.g. for ImageNet.
""",
DATA2VEC_VISION_START_DOCSTRING,
)
class TFData2VecVisionForImageClassification(TFData2VecVisionPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: Data2VecVisionConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=True, name="data2vec_vision")
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def call(
self,
pixel_values: TFModelInputType | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.data2vec_vision(
pixel_values=pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "data2vec_vision", None) is not None:
with tf.name_scope(self.data2vec_vision.name):
self.data2vec_vision.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
class TFData2VecVisionConvModule(keras.layers.Layer):
"""
A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, int]],
padding: str = "valid",
bias: bool = False,
dilation: Union[int, Tuple[int, int]] = 1,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.conv = keras.layers.Conv2D(
filters=out_channels,
kernel_size=kernel_size,
padding=padding,
use_bias=bias,
dilation_rate=dilation,
name="conv",
)
self.bn = keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5)
self.activation = tf.nn.relu
self.in_channels = in_channels
self.out_channels = out_channels
def call(self, input: tf.Tensor) -> tf.Tensor:
output = self.conv(input)
output = self.bn(output)
output = self.activation(output)
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, None, self.in_channels])
if getattr(self, "bn", None) is not None:
with tf.name_scope(self.bn.name):
self.bn.build((None, None, None, self.out_channels))
class TFAdaptiveAvgPool2D(keras.layers.Layer):
def __init__(self, output_dims: Tuple[int, int], input_ordering: str = "NHWC", **kwargs):
super().__init__(**kwargs)
self.output_dims = output_dims
self.input_ordering = input_ordering
if input_ordering not in ("NCHW", "NHWC"):
raise ValueError("Unrecognized input_ordering, should be 'NCHW' or 'NHWC'!")
self.h_axis = input_ordering.index("H")
self.w_axis = input_ordering.index("W")
def call(self, inputs: tf.Tensor):
if self.input_ordering == "NHWC":
input_shape = inputs.shape[1:3]
else:
input_shape = inputs.shape[2:]
if self.output_dims[0] == self.output_dims[1] == 1:
if self.input_ordering == "NHWC":
reduce_dims = [1, 2]
else:
reduce_dims = [2, 3]
return tf.reduce_mean(inputs, axis=reduce_dims, keepdims=True)
elif input_shape[0] % self.output_dims[0] == 0 and input_shape[1] % self.output_dims[1] == 0:
h_resize = int(input_shape[0] // self.output_dims[0])
w_resize = int(input_shape[1] // self.output_dims[1])
return tf.nn.avg_pool2d(
inputs,
ksize=(h_resize, w_resize),
strides=(h_resize, w_resize),
padding="VALID",
data_format=self.input_ordering,
)
else:
h_pooled = self.pseudo_1d_pool(inputs, h_pooling=True)
return self.pseudo_1d_pool(h_pooled, h_pooling=False)
class TFData2VecVisionPyramidPoolingModule(keras.layers.Layer):
"""
Pyramid Pooling Module (PPM) used in PSPNet.
Args:
pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
Module.
channels (int): Channels after modules, before conv_seg.
Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
"""
def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, out_channels: int, **kwargs) -> None:
super().__init__(**kwargs)
self.pool_scales = pool_scales
self.in_channels = in_channels
self.out_channels = out_channels
self.layer_list = []
for idx, pool_scale in enumerate(pool_scales):
pool_scale = pool_scale if isinstance(pool_scale, collections.abc.Iterable) else (pool_scale, pool_scale)
self.layer_list.append(
[
TFAdaptiveAvgPool2D(output_dims=pool_scale),
TFData2VecVisionConvModule(
in_channels=in_channels, out_channels=self.out_channels, kernel_size=1, name=f"{idx}.1"
),
]
)
def call(self, x: tf.Tensor) -> List[tf.Tensor]:
ppm_outs = []
inputs = x
for ppm in self.layer_list:
for layer_module in ppm:
ppm_out = layer_module(x)
x = ppm_out
upsampled_ppm_out = tf.image.resize(ppm_out, size=shape_list(inputs)[1:-1], method="bilinear")
ppm_outs.append(upsampled_ppm_out)
return ppm_outs
def build(self, input_shape=None):
for layer in self.layer_list:
for layer_module in layer:
with tf.name_scope(layer_module.name):
layer_module.build(None)
class TFData2VecVisionUperHead(keras.layers.Layer):
"""
Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
[UPerNet](https://arxiv.org/abs/1807.10221).
Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
"""
def __init__(self, config: Data2VecVisionConfig, **kwargs) -> None:
super().__init__(**kwargs)
self.pool_scales = config.pool_scales
self.in_channels = [config.hidden_size] * 4
self.channels = config.hidden_size
self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
self.psp_modules = TFData2VecVisionPyramidPoolingModule(
self.pool_scales, self.in_channels[-1], self.channels, name="psp_modules"
)
self.bottleneck = TFData2VecVisionConvModule(
self.in_channels[-1] + len(self.pool_scales) * self.channels,
self.channels,
kernel_size=3,
padding="same",
name="bottleneck",
)
self.lateral_convs = []
self.fpn_convs = []
for idx, in_channels in enumerate(self.in_channels[:-1]):
l_conv = TFData2VecVisionConvModule(
in_channels, out_channels=self.channels, kernel_size=1, name=f"lateral_convs.{idx}"
)
fpn_conv = TFData2VecVisionConvModule(
in_channels=self.channels,
out_channels=self.channels,
kernel_size=3,
padding="same",
name=f"fpn_convs.{idx}",
)
self.lateral_convs.append(l_conv)
self.fpn_convs.append(fpn_conv)
self.fpn_bottleneck = TFData2VecVisionConvModule(
in_channels=len(self.in_channels) * self.channels,
out_channels=self.channels,
kernel_size=3,
padding="same",
name="fpn_bottleneck",
)
def psp_forward(self, inputs):
x = inputs[-1]
psp_outs = [x]
psp_outs.extend(self.psp_modules(x))
psp_outs = tf.concat(psp_outs, axis=-1)
output = self.bottleneck(psp_outs)
return output
def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor:
laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]
laterals.append(self.psp_forward(encoder_hidden_states))
used_backbone_levels = len(laterals)
for i in range(used_backbone_levels - 1, 0, -1):
prev_shape = shape_list(laterals[i - 1])[1:-1]
laterals[i - 1] = laterals[i - 1] + tf.image.resize(laterals[i], size=prev_shape, method="bilinear")
fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
fpn_outs.append(laterals[-1])
for i in range(used_backbone_levels - 1, 0, -1):
fpn_outs[i] = tf.image.resize(fpn_outs[i], size=shape_list(fpn_outs[0])[1:-1], method="bilinear")
fpn_outs = tf.concat(fpn_outs, axis=-1)
output = self.fpn_bottleneck(fpn_outs)
output = self.classifier(output)
return output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, None, self.channels])
if getattr(self, "psp_modules", None) is not None:
with tf.name_scope(self.psp_modules.name):
self.psp_modules.build(None)
if getattr(self, "bottleneck", None) is not None:
with tf.name_scope(self.bottleneck.name):
self.bottleneck.build(None)
if getattr(self, "fpn_bottleneck", None) is not None:
with tf.name_scope(self.fpn_bottleneck.name):
self.fpn_bottleneck.build(None)
for layer in self.lateral_convs:
with tf.name_scope(layer.name):
layer.build(None)
for layer in self.fpn_convs:
with tf.name_scope(layer.name):
layer.build(None)
class TFData2VecVisionFCNHead(keras.layers.Layer):
"""
Fully Convolution Networks for Semantic Segmentation. This head is implemented from
[FCNNet](https://arxiv.org/abs/1411.4038).
Args:
config (Data2VecVisionConfig): Configuration.
kernel_size (int): The kernel size for convs in the head. Default: 3.
dilation (int): The dilation rate for convs in the head. Default: 1.
Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
"""
def __init__(
self,
config: Data2VecVisionConfig,
in_index: int = 2,
kernel_size: int = 3,
dilation: Union[int, Tuple[int, int]] = 1,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.in_channels = config.hidden_size
self.channels = config.auxiliary_channels
self.num_convs = config.auxiliary_num_convs
self.concat_input = config.auxiliary_concat_input
self.in_index = in_index
convs = []
convs.append(
TFData2VecVisionConvModule(
in_channels=self.in_channels,
out_channels=self.channels,
kernel_size=kernel_size,
padding="same",
dilation=dilation,
name="convs.0",
)
)
for i in range(self.num_convs - 1):
convs.append(
TFData2VecVisionConvModule(
in_channels=self.channels,
out_channels=self.channels,
kernel_size=kernel_size,
padding="same",
dilation=dilation,
name=f"conv_module_{i+2}",
)
)
if self.num_convs == 0:
self.convs = [tf.identity]
else:
self.convs = convs
if self.concat_input:
self.conv_cat = TFData2VecVisionConvModule(
self.in_channels + self.channels,
out_channels=self.channels,
kernel_size=kernel_size,
padding="same",
name="conv_cat",
)
self.classifier = keras.layers.Conv2D(config.num_labels, kernel_size=1, name="classifier")
def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = encoder_hidden_states[self.in_index]
output = hidden_states
for layer_module in self.convs:
output = layer_module(output)
if self.concat_input:
output = self.conv_cat(tf.concat([hidden_states, output], axis=-1))
output = self.classifier(output)
return output
if self.built:
return
self.built = True
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, None, self.channels])
if getattr(self, "conv_cat", None) is not None:
with tf.name_scope(self.conv_cat.name):
self.conv_cat.build(None)
@add_start_docstrings(
"""
Data2VecVision Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
""",
DATA2VEC_VISION_START_DOCSTRING,
)
class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
def __init__(self, config: Data2VecVisionConfig, *inputs, **kwargs) -> None:
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.data2vec_vision = TFData2VecVisionMainLayer(config, add_pooling_layer=False, name="data2vec_vision")
self.fpn1 = [
keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"),
keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5),
keras.layers.Activation("gelu"),
keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"),
]
self.fpn2 = [
keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn2.0"),
]
self.fpn3 = tf.identity
self.fpn4 = keras.layers.MaxPool2D(pool_size=2, strides=2)
self.decode_head = TFData2VecVisionUperHead(config, name="decode_head")
self.auxiliary_head = (
TFData2VecVisionFCNHead(config, name="auxiliary_head") if config.use_auxiliary_head else None
)
def compute_loss(self, logits, auxiliary_logits, labels):
if len(shape_list(labels)) > 3:
label_interp_shape = shape_list(labels)[1:-1]
else:
label_interp_shape = shape_list(labels)[-2:]
upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear")
if auxiliary_logits is not None:
upsampled_auxiliary_logits = tf.image.resize(auxiliary_logits, size=label_interp_shape, method="bilinear")
loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
def masked_loss(real, pred):
mask = tf.math.logical_not(tf.math.equal(real, self.config.semantic_loss_ignore_index))
loss_ = loss_fct(real, pred)
mask = tf.cast(mask, dtype=loss_.dtype)
loss_ *= mask
reduced_masked_loss = tf.reduce_sum(loss_) / tf.reduce_sum(mask)
return tf.reshape(reduced_masked_loss, (1,))
main_loss = masked_loss(labels, upsampled_logits)
auxiliary_loss = masked_loss(labels, upsampled_auxiliary_logits)
loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss
return loss
@unpack_inputs
@add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
pixel_values: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "data2vec_vision", None) is not None:
with tf.name_scope(self.data2vec_vision.name):
self.data2vec_vision.build(None)
if getattr(self, "decode_head", None) is not None:
with tf.name_scope(self.decode_head.name):
self.decode_head.build(None)
if getattr(self, "auxiliary_head", None) is not None:
with tf.name_scope(self.auxiliary_head.name):
self.auxiliary_head.build(None)
if getattr(self, "fpn1", None) is not None:
with tf.name_scope(self.fpn1[0].name):
self.fpn1[0].build([None, None, None, self.config.hidden_size])
with tf.name_scope(self.fpn1[1].name):
self.fpn1[1].build((None, None, None, self.config.hidden_size))
with tf.name_scope(self.fpn1[3].name):
self.fpn1[3].build([None, None, None, self.config.hidden_size])
if getattr(self, "fpn2", None) is not None:
with tf.name_scope(self.fpn2[0].name):
self.fpn2[0].build([None, None, None, self.config.hidden_size])
.\models\data2vec\__init__.py
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {
"configuration_data2vec_audio": ["DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig"],
"configuration_data2vec_text": [
"DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Data2VecTextConfig",
"Data2VecTextOnnxConfig",
],
"configuration_data2vec_vision": [
"DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Data2VecVisionConfig",
"Data2VecVisionOnnxConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_data2vec_audio"] = [
"DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST",
"Data2VecAudioForAudioFrameClassification",
"Data2VecAudioForCTC",
"Data2VecAudioForSequenceClassification",
"Data2VecAudioForXVector",
"Data2VecAudioModel",
"Data2VecAudioPreTrainedModel",
]
_import_structure["modeling_data2vec_text"] = [
"DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"Data2VecTextForCausalLM",
"Data2VecTextForMaskedLM",
"Data2VecTextForMultipleChoice",
"Data2VecTextForQuestionAnswering",
"Data2VecTextForSequenceClassification",
"Data2VecTextForTokenClassification",
"Data2VecTextModel",
"Data2VecTextPreTrainedModel",
]
_import_structure["modeling_data2vec_vision"] = [
"DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST",
"Data2VecVisionForImageClassification",
"Data2VecVisionForMaskedImageModeling",
"Data2VecVisionForSemanticSegmentation",
"Data2VecVisionModel",
"Data2VecVisionPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig
from .configuration_data2vec_text import (
DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
Data2VecTextConfig,
Data2VecTextOnnxConfig,
)
from .configuration_data2vec_vision import (
DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP,
Data2VecVisionConfig,
Data2VecVisionOnnxConfig,
)
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_data2vec_audio import (
DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST,
Data2VecAudioForAudioFrameClassification,
Data2VecAudioForCTC,
Data2VecAudioForSequenceClassification,
Data2VecAudioForXVector,
Data2VecAudioModel,
Data2VecAudioPreTrainedModel,
)
from .modeling_data2vec_text import (
DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
Data2VecTextForCausalLM,
Data2VecTextForMaskedLM,
Data2VecTextForMultipleChoice,
Data2VecTextForQuestionAnswering,
Data2VecTextForSequenceClassification,
Data2VecTextForTokenClassification,
Data2VecTextModel,
Data2VecTextPreTrainedModel,
)
from .modeling_data2vec_vision import (
DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST,
Data2VecVisionForImageClassification,
Data2VecVisionForMaskedImageModeling,
Data2VecVisionForSemanticSegmentation,
Data2VecVisionModel,
Data2VecVisionPreTrainedModel,
)
if is_tf_available():
from .modeling_tf_data2vec_vision import (
TFData2VecVisionForImageClassification,
TFData2VecVisionForSemanticSegmentation,
TFData2VecVisionModel,
TFData2VecVisionPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deberta\configuration_deberta.py
""" DeBERTa model configuration"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
if TYPE_CHECKING:
from ... import FeatureExtractionMixin, PreTrainedTokenizerBase, TensorType
logger = logging.get_logger(__name__)
DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/config.json",
"microsoft/deberta-large": "https://huggingface.co/microsoft/deberta-large/resolve/main/config.json",
"microsoft/deberta-xlarge": "https://huggingface.co/microsoft/deberta-xlarge/resolve/main/config.json",
"microsoft/deberta-base-mnli": "https://huggingface.co/microsoft/deberta-base-mnli/resolve/main/config.json",
"microsoft/deberta-large-mnli": "https://huggingface.co/microsoft/deberta-large-mnli/resolve/main/config.json",
"microsoft/deberta-xlarge-mnli": "https://huggingface.co/microsoft/deberta-xlarge-mnli/resolve/main/config.json",
}
class DebertaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DebertaModel`] or a [`TFDebertaModel`]. It is
used to instantiate a DeBERTa model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa
[microsoft/deberta-base](https://huggingface.co/microsoft/deberta-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import DebertaConfig, DebertaModel
>>> # Initializing a DeBERTa microsoft/deberta-base style configuration
>>> configuration = DebertaConfig()
>>> # Initializing a model (with random weights) from the microsoft/deberta-base style configuration
>>> model = DebertaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "deberta"
def __init__(
self,
vocab_size=50265,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=0,
initializer_range=0.02,
layer_norm_eps=1e-7,
relative_attention=False,
max_relative_positions=-1,
pad_token_id=0,
position_biased_input=True,
pos_att_type=None,
pooler_dropout=0,
pooler_hidden_act="gelu",
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.relative_attention = relative_attention
self.max_relative_positions = max_relative_positions
self.pad_token_id = pad_token_id
self.position_biased_input = position_biased_input
if isinstance(pos_att_type, str):
pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")]
self.pos_att_type = pos_att_type
self.vocab_size = vocab_size
self.layer_norm_eps = layer_norm_eps
self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size)
self.pooler_dropout = pooler_dropout
self.pooler_hidden_act = pooler_hidden_act
class DebertaOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
if self._config.type_vocab_size > 0:
return OrderedDict(
[("input_ids", dynamic_axis), ("attention_mask", dynamic_axis), ("token_type_ids", dynamic_axis)]
)
else:
return OrderedDict([("input_ids", dynamic_axis), ("attention_mask", dynamic_axis)])
@property
def default_onnx_opset(self) -> int:
return 12
def generate_dummy_inputs(
self,
preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
batch_size: int = -1,
seq_length: int = -1,
num_choices: int = -1,
is_pair: bool = False,
framework: Optional["TensorType"] = None,
num_channels: int = 3,
image_width: int = 40,
image_height: int = 40,
tokenizer: "PreTrainedTokenizerBase" = None,
) -> Mapping[str, Any]:
dummy_inputs = super().generate_dummy_inputs(preprocessor=preprocessor, framework=framework)
if self._config.type_vocab_size == 0 and "token_type_ids" in dummy_inputs:
del dummy_inputs["token_type_ids"]
return dummy_inputs
.\models\deberta\modeling_deberta.py
""" PyTorch DeBERTa 模型。"""
from collections.abc import Sequence
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
MaskedLMOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import softmax_backward_data
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_deberta import DebertaConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DebertaConfig"
_CHECKPOINT_FOR_DOC = "microsoft/deberta-base"
_CHECKPOINT_FOR_MASKED_LM = "lsanochkin/deberta-large-feedback"
_MASKED_LM_EXPECTED_OUTPUT = "' Paris'"
_MASKED_LM_EXPECTED_LOSS = "0.54"
_CHECKPOINT_FOR_QA = "Palak/microsoft_deberta-large_squad"
_QA_EXPECTED_OUTPUT = "' a nice puppet'"
_QA_EXPECTED_LOSS = 0.14
_QA_TARGET_START_INDEX = 12
_QA_TARGET_END_INDEX = 14
DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/deberta-base",
"microsoft/deberta-large",
"microsoft/deberta-xlarge",
"microsoft/deberta-base-mnli",
"microsoft/deberta-large-mnli",
"microsoft/deberta-xlarge-mnli",
]
class ContextPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
self.dropout = StableDropout(config.pooler_dropout)
self.config = config
def forward(self, hidden_states):
context_token = hidden_states[:, 0]
context_token = self.dropout(context_token)
pooled_output = self.dense(context_token)
pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
return pooled_output
@property
def output_dim(self):
return self.config.hidden_size
class XSoftmax(torch.autograd.Function):
"""
优化了内存的 Masked Softmax 实现
"""
@staticmethod
def forward(self, input, mask, dim):
self.dim = dim
rmask = ~(mask.to(torch.bool))
output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min))
output = torch.softmax(output, self.dim)
output.masked_fill_(rmask, 0)
self.save_for_backward(output)
return output
@staticmethod
def backward(self, grad_output):
(output,) = self.saved_tensors
inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output)
return inputGrad, None, None
@staticmethod
def symbolic(g, self, mask, dim):
import torch.onnx.symbolic_helper as sym_help
from torch.onnx.symbolic_opset9 import masked_fill, softmax
mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"])
r_mask = g.op(
"Cast",
g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value),
to_i=sym_help.cast_pytorch_to_onnx["Bool"],
)
output = masked_fill(
g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min))
)
output = softmax(g, output, dim)
return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool)))
class DropoutContext(object):
def __init__(self):
self.dropout = 0
self.mask = None
self.scale = 1
self.reuse_mask = True
def get_mask(input, local_context):
if not isinstance(local_context, DropoutContext):
dropout = local_context
mask = None
else:
dropout = local_context.dropout
dropout *= local_context.scale
mask = local_context.mask if local_context.reuse_mask else None
if dropout > 0 and mask is None:
mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool)
if isinstance(local_context, DropoutContext):
if local_context.mask is None:
local_context.mask = mask
return mask, dropout
class XDropout(torch.autograd.Function):
"""Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
@staticmethod
def forward(ctx, input, local_ctx):
mask, dropout = get_mask(input, local_ctx)
ctx.scale = 1.0 / (1 - dropout)
if dropout > 0:
ctx.save_for_backward(mask)
return input.masked_fill(mask, 0) * ctx.scale
else:
return input
@staticmethod
def backward(ctx, grad_output):
if ctx.scale > 1:
(mask,) = ctx.saved_tensors
return grad_output.masked_fill(mask, 0) * ctx.scale, None
else:
return grad_output, None
@staticmethod
def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value:
from torch.onnx import symbolic_opset12
dropout_p = local_ctx
if isinstance(local_ctx, DropoutContext):
dropout_p = local_ctx.dropout
train = True
return symbolic_opset12.dropout(g, input, dropout_p, train)
class StableDropout(nn.Module):
"""
Optimized dropout module for stabilizing the training
Args:
drop_prob (float): the dropout probabilities
"""
def __init__(self, drop_prob):
super().__init__()
self.drop_prob = drop_prob
self.count = 0
self.context_stack = None
def forward(self, x):
"""
Call the module
Args:
x (`torch.tensor`): The input tensor to apply dropout
"""
if self.training and self.drop_prob > 0:
return XDropout.apply(x, self.get_context())
return x
def clear_context(self):
self.count = 0
self.context_stack = None
def init_context(self, reuse_mask=True, scale=1):
if self.context_stack is None:
self.context_stack = []
self.count = 0
for c in self.context_stack:
c.reuse_mask = reuse_mask
c.scale = scale
def get_context(self):
if self.context_stack is not None:
if self.count >= len(self.context_stack):
self.context_stack.append(DropoutContext())
ctx = self.context_stack[self.count]
ctx.dropout = self.drop_prob
self.count += 1
return ctx
else:
return self.drop_prob
class DebertaIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class DebertaOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.config = config
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class DebertaLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.attention = DebertaAttention(config)
self.intermediate = DebertaIntermediate(config)
self.output = DebertaOutput(config)
def forward(
self,
hidden_states,
attention_mask,
query_states=None,
relative_pos=None,
rel_embeddings=None,
output_attentions=False,
):
attention_output = self.attention(
hidden_states,
attention_mask,
output_attentions=output_attentions,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
)
if output_attentions:
attention_output, att_matrix = attention_output
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
if output_attentions:
return (layer_output, att_matrix)
else:
return layer_output
class DebertaEncoder(nn.Module):
"""Modified BertEncoder with relative position bias support"""
def __init__(self, config):
super().__init__()
self.layer = nn.ModuleList([DebertaLayer(config) for _ in range(config.num_hidden_layers)])
self.relative_attention = getattr(config, "relative_attention", False)
if self.relative_attention:
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size)
self.gradient_checkpointing = False
def get_rel_embedding(self):
rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
return rel_embeddings
def get_attention_mask(self, attention_mask):
if attention_mask.dim() <= 2:
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
elif attention_mask.dim() == 3:
attention_mask = attention_mask.unsqueeze(1)
return attention_mask
def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
if self.relative_attention and relative_pos is None:
q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
relative_pos = build_relative_position(q, hidden_states.size(-2), hidden_states.device)
return relative_pos
def forward(
self,
hidden_states,
attention_mask,
output_hidden_states=True,
output_attentions=False,
query_states=None,
relative_pos=None,
return_dict=True,
):
attention_mask = self.get_attention_mask(attention_mask)
relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[0]
else:
next_kv = hidden_states
rel_embeddings = self.get_rel_embedding()
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
hidden_states = self._gradient_checkpointing_func(
layer_module.__call__,
next_kv,
attention_mask,
query_states,
relative_pos,
rel_embeddings,
output_attentions,
)
else:
hidden_states = layer_module(
next_kv,
attention_mask,
query_states=query_states,
relative_pos=relative_pos,
rel_embeddings=rel_embeddings,
output_attentions=output_attentions,
)
if output_attentions:
hidden_states, att_m = hidden_states
if query_states is not None:
query_states = hidden_states
if isinstance(hidden_states, Sequence):
next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None
else:
next_kv = hidden_states
if output_attentions:
all_attentions = all_attentions + (att_m,)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
q_ids = torch.arange(query_size, dtype=torch.long, device=device)
k_ids = torch.arange(key_size, dtype=torch.long, device=device)
rel_pos_ids = q_ids[:, None] - k_ids.view(1, -1).repeat(query_size, 1)
rel_pos_ids = rel_pos_ids[:query_size, :]
rel_pos_ids = rel_pos_ids.unsqueeze(0)
return rel_pos_ids
@torch.jit.script
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
@torch.jit.script
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
@torch.jit.script
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
class DisentangledSelfAttention(nn.Module):
"""
Disentangled self-attention module
Parameters:
config (`str`):
A model config class instance with the configuration to build a new model. The schema is similar to
*BertConfig*, for more details, please refer [`DebertaConfig`]
"""
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.in_proj = nn.Linear(config.hidden_size, self.all_head_size * 3, bias=False)
self.q_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
self.v_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []
self.relative_attention = getattr(config, "relative_attention", False)
self.talking_head = getattr(config, "talking_head", False)
if self.talking_head:
self.head_logits_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)
self.head_weights_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)
if self.relative_attention:
self.max_relative_positions = getattr(config, "max_relative_positions", -1)
if self.max_relative_positions < 1:
self.max_relative_positions = config.max_position_embeddings
self.pos_dropout = StableDropout(config.hidden_dropout_prob)
if "c2p" in self.pos_att_type:
self.pos_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
if "p2c" in self.pos_att_type:
self.pos_q_proj = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = StableDropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states,
attention_mask,
output_attentions=False,
query_states=None,
relative_pos=None,
rel_embeddings=None,
def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
if relative_pos is None:
q = query_layer.size(-2)
relative_pos = build_relative_position(q, key_layer.size(-2), query_layer.device)
if relative_pos.dim() == 2:
relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
elif relative_pos.dim() == 3:
relative_pos = relative_pos.unsqueeze(1)
elif relative_pos.dim() != 4:
raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions)
relative_pos = relative_pos.long().to(query_layer.device)
rel_embeddings = rel_embeddings[
self.max_relative_positions - att_span : self.max_relative_positions + att_span, :
].unsqueeze(0)
score = 0
if "c2p" in self.pos_att_type:
pos_key_layer = self.pos_proj(rel_embeddings)
pos_key_layer = self.transpose_for_scores(pos_key_layer)
c2p_att = torch.matmul(query_layer, pos_key_layer.transpose(-1, -2))
c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_dynamic_expand(c2p_pos, query_layer, relative_pos))
score += c2p_att
if "p2c" in self.pos_att_type:
pos_query_layer = self.pos_q_proj(rel_embeddings)
pos_query_layer = self.transpose_for_scores(pos_query_layer)
pos_query_layer /= torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor)
if query_layer.size(-2) != key_layer.size(-2):
r_pos = build_relative_position(key_layer.size(-2), key_layer.size(-2), query_layer.device)
else:
r_pos = relative_pos
p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1)
p2c_att = torch.matmul(key_layer, pos_query_layer.transpose(-1, -2).to(dtype=key_layer.dtype))
p2c_att = torch.gather(
p2c_att, dim=-1, index=p2c_dynamic_expand(p2c_pos, query_layer, key_layer)
).transpose(-1, -2)
if query_layer.size(-2) != key_layer.size(-2):
pos_index = relative_pos[:, :, :, 0].unsqueeze(-1)
p2c_att = torch.gather(p2c_att, dim=-2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer))
score += p2c_att
return score
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
pad_token_id = getattr(config, "pad_token_id", 0)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id)
self.position_biased_input = getattr(config, "position_biased_input", True)
if not self.position_biased_input:
self.position_embeddings = None
else:
self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)
if config.type_vocab_size > 0:
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)
if self.embedding_size != config.hidden_size:
self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.config = config
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
if self.position_embeddings is not None:
position_embeddings = self.position_embeddings(position_ids.long())
else:
position_embeddings = torch.zeros_like(inputs_embeds)
embeddings = inputs_embeds
if self.position_biased_input:
embeddings += position_embeddings
if self.config.type_vocab_size > 0:
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings += token_type_embeddings
if self.embedding_size != self.config.hidden_size:
embeddings = self.embed_proj(embeddings)
embeddings = self.LayerNorm(embeddings)
if mask is not None:
if mask.dim() != embeddings.dim():
if mask.dim() == 4:
mask = mask.squeeze(1).squeeze(1)
mask = mask.unsqueeze(2)
mask = mask.to(embeddings.dtype)
embeddings = embeddings * mask
embeddings = self.dropout(embeddings)
return embeddings
class DebertaPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DebertaConfig
base_model_prefix = "deberta"
_keys_to_ignore_on_load_unexpected = ["position_embeddings"]
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights."""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
这段代码定义了一个抽象类 `DebertaPreTrainedModel`,用于处理权重初始化和预训练模型的下载和加载。注释详细解释了类的各个部分和方法的作用。
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
output_attentions (`bool`, *optional*):
output_hidden_states (`bool`, *optional*):
return_dict (`bool`, *optional*):
"""
DeBERTa 模型的基础类,输出原始的隐藏状态,没有特定的输出头部。
Args:
config (DebertaConfig): 包含模型配置的对象实例
Attributes:
embeddings (DebertaEmbeddings): DeBERTa 模型的嵌入层
encoder (DebertaEncoder): DeBERTa 模型的编码器
z_steps (int): 用于某些特定功能的步骤计数
config (DebertaConfig): 模型的配置对象
Raises:
NotImplementedError: 当尝试调用未实现的修剪功能时抛出异常
Methods:
get_input_embeddings: 返回模型的输入嵌入层
set_input_embeddings: 设置模型的输入嵌入层
_prune_heads: 修剪模型的头部,但此功能在 DeBERTa 模型中尚未实现
forward: DeBERTa 模型的前向传播方法,接受多个输入参数和配置选项
"""
@add_start_docstrings(
"The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.",
DEBERTA_START_DOCSTRING,
)
class DebertaModel(DebertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.embeddings = DebertaEmbeddings(config)
self.encoder = DebertaEncoder(config)
self.z_steps = 0
self.config = config
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings.word_embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError("The prune function is not implemented in DeBERTa model.")
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
embedding_output = self.embeddings(
input_ids=input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
mask=attention_mask,
inputs_embeds=inputs_embeds,
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask,
output_hidden_states=True,
output_attentions=output_attentions,
return_dict=return_dict,
)
encoded_layers = encoder_outputs[1]
if self.z_steps > 1:
hidden_states = encoded_layers[-2]
layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
query_states = encoded_layers[-1]
rel_embeddings = self.encoder.get_rel_embedding()
attention_mask = self.encoder.get_attention_mask(attention_mask)
rel_pos = self.encoder.get_rel_pos(embedding_output)
for layer in layers[1:]:
query_states = layer(
hidden_states,
attention_mask,
output_attentions=False,
query_states=query_states,
relative_pos=rel_pos,
rel_embeddings=rel_embeddings,
)
encoded_layers.append(query_states)
sequence_output = encoded_layers[-1]
if not return_dict:
return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2):]
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
attentions=encoder_outputs.attentions,
)
@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING)
class DebertaForMaskedLM(DebertaPreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
self.deberta = DebertaModel(config)
self.cls = DebertaOnlyMLMHead(config)
self.post_init()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_MASKED_LM,
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
mask="[MASK]",
expected_output=_MASKED_LM_EXPECTED_OUTPUT,
expected_loss=_MASKED_LM_EXPECTED_LOSS,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class DebertaPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.dense = nn.Linear(config.hidden_size, self.embedding_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class DebertaLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = DebertaPredictionHeadTransform(config)
self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
class DebertaOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = DebertaLMPredictionHead(config)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
@add_start_docstrings(
"""
DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
DEBERTA_START_DOCSTRING,
)
class DebertaForSequenceClassification(DebertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
num_labels = getattr(config, "num_labels", 2)
self.num_labels = num_labels
self.deberta = DebertaModel(config)
self.pooler = ContextPooler(config)
output_dim = self.pooler.output_dim
self.classifier = nn.Linear(output_dim, num_labels)
drop_out = getattr(config, "cls_dropout", None)
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.post_init()
def get_input_embeddings(self):
return self.deberta.get_input_embeddings()
def set_input_embeddings(self, new_embeddings):
self.deberta.set_input_embeddings(new_embeddings)
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
DEBERTA_START_DOCSTRING,
)
class DebertaForTokenClassification(DebertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.deberta = DebertaModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.deberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
)
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
DEBERTA_START_DOCSTRING,
注释:
# 将隐藏状态输出作为基础层,计算“span起始logits”和“span结束logits”的上层层次。
# DEBERTA_START_DOCSTRING是预定义的常量或字符串,可能用于文档字符串或注释的格式化。
)
class DebertaForQuestionAnswering(DebertaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.deberta = DebertaModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_QA,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_QA_EXPECTED_OUTPUT,
expected_loss=_QA_EXPECTED_LOSS,
qa_target_start_index=_QA_TARGET_START_INDEX,
qa_target_end_index=_QA_TARGET_END_INDEX,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None, # 输入的token ID序列,可选的Tensor类型
attention_mask: Optional[torch.Tensor] = None, # 注意力掩码,指示模型在计算中应忽略的位置,可选的Tensor类型
token_type_ids: Optional[torch.Tensor] = None, # token类型ID,用于区分两个句子或段落,可选的Tensor类型
position_ids: Optional[torch.Tensor] = None, # 位置ID,标识输入token的位置信息,可选的Tensor类型
inputs_embeds: Optional[torch.Tensor] = None, # 嵌入输入,替代输入IDs的嵌入表示,可选的Tensor类型
start_positions: Optional[torch.Tensor] = None, # 答案的起始位置索引,用于训练和评估,可选的Tensor类型
end_positions: Optional[torch.Tensor] = None, # 答案的结束位置索引,用于训练和评估,可选的Tensor类型
output_attentions: Optional[bool] = None, # 是否返回注意力权重,可选的布尔类型
output_hidden_states: Optional[bool] = None, # 是否返回隐藏状态,可选的布尔类型
return_dict: Optional[bool] = None, # 是否返回字典形式的输出,可选的布尔类型
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# 确定是否要返回字典格式的输出,根据配置或参数设定
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 使用 DeBERTa 模型进行前向推断
outputs = self.deberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 获取模型输出中的序列输出
sequence_output = outputs[0]
# 将序列输出送入问答输出层获取 logits
logits = self.qa_outputs(sequence_output)
# 将 logits 拆分为起始和结束 logits
start_logits, end_logits = logits.split(1, dim=-1)
# 去除多余的维度并保持连续性
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
# 如果提供了起始和结束位置的标签,则计算损失
if start_positions is not None and end_positions is not None:
# 如果在多 GPU 上运行,扩展维度以匹配模型输出
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# 忽略超出模型输入范围的位置
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
# 定义交叉熵损失函数,并计算起始和结束位置的损失
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
# 如果不需要返回字典格式的输出,则按元组方式返回结果
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
# 如果需要返回字典格式的输出,则构建 QuestionAnsweringModelOutput 对象并返回
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)