Transformers 源码解析(九十四)
.\models\rembert\modeling_tf_rembert.py
""" TF 2.0 RemBERT model."""
from __future__ import annotations
import math
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import (
TFBaseModelOutputWithPastAndCrossAttentions,
TFBaseModelOutputWithPoolingAndCrossAttentions,
TFCausalLMOutputWithCrossAttentions,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMaskedLanguageModelingLoss,
TFModelInputType,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_rembert import RemBertConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "RemBertConfig"
TF_REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/rembert",
]
class TFRemBertEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.input_embedding_size = config.input_embedding_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
self.weight = self.add_weight(
name="weight",
shape=[self.config.vocab_size, self.input_embedding_size],
initializer=get_initializer(self.initializer_range),
)
self.token_type_embeddings = self.add_weight(
name="embeddings",
shape=[self.config.type_vocab_size, self.input_embedding_size],
initializer=get_initializer(self.initializer_range),
)
self.position_embeddings = self.add_weight(
name="embeddings",
shape=[self.max_position_embeddings, self.input_embedding_size],
initializer=get_initializer(self.initializer_range),
)
if self.built:
return
self.built = True
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.input_embedding_size])
def call(
self,
input_ids: tf.Tensor = None,
position_ids: tf.Tensor = None,
token_type_ids: tf.Tensor = None,
inputs_embeds: tf.Tensor = None,
past_key_values_length=0,
training: bool = False,
) -> tf.Tensor:
"""
Applies embedding based on inputs tensor.
Returns:
final_embeddings (`tf.Tensor`): output embedding tensor.
"""
assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
check_embeddings_within_bounds(input_ids, self.config.vocab_size)
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
input_shape = shape_list(inputs_embeds)[:-1]
if token_type_ids is None:
token_type_ids = tf.fill(dims=input_shape, value=0)
if position_ids is None:
position_ids = tf.expand_dims(
tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0
)
position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
final_embeddings = inputs_embeds + position_embeds + token_type_embeds
final_embeddings = self.LayerNorm(inputs=final_embeddings)
final_embeddings = self.dropout(inputs=final_embeddings, training=training)
return final_embeddings
class TFRemBertSelfAttention(keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs)
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number "
f"of attention heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(tensor, perm=[0, 2, 1, 3])
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
):
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.config.hidden_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.config.hidden_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.config.hidden_size])
class TFRemBertSelfOutput(keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRemBertAttention(keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs)
self.self_attention = TFRemBertSelfAttention(config, name="self")
self.dense_output = TFRemBertSelfOutput(config, name="output")
def prune_heads(self, heads):
raise NotImplementedError
def call(
self,
input_tensor: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_value: Tuple[tf.Tensor],
output_attentions: bool,
training: bool = False,
) -> Tuple[tf.Tensor]:
self_outputs = self.self_attention(
hidden_states=input_tensor,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = self.dense_output(
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
)
outputs = (attention_output,) + self_outputs[1:]
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "self_attention", None) is not None:
with tf.name_scope(self.self_attention.name):
self.self_attention.build(None)
if getattr(self, "dense_output", None) is not None:
with tf.name_scope(self.dense_output.name):
self.dense_output.build(None)
class TFRemBertIntermediate(keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRemBertOutput(keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.dropout(inputs=hidden_states, training=training)
hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
return hidden_states
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.intermediate_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFRemBertLayer(keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFRemBertAttention(config, name="attention")
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
self.crossattention = TFRemBertAttention(config, name="crossattention")
self.intermediate = TFRemBertIntermediate(config, name="intermediate")
self.bert_output = TFRemBertOutput(config, name="output")
) -> Tuple[tf.Tensor]:
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
input_tensor=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=self_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = self_attention_outputs[0]
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
input_tensor=attention_output,
attention_mask=attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
training=training,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
intermediate_output = self.intermediate(hidden_states=attention_output)
layer_output = self.bert_output(
hidden_states=intermediate_output, input_tensor=attention_output, training=training
)
outputs = (layer_output,) + outputs
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "attention", None) is not None:
with tf.name_scope(self.attention.name):
self.attention.build(None)
if getattr(self, "intermediate", None) is not None:
with tf.name_scope(self.intermediate.name):
self.intermediate.build(None)
if getattr(self, "bert_output", None) is not None:
with tf.name_scope(self.bert_output.name):
self.bert_output.build(None)
if getattr(self, "crossattention", None) is not None:
with tf.name_scope(self.crossattention.name):
self.crossattention.build(None)
class TFRemBertEncoder(keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_hidden_mapping_in = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="embedding_hidden_mapping_in",
)
self.layer = [TFRemBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor,
head_mask: tf.Tensor,
encoder_hidden_states: tf.Tensor,
encoder_attention_mask: tf.Tensor,
past_key_values: Tuple[Tuple[tf.Tensor]],
use_cache: bool,
output_attentions: bool,
output_hidden_states: bool,
return_dict: bool,
training: bool = False,
) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
hidden_states = self.embedding_hidden_mapping_in(inputs=hidden_states)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
next_decoder_cache = () if use_cache else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
past_key_value = past_key_values[i] if past_key_values is not None else None
layer_outputs = layer_module(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask[i],
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
training=training,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if self.config.add_cross_attention and encoder_hidden_states is not None:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
)
return TFBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_attentions,
cross_attentions=all_cross_attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedding_hidden_mapping_in", None) is not None:
with tf.name_scope(self.embedding_hidden_mapping_in.name):
self.embedding_hidden_mapping_in.build([None, None, self.config.input_embedding_size])
if getattr(self, "layer", None) is not None:
for layer in self.layer:
with tf.name_scope(layer.name):
layer.build(None)
class TFRemBertPooler(keras.layers.Layer):
def __init__(self, config: RemBertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="dense",
)
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(inputs=first_token_tensor)
return pooled_output
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
class TFRemBertLMPredictionHead(keras.layers.Layer):
def __init__(self, config: RemBertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
self.initializer_range = config.initializer_range
self.output_embedding_size = config.output_embedding_size
self.dense = keras.layers.Dense(
config.output_embedding_size, kernel_initializer=get_initializer(self.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.activation = get_tf_activation(config.hidden_act)
else:
self.activation = config.hidden_act
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def build(self, input_shape=None):
self.decoder = self.add_weight(
name="decoder/weight",
shape=[self.config.vocab_size, self.output_embedding_size],
initializer=get_initializer(self.initializer_range),
)
self.decoder_bias = self.add_weight(
shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
)
if self.built:
return
self.built = True
if getattr(self, "dense", None) is not None:
with tf.name_scope(self.dense.name):
self.dense.build([None, None, self.config.hidden_size])
if getattr(self, "LayerNorm", None) is not None:
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, self.config.output_embedding_size])
def get_output_embeddings(self) -> keras.layers.Layer:
return self
def set_output_embeddings(self, value):
self.decoder = value
self.decoder.vocab_size = shape_list(value)[0]
def get_bias(self) -> Dict[str, tf.Variable]:
return {"decoder_bias": self.decoder_bias}
def set_bias(self, value: tf.Variable):
self.decoder_bias = value["decoder_bias"]
self.config.vocab_size = shape_list(value["decoder_bias"])[0]
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
hidden_states = self.dense(inputs=hidden_states)
hidden_states = self.activation(hidden_states)
seq_length = shape_list(tensor=hidden_states)[1]
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.output_embedding_size])
hidden_states = self.LayerNorm(hidden_states)
hidden_states = tf.matmul(a=hidden_states, b=self.decoder, transpose_b=True)
hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias)
return hidden_states
class TFRemBertMLMHead(keras.layers.Layer):
def __init__(self, config: RemBertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.predictions = TFRemBertLMPredictionHead(config, input_embeddings, name="predictions")
def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
prediction_scores = self.predictions(hidden_states=sequence_output)
return prediction_scores
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "predictions", None) is not None:
with tf.name_scope(self.predictions.name):
self.predictions.build(None)
@keras_serializable
class TFRemBertMainLayer(keras.layers.Layer):
config_class = RemBertConfig
def __init__(self, config: RemBertConfig, add_pooling_layer: bool = True, **kwargs):
super().__init__(**kwargs)
self.config = config
self.is_decoder = config.is_decoder
self.embeddings = TFRemBertEmbeddings(config, name="embeddings")
self.encoder = TFRemBertEncoder(config, name="encoder")
self.pooler = TFRemBertPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
self.embeddings.weight = value
self.embeddings.vocab_size = shape_list(value)[0]
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
raise NotImplementedError
@unpack_inputs
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
pass
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embeddings", None) is not None:
with tf.name_scope(self.embeddings.name):
self.embeddings.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFRemBertPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = RemBertConfig
base_model_prefix = "rembert"
REMBERT_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
<Tip>
TensorFlow models and layers in `transformers` accept two formats as input:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
positional argument:
- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Note that when creating models and layers with
[subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
about any of this, as you can just pass inputs like you would to any other Python function!
</Tip>
Args:
config ([`RemBertConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
REMBERT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare RemBERT Model transformer outputing raw hidden-states without any specific head on top.",
REMBERT_START_DOCSTRING,
)
class TFRemBertModel(TFRemBertPreTrainedModel):
pass
def __init__(self, config: RemBertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.rembert = TFRemBertMainLayer(config, name="rembert")
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: Optional[bool] = False,
) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]:
r"""
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*, defaults to `True`):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`). Set to `False` during training, `True` during generation
"""
outputs = self.rembert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
@add_start_docstrings("""RemBERT Model with a `language modeling` head on top.""", REMBERT_START_DOCSTRING)
class TFRemBertForMaskedLM(TFRemBertPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config: RemBertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if config.is_decoder:
logger.warning(
"If you want to use `TFRemBertForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.rembert = TFRemBertMainLayer(config, name="rembert", add_pooling_layer=False)
self.mlm = TFRemBertMLMHead(config, input_embeddings=self.rembert.embeddings, name="mlm___cls")
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="google/rembert",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
outputs = self.rembert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output=sequence_output, training=training)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING
)
class TFRemBertForCausalLM(TFRemBertPreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config: RemBertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
if not config.is_decoder:
logger.warning("If you want to use `TFRemBertForCausalLM` as a standalone, add `is_decoder=True.`")
self.rembert = TFRemBertMainLayer(config, name="rembert", add_pooling_layer=False)
self.mlm = TFRemBertMLMHead(config, input_embeddings=self.rembert.embeddings, name="mlm___cls")
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
if attention_mask is None:
attention_mask = tf.ones(input_shape)
if past_key_values is not None:
input_ids = input_ids[:, -1:]
return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
@unpack_inputs
@add_code_sample_docstrings(
checkpoint="google/rembert",
output_type=TFCausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "mlm", None) is not None:
with tf.name_scope(self.mlm.name):
self.mlm.build(None)
@add_start_docstrings(
"""
RemBERT Model transformer with a sequence classification/regression head on top e.g., for GLUE tasks.
"""
REMBERT_START_DOCSTRING,
这样的注释能够准确描述每行代码的功能和作用,而不会过多或者过少地概括其含义。
class TFRemBertForSequenceClassification(TFRemBertPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: RemBertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.rembert = TFRemBertMainLayer(config, name="rembert")
self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="google/rembert",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs = self.rembert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(inputs=pooled_output, training=training)
logits = self.classifier(inputs=pooled_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
RemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
REMBERT_START_DOCSTRING,
)
class TFRemBertForMultipleChoice(TFRemBertPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config: RemBertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.rembert = TFRemBertMainLayer(config, name="rembert")
self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint="google/rembert",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
"""
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None
flat_attention_mask = (
tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None
)
flat_token_type_ids = (
tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None
)
flat_position_ids = (
tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None
)
flat_inputs_embeds = (
tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
outputs = self.rembert(
input_ids=flat_input_ids,
attention_mask=flat_attention_mask,
token_type_ids=flat_token_type_ids,
position_ids=flat_position_ids,
head_mask=head_mask,
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(inputs=pooled_output, training=training)
logits = self.classifier(inputs=pooled_output)
reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices))
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
"""
RemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
REMBERT_START_DOCSTRING,
)
class TFRemBertForTokenClassification(TFRemBertPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config: RemBertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.rembert = TFRemBertMainLayer(config, name="rembert", add_pooling_layer=False)
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="google/rembert",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
r"""
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
outputs = self.rembert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(inputs=sequence_output, training=training)
logits = self.classifier(inputs=sequence_output)
loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "classifier", None) is not None:
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.config.hidden_size])
"""
RemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
"""
@add_start_docstrings(
"""
RemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
REMBERT_START_DOCSTRING,
)
class TFRemBertForQuestionAnswering(TFRemBertPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config: RemBertConfig, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.rembert = TFRemBertMainLayer(config, add_pooling_layer=False, name="rembert")
self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
@unpack_inputs
@add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint="google/rembert",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
token_type_ids: np.ndarray | tf.Tensor | None = None,
position_ids: np.ndarray | tf.Tensor | None = None,
head_mask: np.ndarray | tf.Tensor | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
start_positions: np.ndarray | tf.Tensor | None = None,
end_positions: np.ndarray | tf.Tensor | None = None,
training: Optional[bool] = False,
"""
Forward pass of the model. This function handles various inputs for question-answering tasks
and returns model outputs such as start and end logits.
Args:
input_ids: Indices of input tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding tokens.
token_type_ids: Segment token indices to indicate first and second portions of the inputs.
position_ids: Indices of positions of each input sequence token in the position embeddings.
head_mask: Mask to nullify selected heads of the self-attention modules.
inputs_embeds: Optionally provided embeddings instead of input_ids.
output_attentions: Whether to return attentions weights (a.k.a. self-weights).
output_hidden_states: Whether to return all hidden-states.
return_dict: Whether to return a dictionary instead of a tuple.
start_positions: Ground truth for the start position of the span in the input.
end_positions: Ground truth for the end position of the span in the input.
training: Whether the model is in training mode.
Returns:
Depending on `return_dict`, either a tuple or a dictionary containing model outputs such as
start and end logits for span prediction.
"""
) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
r"""
start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
outputs = self.rembert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(inputs=sequence_output)
start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
start_logits = tf.squeeze(input=start_logits, axis=-1)
end_logits = tf.squeeze(input=end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "rembert", None) is not None:
with tf.name_scope(self.rembert.name):
self.rembert.build(None)
if getattr(self, "qa_outputs", None) is not None:
with tf.name_scope(self.qa_outputs.name):
self.qa_outputs.build([None, None, self.config.hidden_size])
.\models\rembert\tokenization_rembert.py
"""Tokenization classes for RemBERT."""
import os
from shutil import copyfile
from typing import List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/rembert": "https://huggingface.co/google/rembert/resolve/main/sentencepiece.model",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/rembert": 256,
}
class RemBertTokenizer(PreTrainedTokenizer):
"""
Construct a RemBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
def __init__(
self,
vocab_file,
*,
tokenizer_file=None,
do_lower_case=False,
bos_token="[CLS]",
eos_token="[SEP]",
sep_token="[SEP]",
cls_token="[CLS]",
pad_token="[PAD]",
unk_token="[UNK]",
mask_token="[MASK]",
**kwargs
):
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
unk_token=unk_token,
mask_token=mask_token,
**kwargs
)
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
bos_token (`str`, *optional*, defaults to `"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"[SEP]"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
Attributes:
sp_model (`SentencePieceProcessor`):
The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
"""
# 定义类变量,包含预定义的文件名字典
vocab_files_names = VOCAB_FILES_NAMES
# 包含预训练模型的词汇文件映射
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# 包含预训练位置嵌入大小的字典
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# 初始化方法,接收多个参数,包括词汇文件和特殊标记的设置
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=True,
keep_accents=True,
bos_token="[CLS]",
eos_token="[SEP]",
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
**kwargs,
# 如果 mask_token 是字符串类型,则设置 lstrip=True 和 rstrip=False 的 AddedToken 对象
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# 初始化模型的参数
self.do_lower_case = do_lower_case # 是否将输入文本转换为小写
self.remove_space = remove_space # 是否移除输入文本中的空格
self.keep_accents = keep_accents # 是否保留输入文本中的重音符号
self.vocab_file = vocab_file # 词汇表文件的路径
# 使用 SentencePieceProcessor 初始化 self.sp_model 对象,并加载词汇表文件
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
# 调用父类的初始化方法,设置模型的基本参数
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
**kwargs,
)
@property
def vocab_size(self):
# 返回词汇表的大小,即 self.sp_model 中词汇的数量
return len(self.sp_model)
def get_vocab(self):
# 构建并返回词汇表,包括从 id 到 token 的映射和额外添加的 tokens 编码器
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def __getstate__(self):
# 返回对象的状态字典,不包括 sp_model,用于序列化对象
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
# 从状态字典中恢复对象的状态,重新加载 sp_model
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False):
"""Tokenize a string."""
# 使用 sp_model 对文本进行分词,返回分词后的结果 pieces
pieces = self.sp_model.EncodeAsPieces(text)
return pieces
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# 根据 token 转换成对应的 id,使用 sp_model 的 PieceToId 方法
return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
# 根据 id 转换成对应的 token,使用 sp_model 的 IdToPiece 方法
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
# 将 tokens 转换成字符串,使用 sp_model 的 decode_pieces 方法
out_string = self.sp_model.decode_pieces(tokens)
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
):
# 构建包含特殊 token 的输入,这里不包含实现细节,只是声明方法签名
pass
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs from two sequences. Token type IDs are binary masks identifying the type of each token
in the sequence: 0 for the first sequence, 1 for the second sequence.
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs where each ID corresponds to the type of its respective token.
"""
if token_ids_1 is None:
# If there's no second sequence, all tokens belong to the first sequence (type ID 0)
return [0] * len(token_ids_0)
# Create token type IDs for a pair of sequences
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""
# Define the separation token ID and the classification token ID
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If there is no second sequence provided, return a mask with 0s for only the first sequence
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Otherwise, return a mask with 0s for the first sequence and 1s for the second sequence
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
# Check if the save_directory exists; if not, log an error and return None
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
# Define the output vocabulary file path
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
# Copy the vocabulary file to the output directory if it's different from the current location
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
# If the vocabulary file doesn't exist, write the serialized sp_model to the output file
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
# Return the path to the saved vocabulary file
return (out_vocab_file,)
.\models\rembert\tokenization_rembert_fast.py
""" Tokenization classes for RemBERT model."""
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
if is_sentencepiece_available():
from .tokenization_rembert import RemBertTokenizer
else:
RemBertTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/rembert": "https://huggingface.co/google/rembert/resolve/main/sentencepiece.model",
},
"tokenizer_file": {
"google/rembert": "https://huggingface.co/google/rembert/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/rembert": 256,
}
SPIECE_UNDERLINE = "▁"
class RemBertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" RemBert tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods
"""
Args:
vocab_file (`str`):
SentencePiece 文件的路径,通常以 *.spm* 扩展名,包含实例化分词器所需的词汇表。
do_lower_case (`bool`, *optional*, defaults to `True`):
在分词时是否将输入转换为小写。
remove_space (`bool`, *optional*, defaults to `True`):
在分词时是否去除文本中的空格(去除字符串前后多余的空格)。
keep_accents (`bool`, *optional*, defaults to `False`):
在分词时是否保留重音符号。
bos_token (`str`, *optional*, defaults to `"[CLS]"`):
序列的开始标记,用于预训练。在构建序列时,实际使用的是 `cls_token`。
eos_token (`str`, *optional*, defaults to `"[SEP]"`):
序列的结束标记。在构建序列时,实际使用的是 `sep_token`。
unk_token (`str`, *optional*, defaults to `"<unk>"`):
未知标记,表示词汇表中不存在的标记。
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
分隔符标记,在构建多个序列组成的序列时使用,例如序列分类或问答任务中的文本和问题之间的分隔。
同时也是构建特殊序列的最后一个标记。
pad_token (`str`, *optional*, defaults to `"<pad>"`):
填充标记,在将不同长度的序列进行批处理时使用。
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
分类器标记,在进行序列分类时使用,是构建特殊序列的第一个标记。
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
掩码标记,在掩码语言建模训练中使用,模型尝试预测的标记。
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = RemBertTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
remove_space=True,
keep_accents=False,
bos_token="[CLS]",
eos_token="[SEP]",
unk_token="<unk>",
sep_token="[SEP]",
pad_token="<pad>",
cls_token="[CLS]",
mask_token="[MASK]",
**kwargs,
):
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
remove_space=remove_space,
keep_accents=keep_accents,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
**kwargs,
)
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
@property
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
从一个或一对序列构建模型输入,用于序列分类任务,通过连接和添加特殊标记。RemBERT 序列的格式如下:
- 单个序列:`[CLS] X [SEP]`
- 序列对:`[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
将添加特殊标记的 ID 列表
token_ids_1 (`List[int]`, *optional*, 默认为 `None`):
第二个序列的 ID 列表(对序列任务)
Returns:
`List[int]`: 包含适当特殊标记的输入 ID 列表。
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
获取特殊标记的掩码,用于标识哪些位置是特殊标记的位置。
Args:
token_ids_0 (`List[int]`):
第一个序列的 ID 列表
token_ids_1 (`List[int]`, *optional*, 默认为 `None`):
第二个序列的 ID 列表(对序列任务)
already_has_special_tokens (`bool`, 默认为 `False`):
是否已经包含了特殊标记,如果是则为 True
Returns:
`List[int]`: 表示特殊标记位置的掩码列表。
"""
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*, defaults to `None`):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
.\models\rembert\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
"configuration_rembert": ["REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RemBertConfig", "RemBertOnnxConfig"]
}
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_rembert"] = ["RemBertTokenizer"]
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_rembert_fast"] = ["RemBertTokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_rembert"] = [
"REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"RemBertForCausalLM",
"RemBertForMaskedLM",
"RemBertForMultipleChoice",
"RemBertForQuestionAnswering",
"RemBertForSequenceClassification",
"RemBertForTokenClassification",
"RemBertLayer",
"RemBertModel",
"RemBertPreTrainedModel",
"load_tf_weights_in_rembert",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_rembert"] = [
"TF_REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFRemBertForCausalLM",
"TFRemBertForMaskedLM",
"TFRemBertForMultipleChoice",
"TFRemBertForQuestionAnswering",
"TFRemBertForSequenceClassification",
"TFRemBertForTokenClassification",
"TFRemBertLayer",
"TFRemBertModel",
"TFRemBertPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_rembert import REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RemBertConfig, RemBertOnnxConfig
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_rembert import RemBertTokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_rembert_fast import RemBertTokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_rembert import (
REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
RemBertForCausalLM,
RemBertForMaskedLM,
RemBertForMultipleChoice,
RemBertForQuestionAnswering,
RemBertForSequenceClassification,
RemBertForTokenClassification,
RemBertLayer,
RemBertModel,
RemBertPreTrainedModel,
load_tf_weights_in_rembert,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_rembert import (
TF_REMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFRemBertForCausalLM,
TFRemBertForMaskedLM,
TFRemBertForMultipleChoice,
TFRemBertForQuestionAnswering,
TFRemBertForSequenceClassification,
TFRemBertForTokenClassification,
TFRemBertLayer,
TFRemBertModel,
TFRemBertPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\resnet\configuration_resnet.py
""" ResNet model configuration"""
from collections import OrderedDict
from typing import Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
logger = logging.get_logger(__name__)
RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/resnet-50": "https://huggingface.co/microsoft/resnet-50/blob/main/config.json",
}
class ResNetConfig(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ResNetModel`]. It is used to instantiate an
ResNet model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the ResNet
[microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
def __init__(self, **kwargs):
super().__init__(**kwargs)
model_type = "resnet"
layer_types = ["basic", "bottleneck"]
def __init__(
self,
num_channels=3,
embedding_size=64,
hidden_sizes=[256, 512, 1024, 2048],
depths=[3, 4, 6, 3],
layer_type="bottleneck",
hidden_act="relu",
downsample_in_first_stage=False,
downsample_in_bottleneck=False,
out_features=None,
out_indices=None,
**kwargs,
):
super().__init__(**kwargs)
if layer_type not in self.layer_types:
raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}")
self.num_channels = num_channels
self.embedding_size = embedding_size
self.hidden_sizes = hidden_sizes
self.depths = depths
self.layer_type = layer_type
self.hidden_act = hidden_act
self.downsample_in_first_stage = downsample_in_first_stage
self.downsample_in_bottleneck = downsample_in_bottleneck
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
class ResNetOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-3
.\models\resnet\convert_resnet_to_pytorch.py
import argparse
import json
from dataclasses import dataclass, field
from functools import partial
from pathlib import Path
from typing import List
import timm
import torch
import torch.nn as nn
from huggingface_hub import hf_hub_download
from torch import Tensor
from transformers import AutoImageProcessor, ResNetConfig, ResNetForImageClassification
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger()
@dataclass
class Tracker:
module: nn.Module
traced: List[nn.Module] = field(default_factory=list)
handles: list = field(default_factory=list)
def _forward_hook(self, m, inputs: Tensor, outputs: Tensor):
has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d)
if has_not_submodules:
self.traced.append(m)
def __call__(self, x: Tensor):
for m in self.module.modules():
self.handles.append(m.register_forward_hook(self._forward_hook))
self.module(x)
[x.remove() for x in self.handles]
return self
@property
def parametrized(self):
return list(filter(lambda x: len(list(x.state_dict().keys())) > 0, self.traced))
@dataclass
class ModuleTransfer:
src: nn.Module
dest: nn.Module
verbose: int = 0
src_skip: List = field(default_factory=list)
dest_skip: List = field(default_factory=list)
def __call__(self, x: Tensor):
"""
Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the
hood we tracked all the operations in both modules.
"""
dest_traced = Tracker(self.dest)(x).parametrized
src_traced = Tracker(self.src)(x).parametrized
src_traced = list(filter(lambda x: type(x) not in self.src_skip, src_traced))
dest_traced = list(filter(lambda x: type(x) not in self.dest_skip, dest_traced))
if len(dest_traced) != len(src_traced):
raise Exception(
f"Numbers of operations are different. Source module has {len(src_traced)} operations while"
f" destination module has {len(dest_traced)}."
)
for dest_m, src_m in zip(dest_traced, src_traced):
dest_m.load_state_dict(src_m.state_dict())
if self.verbose == 1:
print(f"Transfered from={src_m} to={dest_m}")
def convert_weight_and_push(name: str, config: ResNetConfig, save_directory: Path, push_to_hub: bool = True):
print(f"Converting {name}...")
with torch.no_grad():
from_model = timm.create_model(name, pretrained=True).eval()
our_model = ResNetForImageClassification(config).eval()
module_transfer = ModuleTransfer(src=from_model, dest=our_model)
x = torch.randn((1, 3, 224, 224))
module_transfer(x)
assert torch.allclose(from_model(x), our_model(x).logits), "The model logits don't match the original one."
checkpoint_name = f"resnet{'-'.join(name.split('resnet'))}"
print(checkpoint_name)
if push_to_hub:
our_model.push_to_hub(
repo_path_or_name=save_directory / checkpoint_name,
commit_message="Add model",
use_temp_dir=True,
)
image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k")
image_processor.push_to_hub(
repo_path_or_name=save_directory / checkpoint_name,
commit_message="Add image processor",
use_temp_dir=True,
)
print(f"Pushed {checkpoint_name}")
def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
filename = "imagenet-1k-id2label.json"
num_labels = 1000
expected_shape = (1, num_labels)
repo_id = "huggingface/label-files"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
id2label = id2label
label2id = {v: k for k, v in id2label.items()}
ImageNetPreTrainedConfig = partial(ResNetConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)
names_to_config = {
"resnet18": ImageNetPreTrainedConfig(
depths=[2, 2, 2, 2], hidden_sizes=[64, 128, 256, 512], layer_type="basic"
),
"resnet26": ImageNetPreTrainedConfig(
depths=[2, 2, 2, 2], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
),
"resnet34": ImageNetPreTrainedConfig(
depths=[3, 4, 6, 3], hidden_sizes=[64, 128, 256, 512], layer_type="basic"
),
"resnet50": ImageNetPreTrainedConfig(
depths=[3, 4, 6, 3], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
),
"resnet101": ImageNetPreTrainedConfig(
depths=[3, 4, 23, 3], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
),
"resnet152": ImageNetPreTrainedConfig(
depths=[3, 8, 36, 3], hidden_sizes=[256, 512, 1024, 2048], layer_type="bottleneck"
),
}
if model_name:
convert_weight_and_push(model_name, names_to_config[model_name], save_directory, push_to_hub)
else:
for model_name, config in names_to_config.items():
convert_weight_and_push(model_name, config, save_directory, push_to_hub)
return config, expected_shape
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
default=None,
type=str,
help=(
"要转换的模型名称,必须是支持的 resnet* 架构之一,"
"目前支持的有:resnet18,26,34,50,101,152。如果为 `None`,则转换所有支持的模型。"
),
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=Path,
required=True,
help="输出 PyTorch 模型目录的路径。",
)
parser.add_argument(
"--push_to_hub",
default=True,
type=bool,
required=False,
help="如果为 True,将模型和图像处理器推送到 hub。",
)
args = parser.parse_args()
pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path
pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True)
convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub)
.\models\resnet\modeling_flax_resnet.py
from functools import partial
from typing import Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.traverse_util import flatten_dict, unflatten_dict
from ...modeling_flax_outputs import (
FlaxBaseModelOutputWithNoAttention,
FlaxBaseModelOutputWithPoolingAndNoAttention,
FlaxImageClassifierOutputWithNoAttention,
)
from ...modeling_flax_utils import (
ACT2FN,
FlaxPreTrainedModel,
append_replace_return_docstrings,
overwrite_call_docstring,
)
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward
from .configuration_resnet import ResNetConfig
RESNET_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
This model is also a
[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
"""
"""
Args:
pixel_values (`jax.numpy.float32` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class Identity(nn.Module):
"""Identity function."""
@nn.compact
def __call__(self, x, **kwargs):
return x
class FlaxResNetConvLayer(nn.Module):
"""
Defines a convolutional layer followed by batch normalization and activation function.
"""
out_channels: int
kernel_size: int = 3
stride: int = 1
activation: Optional[str] = "relu"
dtype: jnp.dtype = jnp.float32
def setup(self):
self.convolution = nn.Conv(
self.out_channels,
kernel_size=(self.kernel_size, self.kernel_size),
strides=self.stride,
padding=self.kernel_size // 2,
dtype=self.dtype,
use_bias=False,
kernel_init=nn.initializers.variance_scaling(2.0, mode="fan_out", distribution="normal", dtype=self.dtype),
)
self.normalization = nn.BatchNorm(momentum=0.9, epsilon=1e-05, dtype=self.dtype)
self.activation_func = ACT2FN[self.activation] if self.activation is not None else Identity()
def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
hidden_state = self.convolution(x)
hidden_state = self.normalization(hidden_state, use_running_average=deterministic)
hidden_state = self.activation_func(hidden_state)
return hidden_state
class FlaxResNetEmbeddings(nn.Module):
"""
ResNet Embeddings (stem) composed of a single aggressive convolution.
"""
config: ResNetConfig
dtype: jnp.dtype = jnp.float32
def setup(self):
self.embedder = FlaxResNetConvLayer(
self.config.embedding_size,
kernel_size=7,
stride=2,
activation=self.config.hidden_act,
dtype=self.dtype,
)
self.max_pool = partial(nn.max_pool, window_shape=(3, 3), strides=(2, 2), padding=((1, 1), (1, 1)))
def __call__(self, pixel_values: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
num_channels = pixel_values.shape[-1]
if num_channels != self.config.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embedding = self.embedder(pixel_values, deterministic=deterministic)
embedding = self.max_pool(embedding)
return embedding
class FlaxResNetShortCut(nn.Module):
"""
Placeholder class for Flax ResNet shortcut connections.
No implementation details provided.
"""
ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to
downsample the input using `stride=2`.
"""
# 定义一个类,用于 ResNet 中的快捷连接(shortcut),将残差特征投影到正确的尺寸。如果需要,也可以用来
# 使用 `stride=2` 对输入进行下采样。
out_channels: int # 输出通道数,即卷积层输出的特征图的深度
stride: int = 2 # 步长,默认为2,用于卷积操作时的步进大小
dtype: jnp.dtype = jnp.float32 # 数据类型,默认为 jax 的 float32 类型
def setup(self):
# 设置卷积层,用于实现残差块中的投影操作,将输入特征投影到输出通道大小
self.convolution = nn.Conv(
self.out_channels,
kernel_size=(1, 1), # 卷积核大小为 1x1
strides=self.stride, # 使用类属性中定义的步长
use_bias=False, # 不使用偏置项
kernel_init=nn.initializers.variance_scaling(2.0, mode="fan_out", distribution="truncated_normal"), # 卷积核初始化方式
dtype=self.dtype, # 使用指定的数据类型
)
# 设置批归一化层,用于规范化卷积层的输出特征
self.normalization = nn.BatchNorm(momentum=0.9, epsilon=1e-05, dtype=self.dtype)
def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
# 在调用时,对输入 x 进行卷积投影操作
hidden_state = self.convolution(x)
# 对投影后的特征进行批归一化处理
hidden_state = self.normalization(hidden_state, use_running_average=deterministic)
return hidden_state
class FlaxResNetBasicLayerCollection(nn.Module):
out_channels: int # 输出通道数
stride: int = 1 # 步长,默认为1
dtype: jnp.dtype = jnp.float32 # 数据类型,默认为32位浮点数
def setup(self):
self.layer = [ # 创建层列表
FlaxResNetConvLayer(self.out_channels, stride=self.stride, dtype=self.dtype), # 卷积层1
FlaxResNetConvLayer(self.out_channels, activation=None, dtype=self.dtype), # 卷积层2
]
def __call__(self, hidden_state: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
for layer in self.layer: # 对每一层进行迭代
hidden_state = layer(hidden_state, deterministic=deterministic) # 应用层到隐藏状态
return hidden_state # 返回最终隐藏状态
class FlaxResNetBasicLayer(nn.Module):
"""
A classic ResNet's residual layer composed by two `3x3` convolutions.
"""
in_channels: int # 输入通道数
out_channels: int # 输出通道数
stride: int = 1 # 步长,默认为1
activation: Optional[str] = "relu" # 激活函数,默认为ReLU
dtype: jnp.dtype = jnp.float32 # 数据类型,默认为32位浮点数
def setup(self):
should_apply_shortcut = self.in_channels != self.out_channels or self.stride != 1 # 是否应用快捷连接
self.shortcut = (
FlaxResNetShortCut(self.out_channels, stride=self.stride, dtype=self.dtype) # 如果需要,则创建快捷连接
if should_apply_shortcut
else None
)
self.layer = FlaxResNetBasicLayerCollection( # 创建基础层集合
out_channels=self.out_channels,
stride=self.stride,
dtype=self.dtype,
)
self.activation_func = ACT2FN[self.activation] # 获取激活函数
def __call__(self, hidden_state, deterministic: bool = True):
residual = hidden_state # 保存残差
hidden_state = self.layer(hidden_state, deterministic=deterministic) # 应用基础层到隐藏状态
if self.shortcut is not None: # 如果存在快捷连接
residual = self.shortcut(residual, deterministic=deterministic) # 应用快捷连接到残差
hidden_state += residual # 添加残差到隐藏状态
hidden_state = self.activation_func(hidden_state) # 应用激活函数到隐藏状态
return hidden_state # 返回最终隐藏状态
class FlaxResNetBottleNeckLayerCollection(nn.Module):
out_channels: int # 输出通道数
stride: int = 1 # 步长,默认为1
activation: Optional[str] = "relu" # 激活函数,默认为ReLU
reduction: int = 4 # 减少倍数,默认为4
dtype: jnp.dtype = jnp.float32 # 数据类型,默认为32位浮点数
def setup(self):
reduces_channels = self.out_channels // self.reduction # 减少的通道数
self.layer = [
FlaxResNetConvLayer(reduces_channels, kernel_size=1, dtype=self.dtype, name="0"), # 第一个卷积层
FlaxResNetConvLayer(reduces_channels, stride=self.stride, dtype=self.dtype, name="1"), # 第二个卷积层
FlaxResNetConvLayer(self.out_channels, kernel_size=1, activation=None, dtype=self.dtype, name="2"), # 第三个卷积层
]
def __call__(self, hidden_state: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
for layer in self.layer: # 对每一层进行迭代
hidden_state = layer(hidden_state, deterministic=deterministic) # 应用层到隐藏状态
return hidden_state # 返回最终隐藏状态
class FlaxResNetBottleNeckLayer(nn.Module):
"""
A classic ResNet's bottleneck layer composed by three `3x3` convolutions. The first `1x1` convolution reduces the
input by a factor of `reduction` in order to make the second `3x3` convolution faster. The last `1x1` convolution
remaps the reduced features to `out_channels`.
"""
# 输入通道数,表示输入特征的数量
in_channels: int
# 输出通道数,表示输出特征的数量
out_channels: int
# 步长,默认为1,控制卷积操作的步长大小
stride: int = 1
# 激活函数类型,默认为"relu"
activation: Optional[str] = "relu"
# 降维参数,默认为4,用于瓶颈层中的维度缩减
reduction: int = 4
# 数据类型,默认为32位浮点数
dtype: jnp.dtype = jnp.float32
# 初始化方法,用于设置网络层的结构
def setup(self):
# 判断是否需要应用快捷连接(shortcut)
should_apply_shortcut = self.in_channels != self.out_channels or self.stride != 1
# 如果需要应用快捷连接,则创建一个FlaxResNetShortCut对象
self.shortcut = (
FlaxResNetShortCut(self.out_channels, stride=self.stride, dtype=self.dtype)
if should_apply_shortcut
else None
)
# 创建一个FlaxResNetBottleNeckLayerCollection对象,用于构建瓶颈层集合
self.layer = FlaxResNetBottleNeckLayerCollection(
self.out_channels,
stride=self.stride,
activation=self.activation,
reduction=self.reduction,
dtype=self.dtype,
)
# 获取指定名称的激活函数
self.activation_func = ACT2FN[self.activation]
# 对象调用方法,用于执行网络层的前向传播
def __call__(self, hidden_state: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
# 将输入隐藏状态作为残差进行保存
residual = hidden_state
# 如果存在快捷连接,则对残差应用快捷连接
if self.shortcut is not None:
residual = self.shortcut(residual, deterministic=deterministic)
# 对输入隐藏状态应用瓶颈层集合的处理
hidden_state = self.layer(hidden_state, deterministic)
# 将原始输入和处理后的残差相加
hidden_state += residual
# 对相加后的结果应用激活函数
hidden_state = self.activation_func(hidden_state)
# 返回处理后的隐藏状态
return hidden_state
class FlaxResNetStageLayersCollection(nn.Module):
"""
A ResNet stage composed by stacked layers.
"""
config: ResNetConfig # 配置对象,包含 ResNet 的配置信息
in_channels: int # 输入通道数
out_channels: int # 输出通道数
stride: int = 2 # 步幅,默认为 2
depth: int = 2 # 层深度,默认为 2
dtype: jnp.dtype = jnp.float32 # 数据类型,默认为 jnp.float32
def setup(self):
layer = FlaxResNetBottleNeckLayer if self.config.layer_type == "bottleneck" else FlaxResNetBasicLayer
layers = [
# downsampling is done in the first layer with stride of 2
# 第一层进行下采样,步幅为 2
layer(
self.in_channels,
self.out_channels,
stride=self.stride,
activation=self.config.hidden_act,
dtype=self.dtype,
name="0",
),
]
for i in range(self.depth - 1):
layers.append(
layer(
self.out_channels,
self.out_channels,
activation=self.config.hidden_act,
dtype=self.dtype,
name=str(i + 1),
)
)
self.layers = layers
def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
hidden_state = x
for layer in self.layers:
hidden_state = layer(hidden_state, deterministic=deterministic)
return hidden_state
class FlaxResNetStage(nn.Module):
"""
A ResNet stage composed by stacked layers.
"""
config: ResNetConfig # 配置对象,包含 ResNet 的配置信息
in_channels: int # 输入通道数
out_channels: int # 输出通道数
stride: int = 2 # 步幅,默认为 2
depth: int = 2 # 层深度,默认为 2
dtype: jnp.dtype = jnp.float32 # 数据类型,默认为 jnp.float32
def setup(self):
self.layers = FlaxResNetStageLayersCollection(
self.config,
in_channels=self.in_channels,
out_channels=self.out_channels,
stride=self.stride,
depth=self.depth,
dtype=self.dtype,
)
def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
return self.layers(x, deterministic=deterministic)
class FlaxResNetStageCollection(nn.Module):
config: ResNetConfig # 配置对象,包含 ResNet 的配置信息
dtype: jnp.dtype = jnp.float32 # 数据类型,默认为 jnp.float32
def setup(self):
in_out_channels = zip(self.config.hidden_sizes, self.config.hidden_sizes[1:])
stages = [
FlaxResNetStage(
self.config,
self.config.embedding_size,
self.config.hidden_sizes[0],
stride=2 if self.config.downsample_in_first_stage else 1,
depth=self.config.depths[0],
dtype=self.dtype,
name="0",
)
]
for i, ((in_channels, out_channels), depth) in enumerate(zip(in_out_channels, self.config.depths[1:])):
stages.append(
FlaxResNetStage(self.config, in_channels, out_channels, depth=depth, dtype=self.dtype, name=str(i + 1))
)
self.stages = stages
# 定义类中的调用方法,用于执行模型推理过程,返回不包含注意力权重的模型输出对象
def __call__(
self,
hidden_state: jnp.ndarray,
output_hidden_states: bool = False,
deterministic: bool = True,
) -> FlaxBaseModelOutputWithNoAttention:
# 如果需要输出隐藏状态,则初始化一个空元组;否则置为 None
hidden_states = () if output_hidden_states else None
# 遍历模型的各个阶段模块
for stage_module in self.stages:
# 如果需要输出隐藏状态,则将隐藏状态进行维度转换,并添加到隐藏状态元组中
if output_hidden_states:
hidden_states = hidden_states + (hidden_state.transpose(0, 3, 1, 2),)
# 调用当前阶段模块,更新隐藏状态
hidden_state = stage_module(hidden_state, deterministic=deterministic)
# 返回更新后的隐藏状态和可能的隐藏状态元组
return hidden_state, hidden_states
# 定义一个继承自 nn.Module 的类 FlaxResNetEncoder,用于实现 ResNet 编码器
class FlaxResNetEncoder(nn.Module):
# 类属性 config,类型为 ResNetConfig,用于配置模型
config: ResNetConfig
# 类属性 dtype,默认为 jnp.float32 的数据类型
dtype: jnp.dtype = jnp.float32
# 初始化方法,设置编码器的各个阶段
def setup(self):
# 创建 FlaxResNetStageCollection 对象,用于管理 ResNet 的阶段
self.stages = FlaxResNetStageCollection(self.config, dtype=self.dtype)
# 对象调用方法,实现编码器的前向传播逻辑
def __call__(
self,
hidden_state: jnp.ndarray,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
) -> FlaxBaseModelOutputWithNoAttention:
# 调用编码器的阶段对象进行前向传播,得到编码后的隐藏状态和可能的中间隐藏状态列表
hidden_state, hidden_states = self.stages(
hidden_state, output_hidden_states=output_hidden_states, deterministic=deterministic
)
# 如果需要输出中间隐藏状态,将当前隐藏状态加入到隐藏状态列表中
if output_hidden_states:
hidden_states = hidden_states + (hidden_state.transpose(0, 3, 1, 2),)
# 如果不需要返回字典形式的输出,则将有效的结果作为元组返回
if not return_dict:
return tuple(v for v in [hidden_state, hidden_states] if v is not None)
# 返回包含最终隐藏状态和隐藏状态列表的 FlaxBaseModelOutputWithNoAttention 对象
return FlaxBaseModelOutputWithNoAttention(
last_hidden_state=hidden_state,
hidden_states=hidden_states,
)
# 定义一个继承自 FlaxPreTrainedModel 的抽象类 FlaxResNetPreTrainedModel
class FlaxResNetPreTrainedModel(FlaxPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# 类属性 config_class,指定模型配置类为 ResNetConfig
config_class = ResNetConfig
# 类属性 base_model_prefix,指定基础模型前缀为 "resnet"
base_model_prefix = "resnet"
# 类属性 main_input_name,指定主要输入名称为 "pixel_values"
main_input_name = "pixel_values"
# 类属性 module_class,用于指定模块类的类型,默认为 None
module_class: nn.Module = None
# 初始化方法,用于创建模型对象
def __init__(
self,
config: ResNetConfig,
input_shape=(1, 224, 224, 3),
seed: int = 0,
dtype: jnp.dtype = jnp.float32,
_do_init: bool = True,
**kwargs,
):
# 根据配置类和其他参数创建模块对象
module = self.module_class(config=config, dtype=dtype, **kwargs)
# 如果未指定输入形状,则使用默认形状根据配置设置
if input_shape is None:
input_shape = (1, config.image_size, config.image_size, config.num_channels)
# 调用父类的初始化方法,传递配置、模块、输入形状等参数
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
# 初始化权重方法,用于随机初始化模型的参数
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
# 初始化像素值为全零张量作为输入
pixel_values = jnp.zeros(input_shape, dtype=self.dtype)
# 创建随机数生成器字典,用于参数初始化
rngs = {"params": rng}
# 使用模块对象的初始化方法,初始化模型参数,返回未冻结的参数字典
random_params = self.module.init(rngs, pixel_values, return_dict=False)
# 如果提供了已有的参数字典,则将缺失的参数从随机初始化的参数中复制过来
if params is not None:
random_params = flatten_dict(unfreeze(random_params))
params = flatten_dict(unfreeze(params))
for missing_key in self._missing_keys:
params[missing_key] = random_params[missing_key]
self._missing_keys = set()
return freeze(unflatten_dict(params))
else:
return random_params
# 对象调用方法,实现模型的前向传播逻辑
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
def __call__(
self,
pixel_values,
params: dict = None,
train: bool = False,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> FlaxBaseModelOutput:
# 实现模型的前向传播逻辑,具体细节根据模型的实现和输入参数进行处理
pass # 这里未提供完整的方法实现,需要根据具体模型的实现补充
):
# 如果 output_hidden_states 不为 None,则使用传入的值,否则使用模型配置中的默认值
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果 return_dict 不为 None,则使用传入的值,否则使用模型配置中的默认值
return_dict = return_dict if return_dict is not None else self.config.return_dict
# 调整像素值的维度顺序,将通道维度移到最后一个维度位置
pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))
# 处理可能需要的随机数生成器
rngs = {}
# 调用模型的 apply 方法进行推断或训练
return self.module.apply(
{
"params": params["params"] if params is not None else self.params["params"],
"batch_stats": params["batch_stats"] if params is not None else self.params["batch_stats"],
},
jnp.array(pixel_values, dtype=jnp.float32), # 将像素值转换为 jax 数组并传入
not train, # 如果不是训练模式,则传入 True,表示推断模式
output_hidden_states, # 传入是否需要隐藏状态的标志
return_dict, # 传入是否返回字典的标志
rngs=rngs, # 传入随机数生成器
mutable=["batch_stats"] if train else False, # 当 train 为 True 时返回包含 batch_stats 的元组
)
# 定义一个名为FlaxResNetModule的类,继承自nn.Module
class FlaxResNetModule(nn.Module):
# 类变量config,用于存储ResNet的配置信息
config: ResNetConfig
# 定义dtype变量,默认为jnp.float32,用于指定计算时的数据类型
dtype: jnp.dtype = jnp.float32 # 计算中使用的数据类型
# 定义setup方法,用于初始化模块
def setup(self):
# 创建FlaxResNetEmbeddings对象,使用self.config和self.dtype作为参数
self.embedder = FlaxResNetEmbeddings(self.config, dtype=self.dtype)
# 创建FlaxResNetEncoder对象,使用self.config和self.dtype作为参数
self.encoder = FlaxResNetEncoder(self.config, dtype=self.dtype)
# 创建部分应用了avg_pool函数的pooler对象,设置了padding参数
self.pooler = partial(
nn.avg_pool,
padding=((0, 0), (0, 0)),
)
# 定义__call__方法,实现对象的调用功能
def __call__(
self,
pixel_values,
deterministic: bool = True,
output_hidden_states: bool = False,
return_dict: bool = True,
) -> FlaxBaseModelOutputWithPoolingAndNoAttention:
# 如果output_hidden_states为None,则使用self.config.output_hidden_states
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果return_dict为None,则使用self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 获取嵌入输出,调用self.embedder对象
embedding_output = self.embedder(pixel_values, deterministic=deterministic)
# 获取编码器输出,调用self.encoder对象
encoder_outputs = self.encoder(
embedding_output,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
# 获取最后隐藏状态
last_hidden_state = encoder_outputs[0]
# 对最后隐藏状态进行自适应平均池化操作
pooled_output = self.pooler(
last_hidden_state,
window_shape=(last_hidden_state.shape[1], last_hidden_state.shape[2]),
strides=(last_hidden_state.shape[1], last_hidden_state.shape[2]),
).transpose(0, 3, 1, 2)
# 调整最后隐藏状态的维度顺序
last_hidden_state = last_hidden_state.transpose(0, 3, 1, 2)
# 如果return_dict为False,则返回元组
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
# 返回FlaxBaseModelOutputWithPoolingAndNoAttention对象
return FlaxBaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
# 为FlaxResNetModel类添加文档注释
@add_start_docstrings(
"The bare ResNet model outputting raw features without any specific head on top.",
RESNET_START_DOCSTRING,
)
class FlaxResNetModel(FlaxResNetPreTrainedModel):
module_class = FlaxResNetModule
# 定义FLAX_VISION_MODEL_DOCSTRING常量,包含FlaxResNetModel类的文档字符串
FLAX_VISION_MODEL_DOCSTRING = """
Returns:
Examples:
```
>>> from transformers import AutoImageProcessor, FlaxResNetModel
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
>>> model = FlaxResNetModel.from_pretrained("microsoft/resnet-50")
>>> inputs = image_processor(images=image, return_tensors="np")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```
"""
# 调用overwrite_call_docstring函数,为FlaxResNetModel类覆盖文档字符串
overwrite_call_docstring(FlaxResNetModel, FLAX_VISION_MODEL_DOCSTRING)
# 调用append_replace_return_docstrings函数,为FlaxResNetModel类添加或替换返回值的文档字符串
append_replace_return_docstrings(
# 导入FlaxResNetModel类,并指定output_type和config_class参数
FlaxResNetModel, output_type=FlaxBaseModelOutputWithPoolingAndNoAttention, config_class=ResNetConfig
# 导入所需的库和模块
import jax
import requests
# FLAX_VISION_CLASSIF_DOCSTRING文档字符串,描述了FlaxResNetForImageClassification模型的返回和示例用法
FLAX_VISION_CLASSIF_DOCSTRING = """
Returns:
返回一个示例,展示如何使用该模型进行图像分类预测。
Example:
展示如何从URL下载图像并使用模型预测图像的分类。
```
>>> from transformers import AutoImageProcessor, FlaxResNetForImageClassification
>>> from PIL import Image
>>> import jax
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
>>> model = FlaxResNetForImageClassification.from_pretrained("microsoft/resnet-50")
>>> inputs = image_processor(images=image, return_tensors="np")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>>
>>> predicted_class_idx = jax.numpy.argmax(logits, axis=-1)
>>> print("Predicted class:", model.config.id2label[predicted_class_idx.item()])
```
"""
# 使用overwrite_call_docstring函数将FLAX_VISION_CLASSIF_DOCSTRING设置为FlaxResNetForImageClassification类的文档字符串
overwrite_call_docstring(FlaxResNetForImageClassification, FLAX_VISION_CLASSIF_DOCSTRING)
# 使用append_replace_return_docstrings函数,为FlaxResNetForImageClassification类追加和替换返回结果的文档字符串
append_replace_return_docstrings(
FlaxResNetForImageClassification, output_type=FlaxImageClassifierOutputWithNoAttention, config_class=ResNetConfig
)
.\models\resnet\modeling_resnet.py
from typing import Optional
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BackboneOutput,
BaseModelOutputWithNoAttention,
BaseModelOutputWithPoolingAndNoAttention,
ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_resnet import ResNetConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "ResNetConfig"
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
_EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "microsoft/resnet-50"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"
RESNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/resnet-50",
]
class ResNetConvLayer(nn.Module):
def __init__(
self, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1, activation: str = "relu"
):
super().__init__()
self.convolution = nn.Conv2d(
in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, bias=False
)
self.normalization = nn.BatchNorm2d(out_channels)
self.activation = ACT2FN[activation] if activation is not None else nn.Identity()
def forward(self, input: Tensor) -> Tensor:
hidden_state = self.convolution(input)
hidden_state = self.normalization(hidden_state)
hidden_state = self.activation(hidden_state)
return hidden_state
class ResNetEmbeddings(nn.Module):
"""
ResNet Embeddings (stem) composed of a single aggressive convolution.
"""
def __init__(self, config: ResNetConfig):
super().__init__()
self.embedder = ResNetConvLayer(
config.num_channels, config.embedding_size, kernel_size=7, stride=2, activation=config.hidden_act
)
self.pooler = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.num_channels = config.num_channels
def forward(self, pixel_values: Tensor) -> Tensor:
num_channels = pixel_values.shape[1]
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embedding = self.embedder(pixel_values)
embedding = self.pooler(embedding)
return embedding
class ResNetBottleNeckLayer(nn.Module):
"""
A classic ResNet's bottleneck layer composed by three `3x3` convolutions.
The first `1x1` convolution reduces the input by a factor of `reduction` in order to make the second `3x3`
convolution faster. The last `1x1` convolution remaps the reduced features to `out_channels`. If
`downsample_in_bottleneck` is true, downsample will be in the first layer instead of the second layer.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
stride: int = 1,
activation: str = "relu",
reduction: int = 4,
downsample_in_bottleneck: bool = False,
):
super().__init__()
should_apply_shortcut = in_channels != out_channels or stride != 1
self.shortcut = (
ResNetShortCut(in_channels, out_channels, stride=stride) if should_apply_shortcut else nn.Identity()
)
self.layer1 = ResNetConvLayer(in_channels, out_channels // reduction, kernel_size=1, stride=1)
self.layer2 = ResNetConvLayer(out_channels // reduction, out_channels // reduction, kernel_size=3, stride=stride)
self.layer3 = ResNetConvLayer(out_channels // reduction, out_channels, kernel_size=1, stride=1, activation=None)
self.activation = ACT2FN[activation]
def forward(self, hidden_state):
residual = hidden_state
hidden_state = self.layer1(hidden_state)
hidden_state = self.layer2(hidden_state)
hidden_state = self.layer3(hidden_state)
residual = self.shortcut(residual)
hidden_state += residual
hidden_state = self.activation(hidden_state)
return hidden_state
):
super().__init__()
should_apply_shortcut = in_channels != out_channels or stride != 1
reduces_channels = out_channels // reduction
self.shortcut = (
ResNetShortCut(in_channels, out_channels, stride=stride) if should_apply_shortcut else nn.Identity()
)
self.layer = nn.Sequential(
ResNetConvLayer(
in_channels, reduces_channels, kernel_size=1, stride=stride if downsample_in_bottleneck else 1
),
ResNetConvLayer(reduces_channels, reduces_channels, stride=stride if not downsample_in_bottleneck else 1),
ResNetConvLayer(reduces_channels, out_channels, kernel_size=1, activation=None),
)
self.activation = ACT2FN[activation]
def forward(self, hidden_state):
residual = hidden_state
hidden_state = self.layer(hidden_state)
residual = self.shortcut(residual)
hidden_state += residual
hidden_state = self.activation(hidden_state)
return hidden_state
class ResNetStage(nn.Module):
"""
A ResNet stage composed by stacked layers.
"""
def __init__(
self,
config: ResNetConfig,
in_channels: int,
out_channels: int,
stride: int = 2,
depth: int = 2,
):
super().__init__()
layer = ResNetBottleNeckLayer if config.layer_type == "bottleneck" else ResNetBasicLayer
if config.layer_type == "bottleneck":
first_layer = layer(
in_channels,
out_channels,
stride=stride,
activation=config.hidden_act,
downsample_in_bottleneck=config.downsample_in_bottleneck,
)
else:
first_layer = layer(in_channels, out_channels, stride=stride, activation=config.hidden_act)
self.layers = nn.Sequential(
first_layer, *[layer(out_channels, out_channels, activation=config.hidden_act) for _ in range(depth - 1)]
)
def forward(self, input: Tensor) -> Tensor:
hidden_state = input
for layer in self.layers:
hidden_state = layer(hidden_state)
return hidden_state
class ResNetEncoder(nn.Module):
"""
ResNet 编码器由多个 ResNet 阶段组成。
"""
def __init__(self, config: ResNetConfig):
super().__init__()
self.stages = nn.ModuleList([])
self.stages.append(
ResNetStage(
config,
config.embedding_size,
config.hidden_sizes[0],
stride=2 if config.downsample_in_first_stage else 1,
depth=config.depths[0],
)
)
in_out_channels = zip(config.hidden_sizes, config.hidden_sizes[1:])
for (in_channels, out_channels), depth in zip(in_out_channels, config.depths[1:]):
self.stages.append(ResNetStage(config, in_channels, out_channels, depth=depth))
def forward(
self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
) -> BaseModelOutputWithNoAttention:
hidden_states = () if output_hidden_states else None
for stage_module in self.stages:
if output_hidden_states:
hidden_states = hidden_states + (hidden_state,)
hidden_state = stage_module(hidden_state)
if output_hidden_states:
hidden_states = hidden_states + (hidden_state,)
if not return_dict:
return tuple(v for v in [hidden_state, hidden_states] if v is not None)
return BaseModelOutputWithNoAttention(
last_hidden_state=hidden_state,
hidden_states=hidden_states,
)
class ResNetPreTrainedModel(PreTrainedModel):
"""
处理权重初始化和预训练模型下载和加载的抽象类。
"""
config_class = ResNetConfig
base_model_prefix = "resnet"
main_input_name = "pixel_values"
def _init_weights(self, module):
if isinstance(module, nn.Conv2d):
nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(module.weight, 1)
nn.init.constant_(module.bias, 0)
RESNET_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`ResNetConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
RESNET_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`ConvNextImageProcessor.__call__`] for details.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare ResNet model outputting raw features without any specific head on top.",
RESNET_START_DOCSTRING,
)
class ResNetModel(ResNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.embedder = ResNetEmbeddings(config)
self.encoder = ResNetEncoder(config)
self.pooler = nn.AdaptiveAvgPool2d((1, 1))
self.post_init()
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
) -> BaseModelOutputWithPoolingAndNoAttention:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
embedding_output = self.embedder(pixel_values)
encoder_outputs = self.encoder(
embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict
)
last_hidden_state = encoder_outputs[0]
pooled_output = self.pooler(last_hidden_state)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
"""
ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
"""
class ResNetForImageClassification(ResNetPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.resnet = ResNetModel(config)
self.classifier = nn.Sequential(
nn.Flatten(),
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity(),
)
self.post_init()
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=ImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> ImageClassifierOutputWithNoAttention:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.resnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return (loss,) + output if loss is not None else output
return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
@add_start_docstrings(
"""
ResNet backbone, to be used with frameworks like DETR and MaskFormer.
""",
RESNET_START_DOCSTRING,
)
class ResNetBackbone(ResNetPreTrainedModel, BackboneMixin):
def __init__(self, config):
super().__init__(config)
super()._init_backbone(config)
self.num_features = [config.embedding_size] + config.hidden_sizes
self.embedder = ResNetEmbeddings(config)
self.encoder = ResNetEncoder(config)
self.post_init()
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
) -> BackboneOutput:
"""
返回模型的输出结果。
Examples:
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
>>> model = AutoBackbone.from_pretrained(
... "microsoft/resnet-50", out_features=["stage1", "stage2", "stage3", "stage4"]
... )
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 2048, 7, 7]
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
embedding_output = self.embedder(pixel_values)
outputs = self.encoder(embedding_output, output_hidden_states=True, return_dict=True)
hidden_states = outputs.hidden_states
feature_maps = ()
for idx, stage in enumerate(self.stage_names):
if stage in self.out_features:
feature_maps += (hidden_states[idx],)
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=None,
)
.\models\resnet\modeling_tf_resnet.py
""" TensorFlow ResNet model."""
from typing import Optional, Tuple, Union
import tensorflow as tf
from ...activations_tf import ACT2FN
from ...modeling_tf_outputs import (
TFBaseModelOutputWithNoAttention,
TFBaseModelOutputWithPoolingAndNoAttention,
TFImageClassifierOutputWithNoAttention,
)
from ...modeling_tf_utils import (
TFPreTrainedModel,
TFSequenceClassificationLoss,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_resnet import ResNetConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "ResNetConfig"
_CHECKPOINT_FOR_DOC = "microsoft/resnet-50"
_EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7]
_IMAGE_CLASS_CHECKPOINT = "microsoft/resnet-50"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"
TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/resnet-50",
]
class TFResNetConvLayer(keras.layers.Layer):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int = 3,
stride: int = 1,
activation: str = "relu",
**kwargs,
) -> None:
super().__init__(**kwargs)
self.pad_value = kernel_size // 2
self.conv = keras.layers.Conv2D(
out_channels, kernel_size=kernel_size, strides=stride, padding="valid", use_bias=False, name="convolution"
)
self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.activation = ACT2FN[activation] if activation is not None else keras.layers.Activation("linear")
self.in_channels = in_channels
self.out_channels = out_channels
def convolution(self, hidden_state: tf.Tensor) -> tf.Tensor:
height_pad = width_pad = (self.pad_value, self.pad_value)
hidden_state = tf.pad(hidden_state, [(0, 0), height_pad, width_pad, (0, 0)])
hidden_state = self.conv(hidden_state)
return hidden_state
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = self.convolution(hidden_state)
hidden_state = self.normalization(hidden_state, training=training)
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv", None) is not None:
with tf.name_scope(self.conv.name):
self.conv.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
class TFResNetEmbeddings(keras.layers.Layer):
"""
ResNet Embeddings (stem) composed of a single aggressive convolution.
"""
def __init__(self, config: ResNetConfig, **kwargs) -> None:
super().__init__(**kwargs)
self.embedder = TFResNetConvLayer(
config.num_channels,
config.embedding_size,
kernel_size=7,
stride=2,
activation=config.hidden_act,
name="embedder",
)
self.pooler = keras.layers.MaxPool2D(pool_size=3, strides=2, padding="valid", name="pooler")
self.num_channels = config.num_channels
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
_, _, _, num_channels = shape_list(pixel_values)
if tf.executing_eagerly() and num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
hidden_state = pixel_values
hidden_state = self.embedder(hidden_state)
hidden_state = tf.pad(hidden_state, [[0, 0], [1, 1], [1, 1], [0, 0]])
hidden_state = self.pooler(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
if getattr(self, "pooler", None) is not None:
with tf.name_scope(self.pooler.name):
self.pooler.build(None)
class TFResNetShortCut(keras.layers.Layer):
"""
ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to
downsample the input using `stride=2`.
"""
def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwargs) -> None:
super().__init__(**kwargs)
self.convolution = keras.layers.Conv2D(
out_channels, kernel_size=1, strides=stride, use_bias=False, name="convolution"
)
self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
self.in_channels = in_channels
self.out_channels = out_channels
def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
hidden_state = x
hidden_state = self.convolution(hidden_state)
hidden_state = self.normalization(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "convolution", None) is not None:
with tf.name_scope(self.convolution.name):
self.convolution.build([None, None, None, self.in_channels])
if getattr(self, "normalization", None) is not None:
with tf.name_scope(self.normalization.name):
self.normalization.build([None, None, None, self.out_channels])
class TFResNetBasicLayer(keras.layers.Layer):
"""
A classic ResNet's residual layer composed by two `3x3` convolutions.
"""
def __init__(
self, in_channels: int, out_channels: int, stride: int = 1, activation: str = "relu", **kwargs
) -> None:
super().__init__(**kwargs)
should_apply_shortcut = in_channels != out_channels or stride != 1
self.conv1 = TFResNetConvLayer(in_channels, out_channels, stride=stride, name="layer.0")
self.conv2 = TFResNetConvLayer(out_channels, out_channels, activation=None, name="layer.1")
self.shortcut = (
TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else keras.layers.Activation("linear", name="shortcut")
)
self.activation = ACT2FN[activation]
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
residual = hidden_state
hidden_state = self.conv1(hidden_state, training=training)
hidden_state = self.conv2(hidden_state, training=training)
residual = self.shortcut(residual, training=training)
hidden_state += residual
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build(None)
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build(None)
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
class TFResNetBottleNeckLayer(keras.layers.Layer):
"""
A classic ResNet's bottleneck layer composed by three `3x3` convolutions.
The first `1x1` convolution reduces the input by a factor of `reduction` in order to make the second `3x3`
convolution faster. The last `1x1` convolution remaps the reduced features to `out_channels`.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
stride: int = 1,
activation: str = "relu",
reduction: int = 4,
**kwargs,
) -> None:
super().__init__(**kwargs)
should_apply_shortcut = in_channels != out_channels or stride != 1
reduces_channels = out_channels // reduction
self.conv0 = TFResNetConvLayer(in_channels, reduces_channels, kernel_size=1, name="layer.0")
self.conv1 = TFResNetConvLayer(reduces_channels, reduces_channels, stride=stride, name="layer.1")
self.conv2 = TFResNetConvLayer(reduces_channels, out_channels, kernel_size=1, activation=None, name="layer.2")
self.shortcut = (
TFResNetShortCut(in_channels, out_channels, stride=stride, name="shortcut")
if should_apply_shortcut
else keras.layers.Activation("linear", name="shortcut")
)
self.activation = ACT2FN[activation]
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
residual = hidden_state
hidden_state = self.conv0(hidden_state, training=training)
hidden_state = self.conv1(hidden_state, training=training)
hidden_state = self.conv2(hidden_state, training=training)
residual = self.shortcut(residual, training=training)
hidden_state += residual
hidden_state = self.activation(hidden_state)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "conv0", None) is not None:
with tf.name_scope(self.conv0.name):
self.conv0.build(None)
if getattr(self, "conv1", None) is not None:
with tf.name_scope(self.conv1.name):
self.conv1.build(None)
if getattr(self, "conv2", None) is not None:
with tf.name_scope(self.conv2.name):
self.conv2.build(None)
if getattr(self, "shortcut", None) is not None:
with tf.name_scope(self.shortcut.name):
self.shortcut.build(None)
class TFResNetStage(keras.layers.Layer):
"""
A ResNet stage composed of stacked layers.
"""
def __init__(
self, config: ResNetConfig, in_channels: int, out_channels: int, stride: int = 2, depth: int = 2, **kwargs
) -> None:
super().__init__(**kwargs)
layer = TFResNetBottleNeckLayer if config.layer_type == "bottleneck" else TFResNetBasicLayer
layers = [layer(in_channels, out_channels, stride=stride, activation=config.hidden_act, name="layers.0")]
layers += [
layer(out_channels, out_channels, activation=config.hidden_act, name=f"layers.{i + 1}")
for i in range(depth - 1)
]
self.stage_layers = layers
def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
for layer in self.stage_layers:
hidden_state = layer(hidden_state, training=training)
return hidden_state
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stage_layers", None) is not None:
for layer in self.stage_layers:
with tf.name_scope(layer.name):
layer.build(None)
class TFResNetEncoder(keras.layers.Layer):
def __init__(self, config: ResNetConfig, **kwargs) -> None:
super().__init__(**kwargs)
self.stages = [
TFResNetStage(
config,
config.embedding_size,
config.hidden_sizes[0],
stride=2 if config.downsample_in_first_stage else 1,
depth=config.depths[0],
name="stages.0",
)
]
for i, (in_channels, out_channels, depth) in enumerate(
zip(config.hidden_sizes, config.hidden_sizes[1:], config.depths[1:])
):
self.stages.append(TFResNetStage(config, in_channels, out_channels, depth=depth, name=f"stages.{i + 1}"))
def call(
self,
hidden_state: tf.Tensor,
output_hidden_states: bool = False,
return_dict: bool = True,
training: bool = False,
) -> TFBaseModelOutputWithNoAttention:
hidden_states = () if output_hidden_states else None
for stage_module in self.stages:
if output_hidden_states:
hidden_states = hidden_states + (hidden_state,)
hidden_state = stage_module(hidden_state, training=training)
if output_hidden_states:
hidden_states = hidden_states + (hidden_state,)
if not return_dict:
return tuple(v for v in [hidden_state, hidden_states] if v is not None)
return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "stages", None) is not None:
for layer in self.stages:
with tf.name_scope(layer.name):
layer.build(None)
@keras_serializable
class TFResNetMainLayer(keras.layers.Layer):
config_class = ResNetConfig
def __init__(self, config: ResNetConfig, **kwargs) -> None:
super().__init__(**kwargs)
self.config = config
self.embedder = TFResNetEmbeddings(config, name="embedder")
self.encoder = TFResNetEncoder(config, name="encoder")
self.pooler = keras.layers.GlobalAveragePooling2D(keepdims=True)
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPoolingAndNoAttention]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
pixel_values = tf.transpose(pixel_values, perm=[0, 2, 3, 1])
embedding_output = self.embedder(pixel_values, training=training)
encoder_outputs = self.encoder(
embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict, training=training
)
last_hidden_state = encoder_outputs[0]
pooled_output = self.pooler(last_hidden_state)
last_hidden_state = tf.transpose(last_hidden_state, (0, 3, 1, 2))
pooled_output = tf.transpose(pooled_output, (0, 3, 1, 2))
hidden_states = ()
for hidden_state in encoder_outputs[1:]:
hidden_states = hidden_states + tuple(tf.transpose(h, (0, 3, 1, 2)) for h in hidden_state)
if not return_dict:
return (last_hidden_state, pooled_output) + hidden_states
hidden_states = hidden_states if output_hidden_states else None
return TFBaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=hidden_states,
)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "embedder", None) is not None:
with tf.name_scope(self.embedder.name):
self.embedder.build(None)
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
@add_start_docstrings(
"The bare ResNet model outputting raw features without any specific head on top.",
RESNET_START_DOCSTRING,
)
class TFResNetModel(TFResNetPreTrainedModel):
def __init__(self, config: ResNetConfig, **kwargs) -> None:
super().__init__(config, **kwargs)
self.resnet = TFResNetMainLayer(config=config, name="resnet")
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFBaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPoolingAndNoAttention]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
resnet_outputs = self.resnet(
pixel_values=pixel_values,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
return resnet_outputs
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "resnet", None) is not None:
with tf.name_scope(self.resnet.name):
self.resnet.build(None)
@add_start_docstrings(
"""
ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
""",
RESNET_START_DOCSTRING,
)
class TFResNetForImageClassification(TFResNetPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config: ResNetConfig, **kwargs) -> None:
super().__init__(config, **kwargs)
self.num_labels = config.num_labels
self.resnet = TFResNetMainLayer(config, name="resnet")
self.classifier_layer = (
keras.layers.Dense(config.num_labels, name="classifier.1")
if config.num_labels > 0
else keras.layers.Activation("linear", name="classifier.1")
)
self.config = config
def classifier(self, x: tf.Tensor) -> tf.Tensor:
x = keras.layers.Flatten()(x)
logits = self.classifier_layer(x)
return logits
@add_start_docstrings_to_model_forward(RESNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFImageClassifierOutputWithNoAttention,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor = None,
labels: tf.Tensor = None,
output_hidden_states: bool = None,
return_dict: bool = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor], TFImageClassifierOutputWithNoAttention]:
r"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.resnet(
pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict, training=training
)
pooled_output = outputs.pooler_output if return_dict else outputs[1]
logits = self.classifier(pooled_output)
loss = None if labels is None else self.hf_compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return (loss,) + output if loss is not None else output
return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
def build(self, input_shape=None):
if self.built:
return
self.built = True
if getattr(self, "resnet", None) is not None:
with tf.name_scope(self.resnet.name):
self.resnet.build(None)
if getattr(self, "classifier_layer", None) is not None:
with tf.name_scope(self.classifier_layer.name):
self.classifier_layer.build([None, None, self.config.hidden_sizes[-1]])
.\models\resnet\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_torch_available,
)
_import_structure = {
"configuration_resnet": ["RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ResNetConfig", "ResNetOnnxConfig"]
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_resnet"] = [
"RESNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"ResNetForImageClassification",
"ResNetModel",
"ResNetPreTrainedModel",
"ResNetBackbone",
]
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_resnet"] = [
"TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFResNetForImageClassification",
"TFResNetModel",
"TFResNetPreTrainedModel",
]
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_flax_resnet"] = [
"FlaxResNetForImageClassification",
"FlaxResNetModel",
"FlaxResNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_resnet import RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ResNetConfig, ResNetOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_resnet import (
RESNET_PRETRAINED_MODEL_ARCHIVE_LIST,
ResNetBackbone,
ResNetForImageClassification,
ResNetModel,
ResNetPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_resnet import (
TF_RESNET_PRETRAINED_MODEL_ARCHIVE_LIST,
TFResNetForImageClassification,
TFResNetModel,
TFResNetPreTrainedModel,
)
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
from .modeling_flax_resnet import FlaxResNetForImageClassification, FlaxResNetModel, FlaxResNetPreTrainedModel
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\roberta\configuration_roberta.py
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"FacebookAI/roberta-base": "https://huggingface.co/FacebookAI/roberta-base/resolve/main/config.json",
"FacebookAI/roberta-large": "https://huggingface.co/FacebookAI/roberta-large/resolve/main/config.json",
"FacebookAI/roberta-large-mnli": "https://huggingface.co/FacebookAI/roberta-large-mnli/resolve/main/config.json",
"distilbert/distilroberta-base": "https://huggingface.co/distilbert/distilroberta-base/resolve/main/config.json",
"openai-community/roberta-base-openai-detector": "https://huggingface.co/openai-community/roberta-base-openai-detector/resolve/main/config.json",
"openai-community/roberta-large-openai-detector": "https://huggingface.co/openai-community/roberta-large-openai-detector/resolve/main/config.json",
}
class RobertaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`RobertaModel`] or a [`TFRobertaModel`]. It is
used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa
[FacebookAI/roberta-base](https://huggingface.co/FacebookAI/roberta-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Examples:
```
>>> from transformers import RobertaConfig, RobertaModel
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
>>> # Initializing a model (with random weights) from the configuration
>>> model = RobertaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "roberta"
def __init__(
self,
vocab_size=50265,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
class RobertaOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)